diff options
Diffstat (limited to 'youtube_dl/extractor')
| -rw-r--r-- | youtube_dl/extractor/aftonbladet.py | 64 | ||||
| -rw-r--r-- | youtube_dl/extractor/clubic.py | 11 | ||||
| -rw-r--r-- | youtube_dl/extractor/dctp.py | 67 | ||||
| -rw-r--r-- | youtube_dl/extractor/extractors.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/instagram.py | 23 | ||||
| -rw-r--r-- | youtube_dl/extractor/ketnet.py | 26 | ||||
| -rw-r--r-- | youtube_dl/extractor/leeco.py | 8 | ||||
| -rw-r--r-- | youtube_dl/extractor/limelight.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/tvland.py | 5 | ||||
| -rw-r--r-- | youtube_dl/extractor/twitch.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/vgtv.py | 13 | ||||
| -rw-r--r-- | youtube_dl/extractor/vk.py | 88 | 
12 files changed, 162 insertions, 147 deletions
| diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py deleted file mode 100644 index 5766b4fe8..000000000 --- a/youtube_dl/extractor/aftonbladet.py +++ /dev/null @@ -1,64 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import int_or_none - - -class AftonbladetIE(InfoExtractor): -    _VALID_URL = r'https?://tv\.aftonbladet\.se/abtv/articles/(?P<id>[0-9]+)' -    _TEST = { -        'url': 'http://tv.aftonbladet.se/abtv/articles/36015', -        'info_dict': { -            'id': '36015', -            'ext': 'mp4', -            'title': 'Vulkanutbrott i rymden - nu släpper NASA bilderna', -            'description': 'Jupiters måne mest aktiv av alla himlakroppar', -            'timestamp': 1394142732, -            'upload_date': '20140306', -        }, -    } - -    def _real_extract(self, url): -        video_id = self._match_id(url) -        webpage = self._download_webpage(url, video_id) - -        # find internal video meta data -        meta_url = 'http://aftonbladet-play-metadata.cdn.drvideo.aptoma.no/video/%s.json' -        player_config = self._parse_json(self._html_search_regex( -            r'data-player-config="([^"]+)"', webpage, 'player config'), video_id) -        internal_meta_id = player_config['aptomaVideoId'] -        internal_meta_url = meta_url % internal_meta_id -        internal_meta_json = self._download_json( -            internal_meta_url, video_id, 'Downloading video meta data') - -        # find internal video formats -        format_url = 'http://aftonbladet-play.videodata.drvideo.aptoma.no/actions/video/?id=%s' -        internal_video_id = internal_meta_json['videoId'] -        internal_formats_url = format_url % internal_video_id -        internal_formats_json = self._download_json( -            internal_formats_url, video_id, 'Downloading video formats') - -        formats = [] -        for fmt in internal_formats_json['formats']['http']['pseudostreaming']['mp4']: -            p = fmt['paths'][0] -            formats.append({ -                'url': 'http://%s:%d/%s/%s' % (p['address'], p['port'], p['path'], p['filename']), -                'ext': 'mp4', -                'width': int_or_none(fmt.get('width')), -                'height': int_or_none(fmt.get('height')), -                'tbr': int_or_none(fmt.get('bitrate')), -                'protocol': 'http', -            }) -        self._sort_formats(formats) - -        return { -            'id': video_id, -            'title': internal_meta_json['title'], -            'formats': formats, -            'thumbnail': internal_meta_json.get('imageUrl'), -            'description': internal_meta_json.get('shortPreamble'), -            'timestamp': int_or_none(internal_meta_json.get('timePublished')), -            'duration': int_or_none(internal_meta_json.get('duration')), -            'view_count': int_or_none(internal_meta_json.get('views')), -        } diff --git a/youtube_dl/extractor/clubic.py b/youtube_dl/extractor/clubic.py index 2fba93543..f7ee3a8f8 100644 --- a/youtube_dl/extractor/clubic.py +++ b/youtube_dl/extractor/clubic.py @@ -1,9 +1,6 @@  # coding: utf-8  from __future__ import unicode_literals -import json -import re -  from .common import InfoExtractor  from ..utils import (      clean_html, @@ -30,16 +27,14 @@ class ClubicIE(InfoExtractor):      }]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url)          player_url = 'http://player.m6web.fr/v1/player/clubic/%s.html' % video_id          player_page = self._download_webpage(player_url, video_id) -        config_json = self._search_regex( +        config = self._parse_json(self._search_regex(              r'(?m)M6\.Player\.config\s*=\s*(\{.+?\});$', player_page, -            'configuration') -        config = json.loads(config_json) +            'configuration'), video_id)          video_info = config['videoInfo']          sources = config['sources'] diff --git a/youtube_dl/extractor/dctp.py b/youtube_dl/extractor/dctp.py index a47e04993..14ba88715 100644 --- a/youtube_dl/extractor/dctp.py +++ b/youtube_dl/extractor/dctp.py @@ -1,61 +1,54 @@ -# encoding: utf-8 +# coding: utf-8  from __future__ import unicode_literals  from .common import InfoExtractor -from ..compat import compat_str +from ..utils import unified_strdate  class DctpTvIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?dctp\.tv/(#/)?filme/(?P<id>.+?)/$'      _TEST = {          'url': 'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/', +        'md5': '174dd4a8a6225cf5655952f969cfbe24',          'info_dict': { -            'id': '1324', +            'id': '95eaa4f33dad413aa17b4ee613cccc6c',              'display_id': 'videoinstallation-fuer-eine-kaufhausfassade', -            'ext': 'flv', -            'title': 'Videoinstallation für eine Kaufhausfassade' +            'ext': 'mp4', +            'title': 'Videoinstallation für eine Kaufhausfassade', +            'description': 'Kurzfilm', +            'upload_date': '20110407', +            'thumbnail': 're:^https?://.*\.jpg$',          }, -        'params': { -            # rtmp download -            'skip_download': True, -        }      }      def _real_extract(self, url):          video_id = self._match_id(url) -        base_url = 'http://dctp-ivms2-restapi.s3.amazonaws.com/' -        version_json = self._download_json( -            base_url + 'version.json', -            video_id, note='Determining file version') -        version = version_json['version_name'] -        info_json = self._download_json( -            '{0}{1}/restapi/slugs/{2}.json'.format(base_url, version, video_id), -            video_id, note='Fetching object ID') -        object_id = compat_str(info_json['object_id']) -        meta_json = self._download_json( -            '{0}{1}/restapi/media/{2}.json'.format(base_url, version, object_id), -            video_id, note='Downloading metadata') -        uuid = meta_json['uuid'] -        title = meta_json['title'] -        wide = meta_json['is_wide'] -        if wide: -            ratio = '16x9' -        else: -            ratio = '4x3' -        play_path = 'mp4:{0}_dctp_0500_{1}.m4v'.format(uuid, ratio) +        webpage = self._download_webpage(url, video_id) + +        object_id = self._html_search_meta('DC.identifier', webpage)          servers_json = self._download_json( -            'http://www.dctp.tv/streaming_servers/', +            'http://www.dctp.tv/elastic_streaming_client/get_streaming_server/',              video_id, note='Downloading server list') -        url = servers_json[0]['endpoint'] +        server = servers_json[0]['server'] +        m3u8_path = self._search_regex( +            r'\'([^\'"]+/playlist\.m3u8)"', webpage, 'm3u8 path') +        formats = self._extract_m3u8_formats( +            'http://%s%s' % (server, m3u8_path), video_id, ext='mp4', +            entry_protocol='m3u8_native') + +        title = self._og_search_title(webpage) +        description = self._html_search_meta('DC.description', webpage) +        upload_date = unified_strdate( +            self._html_search_meta('DC.date.created', webpage)) +        thumbnail = self._og_search_thumbnail(webpage)          return {              'id': object_id,              'title': title, -            'format': 'rtmp', -            'url': url, -            'play_path': play_path, -            'rtmp_real_time': True, -            'ext': 'flv', -            'display_id': video_id +            'formats': formats, +            'display_id': video_id, +            'description': description, +            'upload_date': upload_date, +            'thumbnail': thumbnail,          } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 23fd2a308..09b3b4942 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -31,7 +31,6 @@ from .aenetworks import (      HistoryTopicIE,  )  from .afreecatv import AfreecaTVIE -from .aftonbladet import AftonbladetIE  from .airmozilla import AirMozillaIE  from .aljazeera import AlJazeeraIE  from .alphaporno import AlphaPornoIE diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 8f7f232be..196407b06 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -29,6 +29,7 @@ class InstagramIE(InfoExtractor):              'uploader': 'Naomi Leonor Phan-Quang',              'like_count': int,              'comment_count': int, +            'comments': list,          },      }, {          # missing description @@ -44,6 +45,7 @@ class InstagramIE(InfoExtractor):              'uploader': 'Britney Spears',              'like_count': int,              'comment_count': int, +            'comments': list,          },          'params': {              'skip_download': True, @@ -82,7 +84,7 @@ class InstagramIE(InfoExtractor):          webpage = self._download_webpage(url, video_id)          (video_url, description, thumbnail, timestamp, uploader, -         uploader_id, like_count, comment_count) = [None] * 8 +         uploader_id, like_count, comment_count, height, width) = [None] * 10          shared_data = self._parse_json(              self._search_regex( @@ -94,6 +96,8 @@ class InstagramIE(InfoExtractor):                  shared_data, lambda x: x['entry_data']['PostPage'][0]['media'], dict)              if media:                  video_url = media.get('video_url') +                height = int_or_none(media.get('dimensions', {}).get('height')) +                width = int_or_none(media.get('dimensions', {}).get('width'))                  description = media.get('caption')                  thumbnail = media.get('display_src')                  timestamp = int_or_none(media.get('date')) @@ -101,10 +105,24 @@ class InstagramIE(InfoExtractor):                  uploader_id = media.get('owner', {}).get('username')                  like_count = int_or_none(media.get('likes', {}).get('count'))                  comment_count = int_or_none(media.get('comments', {}).get('count')) +                comments = [{ +                    'author': comment.get('user', {}).get('username'), +                    'author_id': comment.get('user', {}).get('id'), +                    'id': comment.get('id'), +                    'text': comment.get('text'), +                    'timestamp': int_or_none(comment.get('created_at')), +                } for comment in media.get( +                    'comments', {}).get('nodes', []) if comment.get('text')]          if not video_url:              video_url = self._og_search_video_url(webpage, secure=False) +        formats = [{ +            'url': video_url, +            'width': width, +            'height': height, +        }] +          if not uploader_id:              uploader_id = self._search_regex(                  r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', @@ -121,7 +139,7 @@ class InstagramIE(InfoExtractor):          return {              'id': video_id, -            'url': video_url, +            'formats': formats,              'ext': 'mp4',              'title': 'Video by %s' % uploader_id,              'description': description, @@ -131,6 +149,7 @@ class InstagramIE(InfoExtractor):              'uploader': uploader,              'like_count': like_count,              'comment_count': comment_count, +            'comments': comments,          } diff --git a/youtube_dl/extractor/ketnet.py b/youtube_dl/extractor/ketnet.py index aaf3f807a..eb0a16008 100644 --- a/youtube_dl/extractor/ketnet.py +++ b/youtube_dl/extractor/ketnet.py @@ -21,6 +21,10 @@ class KetnetIE(InfoExtractor):      }, {          'url': 'https://www.ketnet.be/achter-de-schermen/sien-repeteert-voor-stars-for-life',          'only_matching': True, +    }, { +        # mzsource, geo restricted to Belgium +        'url': 'https://www.ketnet.be/kijken/nachtwacht/de-bermadoe', +        'only_matching': True,      }]      def _real_extract(self, url): @@ -36,9 +40,25 @@ class KetnetIE(InfoExtractor):          title = config['title'] -        formats = self._extract_m3u8_formats( -            config['source']['hls'], video_id, 'mp4', -            entry_protocol='m3u8_native', m3u8_id='hls') +        formats = [] +        for source_key in ('', 'mz'): +            source = config.get('%ssource' % source_key) +            if not isinstance(source, dict): +                continue +            for format_id, format_url in source.items(): +                if format_id == 'hls': +                    formats.extend(self._extract_m3u8_formats( +                        format_url, video_id, 'mp4', +                        entry_protocol='m3u8_native', m3u8_id=format_id, +                        fatal=False)) +                elif format_id == 'hds': +                    formats.extend(self._extract_f4m_formats( +                        format_url, video_id, f4m_id=format_id, fatal=False)) +                else: +                    formats.append({ +                        'url': format_url, +                        'format_id': format_id, +                    })          self._sort_formats(formats)          return { diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index e9cc9aa59..c48a5aad1 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -29,7 +29,7 @@ from ..utils import (  class LeIE(InfoExtractor):      IE_DESC = '乐视网' -    _VALID_URL = r'https?://(?:www\.le\.com/ptv/vplay|sports\.le\.com/video)/(?P<id>\d+)\.html' +    _VALID_URL = r'https?://(?:www\.le\.com/ptv/vplay|(?:sports\.le|(?:www\.)?lesports)\.com/(?:match|video))/(?P<id>\d+)\.html'      _URL_TEMPLATE = 'http://www.le.com/ptv/vplay/%s.html' @@ -73,6 +73,12 @@ class LeIE(InfoExtractor):      }, {          'url': 'http://sports.le.com/video/25737697.html',          'only_matching': True, +    }, { +        'url': 'http://www.lesports.com/match/1023203003.html', +        'only_matching': True, +    }, { +        'url': 'http://sports.le.com/match/1023203003.html', +        'only_matching': True,      }]      # ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index 6752ffee2..b7bfa7a6d 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -59,7 +59,7 @@ class LimelightBaseIE(InfoExtractor):                      format_id = 'rtmp'                      if stream.get('videoBitRate'):                          format_id += '-%d' % int_or_none(stream['videoBitRate']) -                    http_url = 'http://%s/%s' % (rtmp.group('host').replace('csl.', 'cpl.'), rtmp.group('playpath')[4:]) +                    http_url = 'http://cpl.delvenetworks.com/' + rtmp.group('playpath')[4:]                      urls.append(http_url)                      http_fmt = fmt.copy()                      http_fmt.update({ diff --git a/youtube_dl/extractor/tvland.py b/youtube_dl/extractor/tvland.py index cb76a2a58..957cf1ea2 100644 --- a/youtube_dl/extractor/tvland.py +++ b/youtube_dl/extractor/tvland.py @@ -6,7 +6,7 @@ from .mtv import MTVServicesInfoExtractor  class TVLandIE(MTVServicesInfoExtractor):      IE_NAME = 'tvland.com' -    _VALID_URL = r'https?://(?:www\.)?tvland\.com/(?:video-clips|episodes)/(?P<id>[^/?#.]+)' +    _VALID_URL = r'https?://(?:www\.)?tvland\.com/(?:video-clips|(?:full-)?episodes)/(?P<id>[^/?#.]+)'      _FEED_URL = 'http://www.tvland.com/feeds/mrss/'      _TESTS = [{          # Geo-restricted. Without a proxy metadata are still there. With a @@ -28,4 +28,7 @@ class TVLandIE(MTVServicesInfoExtractor):              'upload_date': '20151228',              'timestamp': 1451289600,          }, +    }, { +        'url': 'http://www.tvland.com/full-episodes/iu0hz6/younger-a-kiss-is-just-a-kiss-season-3-ep-301', +        'only_matching': True,      }] diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index bc352391e..46c2cfe7b 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -247,6 +247,7 @@ class TwitchVodIE(TwitchItemBaseIE):              # m3u8 download              'skip_download': True,          }, +        'skip': 'HTTP Error 404: Not Found',      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 185756301..3b38ac700 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -22,6 +22,7 @@ class VGTVIE(XstreamIE):          'fvn.no/fvntv': 'fvntv',          'aftenposten.no/webtv': 'aptv',          'ap.vgtv.no/webtv': 'aptv', +        'tv.aftonbladet.se/abtv': 'abtv',      }      _APP_NAME_TO_VENDOR = { @@ -30,6 +31,7 @@ class VGTVIE(XstreamIE):          'satv': 'sa',          'fvntv': 'fvn',          'aptv': 'ap', +        'abtv': 'ab',      }      _VALID_URL = r'''(?x) @@ -40,7 +42,8 @@ class VGTVIE(XstreamIE):                      /?                      (?:                          \#!/(?:video|live)/| -                        embed?.*id= +                        embed?.*id=| +                        articles/                      )|                      (?P<appname>                          %s @@ -135,6 +138,14 @@ class VGTVIE(XstreamIE):              'url': 'http://www.vgtv.no/#!/video/127205/inside-the-mind-of-favela-funk',              'only_matching': True,          }, +        { +            'url': 'http://tv.aftonbladet.se/abtv/articles/36015', +            'only_matching': True, +        }, +        { +            'url': 'abtv:140026', +            'only_matching': True, +        }      ]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index f26e0732c..58799d413 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -20,7 +20,7 @@ from ..utils import (      remove_start,      str_to_int,      unescapeHTML, -    unified_strdate, +    unified_timestamp,      urlencode_postdata,  )  from .dailymotion import DailymotionIE @@ -106,6 +106,7 @@ class VKIE(VKBaseIE):                  'title': 'ProtivoGunz - Хуёвая песня',                  'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*',                  'duration': 195, +                'timestamp': 1329060660,                  'upload_date': '20120212',                  'view_count': int,              }, @@ -119,6 +120,7 @@ class VKIE(VKBaseIE):                  'uploader': 'Tom Cruise',                  'title': 'No name',                  'duration': 9, +                'timestamp': 1374374880,                  'upload_date': '20130721',                  'view_count': int,              } @@ -195,6 +197,7 @@ class VKIE(VKBaseIE):                  'upload_date': '20150709',                  'view_count': int,              }, +            'skip': 'Removed',          },          {              # youtube embed @@ -226,7 +229,7 @@ class VKIE(VKBaseIE):              },              'params': {                  'skip_download': True, -            } +            },          },          {              # video key is extra_data not url\d+ @@ -237,11 +240,31 @@ class VKIE(VKBaseIE):                  'ext': 'mp4',                  'title': 'S-Dance, репетиции к The way show',                  'uploader': 'THE WAY SHOW | 17 апреля', +                'timestamp': 1454870100,                  'upload_date': '20160207',                  'view_count': int,              },          },          { +            # finished live stream, live_mp4 +            'url': 'https://vk.com/videos-387766?z=video-387766_456242764%2Fpl_-387766_-2', +            'md5': '90d22d051fccbbe9becfccc615be6791', +            'info_dict': { +                'id': '456242764', +                'ext': 'mp4', +                'title': 'ИгроМир 2016 — день 1', +                'uploader': 'Игромания', +                'duration': 5239, +                'view_count': int, +            }, +        }, +        { +            # live stream, hls and rtmp links,most likely already finished live +            # stream by the time you are reading this comment +            'url': 'https://vk.com/video-140332_456239111', +            'only_matching': True, +        }, +        {              # removed video, just testing that we match the pattern              'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a',              'only_matching': True, @@ -349,42 +372,51 @@ class VKIE(VKBaseIE):          data_json = self._search_regex(r'var\s+vars\s*=\s*({.+?});', info_page, 'vars')          data = json.loads(data_json) -        # Extract upload date -        upload_date = None -        mobj = re.search(r'id="mv_date(?:_views)?_wrap"[^>]*>([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page) -        if mobj is not None: -            mobj.group(1) + ' ' + mobj.group(2) -            upload_date = unified_strdate(mobj.group(1) + ' ' + mobj.group(2)) - -        view_count = None -        views = self._html_search_regex( -            r'"mv_views_count_number"[^>]*>(.+?\bviews?)<', -            info_page, 'view count', default=None) -        if views: -            view_count = str_to_int(self._search_regex( -                r'([\d,.]+)', views, 'view count', fatal=False)) +        title = unescapeHTML(data['md_title']) + +        if data.get('live') == 2: +            title = self._live_title(title) + +        timestamp = unified_timestamp(self._html_search_regex( +            r'class=["\']mv_info_date[^>]+>([^<]+)(?:<|from)', info_page, +            'upload date', fatal=False)) + +        view_count = str_to_int(self._search_regex( +            r'class=["\']mv_views_count[^>]+>\s*([\d,.]+)', +            info_page, 'view count', fatal=False))          formats = [] -        for k, v in data.items(): -            if not k.startswith('url') and not k.startswith('cache') and k != 'extra_data' or not v: +        for format_id, format_url in data.items(): +            if not isinstance(format_url, compat_str) or not format_url.startswith(('http', '//', 'rtmp')):                  continue -            height = int_or_none(self._search_regex( -                r'^(?:url|cache)(\d+)', k, 'height', default=None)) -            formats.append({ -                'format_id': k, -                'url': v, -                'height': height, -            }) +            if format_id.startswith(('url', 'cache')) or format_id in ('extra_data', 'live_mp4'): +                height = int_or_none(self._search_regex( +                    r'^(?:url|cache)(\d+)', format_id, 'height', default=None)) +                formats.append({ +                    'format_id': format_id, +                    'url': format_url, +                    'height': height, +                }) +            elif format_id == 'hls': +                formats.extend(self._extract_m3u8_formats( +                    format_url, video_id, 'mp4', m3u8_id=format_id, +                    fatal=False, live=True)) +            elif format_id == 'rtmp': +                formats.append({ +                    'format_id': format_id, +                    'url': format_url, +                    'ext': 'flv', +                })          self._sort_formats(formats)          return { -            'id': compat_str(data['vid']), +            'id': compat_str(data.get('vid') or video_id),              'formats': formats, -            'title': unescapeHTML(data['md_title']), +            'title': title,              'thumbnail': data.get('jpg'),              'uploader': data.get('md_author'),              'duration': data.get('duration'), -            'upload_date': upload_date, +            'timestamp': timestamp,              'view_count': view_count,          } | 
