diff options
| -rw-r--r-- | youtube_dl/extractor/cliprs.py | 90 | ||||
| -rw-r--r-- | youtube_dl/extractor/common.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/ebaumsworld.py | 4 | ||||
| -rw-r--r-- | youtube_dl/extractor/extractors.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/glide.py | 31 | ||||
| -rw-r--r-- | youtube_dl/extractor/jwplatform.py | 19 | ||||
| -rw-r--r-- | youtube_dl/extractor/screencastomatic.py | 36 | ||||
| -rw-r--r-- | youtube_dl/extractor/telebruxelles.py | 14 | 
8 files changed, 149 insertions, 47 deletions
| diff --git a/youtube_dl/extractor/cliprs.py b/youtube_dl/extractor/cliprs.py new file mode 100644 index 000000000..4f9320ea5 --- /dev/null +++ b/youtube_dl/extractor/cliprs.py @@ -0,0 +1,90 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    float_or_none, +    int_or_none, +    parse_iso8601, +) + + +class ClipRsIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?clip\.rs/(?P<id>[^/]+)/\d+' +    _TEST = { +        'url': 'http://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732', +        'md5': 'c412d57815ba07b56f9edc7b5d6a14e5', +        'info_dict': { +            'id': '1488842.1399140381', +            'ext': 'mp4', +            'title': 'PREMIJERA Frajle predstavljaju novi spot za pesmu Moli me, moli', +            'description': 'md5:56ce2c3b4ab31c5a2e0b17cb9a453026', +            'duration': 229, +            'timestamp': 1459850243, +            'upload_date': '20160405', +        } +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        video_id = self._search_regex( +            r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id') + +        response = self._download_json( +            'http://qi.ckm.onetapi.pl/', video_id, +            query={ +                'body[id]': video_id, +                'body[jsonrpc]': '2.0', +                'body[method]': 'get_asset_detail', +                'body[params][ID_Publikacji]': video_id, +                'body[params][Service]': 'www.onet.pl', +                'content-type': 'application/jsonp', +                'x-onet-app': 'player.front.onetapi.pl', +            }) + +        error = response.get('error') +        if error: +            raise ExtractorError( +                '%s said: %s' % (self.IE_NAME, error['message']), expected=True) + +        video = response['result'].get('0') + +        formats = [] +        for _, formats_dict in video['formats'].items(): +            if not isinstance(formats_dict, dict): +                continue +            for format_id, format_list in formats_dict.items(): +                if not isinstance(format_list, list): +                    continue +                for f in format_list: +                    if not f.get('url'): +                        continue +                    formats.append({ +                        'url': f['url'], +                        'format_id': format_id, +                        'height': int_or_none(f.get('vertical_resolution')), +                        'width': int_or_none(f.get('horizontal_resolution')), +                        'abr': float_or_none(f.get('audio_bitrate')), +                        'vbr': float_or_none(f.get('video_bitrate')), +                    }) +        self._sort_formats(formats) + +        meta = video.get('meta', {}) + +        title = self._og_search_title(webpage, default=None) or meta['title'] +        description = self._og_search_description(webpage, default=None) or meta.get('description') +        duration = meta.get('length') or meta.get('lenght') +        timestamp = parse_iso8601(meta.get('addDate'), ' ') + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'duration': duration, +            'timestamp': timestamp, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 17d00721c..5269059d0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -376,7 +376,6 @@ class InfoExtractor(object):                  self.to_screen('%s' % (note,))              else:                  self.to_screen('%s: %s' % (video_id, note)) -        # data, headers and query params will be ignored for `Request` objects          if isinstance(url_or_request, compat_urllib_request.Request):              url_or_request = update_Request(                  url_or_request, data=data, headers=headers, query=query) diff --git a/youtube_dl/extractor/ebaumsworld.py b/youtube_dl/extractor/ebaumsworld.py index b6bfd2b2d..c97682cd3 100644 --- a/youtube_dl/extractor/ebaumsworld.py +++ b/youtube_dl/extractor/ebaumsworld.py @@ -4,10 +4,10 @@ from .common import InfoExtractor  class EbaumsWorldIE(InfoExtractor): -    _VALID_URL = r'https?://www\.ebaumsworld\.com/video/watch/(?P<id>\d+)' +    _VALID_URL = r'https?://(?:www\.)?ebaumsworld\.com/videos/[^/]+/(?P<id>\d+)'      _TEST = { -        'url': 'http://www.ebaumsworld.com/video/watch/83367677/', +        'url': 'http://www.ebaumsworld.com/videos/a-giant-python-opens-the-door/83367677/',          'info_dict': {              'id': '83367677',              'ext': 'mp4', diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c2fa83918..c234ff127 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -123,6 +123,7 @@ from .chirbit import (  )  from .cinchcast import CinchcastIE  from .cinemassacre import CinemassacreIE +from .cliprs import ClipRsIE  from .clipfish import ClipfishIE  from .cliphunter import CliphunterIE  from .clipsyndicate import ClipsyndicateIE diff --git a/youtube_dl/extractor/glide.py b/youtube_dl/extractor/glide.py index 9561ed5fb..62ff84835 100644 --- a/youtube_dl/extractor/glide.py +++ b/youtube_dl/extractor/glide.py @@ -2,6 +2,7 @@  from __future__ import unicode_literals  from .common import InfoExtractor +from ..utils import unified_strdate  class GlideIE(InfoExtractor): @@ -15,26 +16,38 @@ class GlideIE(InfoExtractor):              'ext': 'mp4',              'title': 'Damon Timm\'s Glide message',              'thumbnail': 're:^https?://.*?\.cloudfront\.net/.*\.jpg$', +            'uploader': 'Damon Timm', +            'upload_date': '20140919',          }      }      def _real_extract(self, url):          video_id = self._match_id(url) +          webpage = self._download_webpage(url, video_id) +          title = self._html_search_regex( -            r'<title>(.*?)</title>', webpage, 'title') -        video_url = self.http_scheme() + self._search_regex( -            r'<source src="(.*?)" type="video/mp4">', webpage, 'video URL') -        thumbnail_url = self._search_regex( -            r'<img id="video-thumbnail" src="(.*?)"', -            webpage, 'thumbnail url', fatal=False) -        thumbnail = ( -            thumbnail_url if thumbnail_url is None -            else self.http_scheme() + thumbnail_url) +            r'<title>(.+?)</title>', webpage, 'title') +        video_url = self._proto_relative_url(self._search_regex( +            r'<source[^>]+src=(["\'])(?P<url>.+?)\1', +            webpage, 'video URL', default=None, +            group='url')) or self._og_search_video_url(webpage) +        thumbnail = self._proto_relative_url(self._search_regex( +            r'<img[^>]+id=["\']video-thumbnail["\'][^>]+src=(["\'])(?P<url>.+?)\1', +            webpage, 'thumbnail url', default=None, +            group='url')) or self._og_search_thumbnail(webpage) +        uploader = self._search_regex( +            r'<div[^>]+class=["\']info-name["\'][^>]*>([^<]+)', +            webpage, 'uploader', fatal=False) +        upload_date = unified_strdate(self._search_regex( +            r'<div[^>]+class="info-date"[^>]*>([^<]+)', +            webpage, 'upload date', fatal=False))          return {              'id': video_id,              'title': title,              'url': video_url,              'thumbnail': thumbnail, +            'uploader': uploader, +            'upload_date': upload_date,          } diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index 6770685d7..8a5e562db 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -4,16 +4,15 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( +    float_or_none, +    int_or_none, +)  class JWPlatformBaseIE(InfoExtractor):      def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True):          video_data = jwplayer_data['playlist'][0] -        subtitles = {} -        for track in video_data['tracks']: -            if track['kind'] == 'captions': -                subtitles[track['label']] = [{'url': self._proto_relative_url(track['file'])}]          formats = []          for source in video_data['sources']: @@ -35,12 +34,22 @@ class JWPlatformBaseIE(InfoExtractor):                  })          self._sort_formats(formats) +        subtitles = {} +        tracks = video_data.get('tracks') +        if tracks and isinstance(tracks, list): +            for track in tracks: +                if track.get('file') and track.get('kind') == 'captions': +                    subtitles.setdefault(track.get('label') or 'en', []).append({ +                        'url': self._proto_relative_url(track['file']) +                    }) +          return {              'id': video_id,              'title': video_data['title'] if require_title else video_data.get('title'),              'description': video_data.get('description'),              'thumbnail': self._proto_relative_url(video_data.get('image')),              'timestamp': int_or_none(video_data.get('pubdate')), +            'duration': float_or_none(jwplayer_data.get('duration')),              'subtitles': subtitles,              'formats': formats,          } diff --git a/youtube_dl/extractor/screencastomatic.py b/youtube_dl/extractor/screencastomatic.py index 05337421c..7a88a42cd 100644 --- a/youtube_dl/extractor/screencastomatic.py +++ b/youtube_dl/extractor/screencastomatic.py @@ -1,15 +1,11 @@  # coding: utf-8  from __future__ import unicode_literals -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( -    ExtractorError, -    js_to_json, -) +from .jwplatform import JWPlatformBaseIE +from ..utils import js_to_json -class ScreencastOMaticIE(InfoExtractor): +class ScreencastOMaticIE(JWPlatformBaseIE):      _VALID_URL = r'https?://screencast-o-matic\.com/watch/(?P<id>[0-9a-zA-Z]+)'      _TEST = {          'url': 'http://screencast-o-matic.com/watch/c2lD3BeOPl', @@ -20,6 +16,7 @@ class ScreencastOMaticIE(InfoExtractor):              'title': 'Welcome to 3-4 Philosophy @ DECV!',              'thumbnail': 're:^https?://.*\.jpg$',              'description': 'as the title says! also: some general info re 1) VCE philosophy and 2) distance learning.', +            'duration': 369.163,          }      } @@ -27,23 +24,14 @@ class ScreencastOMaticIE(InfoExtractor):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) -        setup_js = self._search_regex( -            r"(?s)jwplayer\('mp4Player'\).setup\((\{.*?\})\);", -            webpage, 'setup code') -        data = self._parse_json(setup_js, video_id, transform_source=js_to_json) -        try: -            video_data = next( -                m for m in data['modes'] if m.get('type') == 'html5') -        except StopIteration: -            raise ExtractorError('Could not find any video entries!') -        video_url = compat_urlparse.urljoin(url, video_data['config']['file']) -        thumbnail = data.get('image') +        jwplayer_data = self._parse_json( +            self._search_regex( +                r"(?s)jwplayer\('mp4Player'\).setup\((\{.*?\})\);", webpage, 'setup code'), +            video_id, transform_source=js_to_json) -        return { -            'id': video_id, +        info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False) +        info_dict.update({              'title': self._og_search_title(webpage),              'description': self._og_search_description(webpage), -            'url': video_url, -            'ext': 'mp4', -            'thumbnail': thumbnail, -        } +        }) +        return info_dict diff --git a/youtube_dl/extractor/telebruxelles.py b/youtube_dl/extractor/telebruxelles.py index a3d05f97d..eefecc490 100644 --- a/youtube_dl/extractor/telebruxelles.py +++ b/youtube_dl/extractor/telebruxelles.py @@ -1,11 +1,13 @@  # coding: utf-8  from __future__ import unicode_literals +import re +  from .common import InfoExtractor  class TeleBruxellesIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?telebruxelles\.be/(news|sport|dernier-jt)/?(?P<id>[^/#?]+)' +    _VALID_URL = r'https?://(?:www\.)?(?:telebruxelles|bx1)\.be/(news|sport|dernier-jt)/?(?P<id>[^/#?]+)'      _TESTS = [{          'url': 'http://www.telebruxelles.be/news/auditions-devant-parlement-francken-galant-tres-attendus/',          'md5': '59439e568c9ee42fb77588b2096b214f', @@ -39,18 +41,18 @@ class TeleBruxellesIE(InfoExtractor):          webpage = self._download_webpage(url, display_id)          article_id = self._html_search_regex( -            r"<article id=\"post-(\d+)\"", webpage, 'article ID') +            r"<article id=\"post-(\d+)\"", webpage, 'article ID', default=None)          title = self._html_search_regex(              r'<h1 class=\"entry-title\">(.*?)</h1>', webpage, 'title') -        description = self._og_search_description(webpage) +        description = self._og_search_description(webpage, default=None)          rtmp_url = self._html_search_regex( -            r"file: \"(rtmp://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}/vod/mp4:\" \+ \"\w+\" \+ \".mp4)\"", +            r'file\s*:\s*"(rtmp://[^/]+/vod/mp4:"\s*\+\s*"[^"]+"\s*\+\s*".mp4)"',              webpage, 'RTMP url') -        rtmp_url = rtmp_url.replace("\" + \"", "") +        rtmp_url = re.sub(r'"\s*\+\s*"', '', rtmp_url)          return { -            'id': article_id, +            'id': article_id or display_id,              'display_id': display_id,              'title': title,              'description': description, | 
