diff options
Diffstat (limited to 'youtube_dl/extractor')
36 files changed, 1116 insertions, 920 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 042b1e921..365c0b86f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -15,7 +15,6 @@ from .adobetv import (      AdobeTVVideoIE,  )  from .adultswim import AdultSwimIE -from .aftenposten import AftenpostenIE  from .aftonbladet import AftonbladetIE  from .airmozilla import AirMozillaIE  from .aljazeera import AlJazeeraIE @@ -26,7 +25,10 @@ from .aol import AolIE  from .allocine import AllocineIE  from .aparat import AparatIE  from .appleconnect import AppleConnectIE -from .appletrailers import AppleTrailersIE +from .appletrailers import ( +    AppleTrailersIE, +    AppleTrailersSectionIE, +)  from .archiveorg import ArchiveOrgIE  from .ard import (      ARDIE, @@ -61,8 +63,11 @@ from .beatportpro import BeatportProIE  from .bet import BetIE  from .bild import BildIE  from .bilibili import BiliBiliIE +from .bleacherreport import ( +    BleacherReportIE, +    BleacherReportCMSIE, +)  from .blinkx import BlinkxIE -from .bliptv import BlipTVIE, BlipTVUserIE  from .bloomberg import BloombergIE  from .bpb import BpbIE  from .br import BRIE @@ -78,7 +83,6 @@ from .camdemy import (      CamdemyIE,      CamdemyFolderIE  ) -from .canal13cl import Canal13clIE  from .canalplus import CanalplusIE  from .canalc2 import Canalc2IE  from .cbs import CBSIE @@ -232,6 +236,7 @@ from .globo import (  from .godtube import GodTubeIE  from .goldenmoustache import GoldenMoustacheIE  from .golem import GolemIE +from .googledrive import GoogleDriveIE  from .googleplus import GooglePlusIE  from .googlesearch import GoogleSearchIE  from .goshgay import GoshgayIE @@ -282,6 +287,7 @@ from .jadorecettepub import JadoreCettePubIE  from .jeuxvideo import JeuxVideoIE  from .jove import JoveIE  from .jukebox import JukeboxIE +from .jwplatform import JWPlatformIE  from .jpopsukitv import JpopsukiIE  from .kaltura import KalturaIE  from .kanalplay import KanalPlayIE @@ -336,6 +342,7 @@ from .lynda import (  from .m6 import M6IE  from .macgamestore import MacGameStoreIE  from .mailru import MailRuIE +from .makertv import MakerTVIE  from .malemotion import MalemotionIE  from .mdr import MDRIE  from .metacafe import MetacafeIE @@ -586,10 +593,6 @@ from .snagfilms import (  )  from .snotr import SnotrIE  from .sohu import SohuIE -from .soompi import ( -    SoompiIE, -    SoompiShowIE, -)  from .soundcloud import (      SoundcloudIE,      SoundcloudSetIE, @@ -648,6 +651,7 @@ from .teachingchannel import TeachingChannelIE  from .teamcoco import TeamcocoIE  from .techtalks import TechTalksIE  from .ted import TEDIE +from .tele13 import Tele13IE  from .telebruxelles import TeleBruxellesIE  from .telecinco import TelecincoIE  from .telegraaf import TelegraafIE diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index c0e5d1abf..6a29e587f 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -23,6 +23,7 @@ class ABCIE(InfoExtractor):              'title': 'Australia to help staff Ebola treatment centre in Sierra Leone',              'description': 'md5:809ad29c67a05f54eb41f2a105693a67',          }, +        'skip': 'this video has expired',      }, {          'url': 'http://www.abc.net.au/news/2015-08-17/warren-entsch-introduces-same-sex-marriage-bill/6702326',          'md5': 'db2a5369238b51f9811ad815b69dc086', @@ -36,6 +37,7 @@ class ABCIE(InfoExtractor):              'title': 'Marriage Equality: Warren Entsch introduces same sex marriage bill',          },          'add_ie': ['Youtube'], +        'skip': 'Not accessible from Travis CI server',      }, {          'url': 'http://www.abc.net.au/news/2015-10-23/nab-lifts-interest-rates-following-westpac-and-cba/6880080',          'md5': 'b96eee7c9edf4fc5a358a0252881cc1f', @@ -58,6 +60,9 @@ class ABCIE(InfoExtractor):              r'inline(?P<type>Video|Audio|YouTube)Data\.push\((?P<json_data>[^)]+)\);',              webpage)          if mobj is None: +            expired = self._html_search_regex(r'(?s)class="expired-(?:video|audio)".+?<span>(.+?)</span>', webpage, 'expired', None) +            if expired: +                raise ExtractorError('%s said: %s' % (self.IE_NAME, expired), expected=True)              raise ExtractorError('Unable to extract video urls')          urls_info = self._parse_json( diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 3ae618e71..bf21a6887 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -68,7 +68,7 @@ class AdultSwimIE(InfoExtractor):                  'md5': '3e346a2ab0087d687a05e1e7f3b3e529',                  'info_dict': {                      'id': 'sY3cMUR_TbuE4YmdjzbIcQ-0', -                    'ext': 'flv', +                    'ext': 'mp4',                      'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine',                      'description': 'Dr. Brule reports live from Wine Country with a special report on wines.  \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n',                  }, @@ -79,6 +79,10 @@ class AdultSwimIE(InfoExtractor):              'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine',              'description': 'Dr. Brule reports live from Wine Country with a special report on wines.  \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n',          }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }      }]      @staticmethod diff --git a/youtube_dl/extractor/aftenposten.py b/youtube_dl/extractor/aftenposten.py deleted file mode 100644 index 0c00acfb5..000000000 --- a/youtube_dl/extractor/aftenposten.py +++ /dev/null @@ -1,23 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class AftenpostenIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?aftenposten\.no/webtv/(?:#!/)?video/(?P<id>\d+)' -    _TEST = { -        'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', -        'md5': 'fd828cd29774a729bf4d4425fe192972', -        'info_dict': { -            'id': '21039', -            'ext': 'mov', -            'title': 'TRAILER: "Sweatshop" - I can´t take any more', -            'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238', -            'timestamp': 1416927969, -            'upload_date': '20141125', -        } -    } - -    def _real_extract(self, url): -        return self.url_result('xstream:ap:%s' % self._match_id(url), 'Xstream') diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py new file mode 100644 index 000000000..dcc3c97f1 --- /dev/null +++ b/youtube_dl/extractor/amp.py @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    int_or_none, +    parse_iso8601, +) + + +class AMPIE(InfoExtractor): +    # parse Akamai Adaptive Media Player feed +    def _extract_feed_info(self, url): +        item = self._download_json( +            url, None, 'Downloading Akamai AMP feed', +            'Unable to download Akamai AMP feed')['channel']['item'] + +        video_id = item['guid'] + +        def get_media_node(name, default=None): +            media_name = 'media-%s' % name +            media_group = item.get('media-group') or item +            return media_group.get(media_name) or item.get(media_name) or item.get(name, default) + +        thumbnails = [] +        media_thumbnail = get_media_node('thumbnail') +        if media_thumbnail: +            if isinstance(media_thumbnail, dict): +                media_thumbnail = [media_thumbnail] +            for thumbnail_data in media_thumbnail: +                thumbnail = thumbnail_data['@attributes'] +                thumbnails.append({ +                    'url': self._proto_relative_url(thumbnail['url'], 'http:'), +                    'width': int_or_none(thumbnail.get('width')), +                    'height': int_or_none(thumbnail.get('height')), +                }) + +        subtitles = {} +        media_subtitle = get_media_node('subTitle') +        if media_subtitle: +            if isinstance(media_subtitle, dict): +                media_subtitle = [media_subtitle] +            for subtitle_data in media_subtitle: +                subtitle = subtitle_data['@attributes'] +                lang = subtitle.get('lang') or 'en' +                subtitles[lang] = [{'url': subtitle['href']}] + +        formats = [] +        media_content = get_media_node('content') +        if isinstance(media_content, dict): +            media_content = [media_content] +        for media_data in media_content: +            media = media_data['@attributes'] +            media_type = media['type'] +            if media_type == 'video/f4m': +                f4m_formats = self._extract_f4m_formats( +                    media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', +                    video_id, f4m_id='hds', fatal=False) +                if f4m_formats: +                    formats.extend(f4m_formats) +            elif media_type == 'application/x-mpegURL': +                m3u8_formats = self._extract_m3u8_formats( +                    media['url'], video_id, 'mp4', m3u8_id='hls', fatal=False) +                if m3u8_formats: +                    formats.extend(m3u8_formats) +            else: +                formats.append({ +                    'format_id': media_data['media-category']['@attributes']['label'], +                    'url': media['url'], +                    'tbr': int_or_none(media.get('bitrate')), +                    'filesize': int_or_none(media.get('fileSize')), +                }) + +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': get_media_node('title'), +            'description': get_media_node('description'), +            'thumbnails': thumbnails, +            'timestamp': parse_iso8601(item.get('pubDate'), ' '), +            'duration': int_or_none(media_content[0].get('@attributes', {}).get('duration')), +            'formats': formats, +        } diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index f68dc3236..62ed0c918 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -11,6 +11,7 @@ from ..utils import (  class AppleTrailersIE(InfoExtractor): +    IE_NAME = 'appletrailers'      _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)'      _TESTS = [{          'url': 'http://trailers.apple.com/trailers/wb/manofsteel/', @@ -64,6 +65,12 @@ class AppleTrailersIE(InfoExtractor):              },          ]      }, { +        'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/', +        'info_dict': { +            'id': 'blackthorn', +        }, +        'playlist_mincount': 2, +    }, {          'url': 'http://trailers.apple.com/ca/metropole/autrui/',          'only_matching': True,      }] @@ -79,7 +86,7 @@ class AppleTrailersIE(InfoExtractor):          def fix_html(s):              s = re.sub(r'(?s)<script[^<]*?>.*?</script>', '', s) -            s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s) +            s = re.sub(r'<img ([^<]*?)/?>', r'<img \1/>', s)              # The ' in the onClick attributes are not escaped, it couldn't be parsed              # like: http://trailers.apple.com/trailers/wb/gravity/ @@ -96,6 +103,9 @@ class AppleTrailersIE(InfoExtractor):              trailer_info_json = self._search_regex(self._JSON_RE,                                                     on_click, 'trailer info')              trailer_info = json.loads(trailer_info_json) +            first_url = trailer_info.get('url') +            if not first_url: +                continue              title = trailer_info['title']              video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()              thumbnail = li.find('.//img').attrib['src'] @@ -107,7 +117,6 @@ class AppleTrailersIE(InfoExtractor):              if m:                  duration = 60 * int(m.group('minutes')) + int(m.group('seconds')) -            first_url = trailer_info['url']              trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower()              settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id)              settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json') @@ -144,3 +153,76 @@ class AppleTrailersIE(InfoExtractor):              'id': movie,              'entries': playlist,          } + + +class AppleTrailersSectionIE(InfoExtractor): +    IE_NAME = 'appletrailers:section' +    _SECTIONS = { +        'justadded': { +            'feed_path': 'just_added', +            'title': 'Just Added', +        }, +        'exclusive': { +            'feed_path': 'exclusive', +            'title': 'Exclusive', +        }, +        'justhd': { +            'feed_path': 'just_hd', +            'title': 'Just HD', +        }, +        'mostpopular': { +            'feed_path': 'most_pop', +            'title': 'Most Popular', +        }, +        'moviestudios': { +            'feed_path': 'studios', +            'title': 'Movie Studios', +        }, +    } +    _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/#section=(?P<id>%s)' % '|'.join(_SECTIONS) +    _TESTS = [{ +        'url': 'http://trailers.apple.com/#section=justadded', +        'info_dict': { +            'title': 'Just Added', +            'id': 'justadded', +        }, +        'playlist_mincount': 80, +    }, { +        'url': 'http://trailers.apple.com/#section=exclusive', +        'info_dict': { +            'title': 'Exclusive', +            'id': 'exclusive', +        }, +        'playlist_mincount': 80, +    }, { +        'url': 'http://trailers.apple.com/#section=justhd', +        'info_dict': { +            'title': 'Just HD', +            'id': 'justhd', +        }, +        'playlist_mincount': 80, +    }, { +        'url': 'http://trailers.apple.com/#section=mostpopular', +        'info_dict': { +            'title': 'Most Popular', +            'id': 'mostpopular', +        }, +        'playlist_mincount': 80, +    }, { +        'url': 'http://trailers.apple.com/#section=moviestudios', +        'info_dict': { +            'title': 'Movie Studios', +            'id': 'moviestudios', +        }, +        'playlist_mincount': 80, +    }] + +    def _real_extract(self, url): +        section = self._match_id(url) +        section_data = self._download_json( +            'http://trailers.apple.com/trailers/home/feeds/%s.json' % self._SECTIONS[section]['feed_path'], +            section) +        entries = [ +            self.url_result('http://trailers.apple.com' + e['location']) +            for e in section_data] +        return self.playlist_result(entries, section, self._SECTIONS[section]['title']) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 2a00da3ee..10301a8ea 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -68,9 +68,13 @@ class ArteTVPlus7IE(InfoExtractor):      def _extract_url_info(cls, url):          mobj = re.match(cls._VALID_URL, url)          lang = mobj.group('lang') -        # This is not a real id, it can be for example AJT for the news -        # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal -        video_id = mobj.group('id') +        query = compat_parse_qs(compat_urllib_parse_urlparse(url).query) +        if 'vid' in query: +            video_id = query['vid'][0] +        else: +            # This is not a real id, it can be for example AJT for the news +            # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal +            video_id = mobj.group('id')          return video_id, lang      def _real_extract(self, url): @@ -79,9 +83,15 @@ class ArteTVPlus7IE(InfoExtractor):          return self._extract_from_webpage(webpage, video_id, lang)      def _extract_from_webpage(self, webpage, video_id, lang): +        patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']') +        ids = (video_id, '') +        # some pages contain multiple videos (like +        # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D), +        # so we first try to look for json URLs that contain the video id from +        # the 'vid' parameter. +        patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates]          json_url = self._html_search_regex( -            [r'arte_vp_url=["\'](.*?)["\']', r'data-url=["\']([^"]+)["\']'], -            webpage, 'json vp url', default=None) +            patterns, webpage, 'json vp url', default=None)          if not json_url:              iframe_url = self._html_search_regex(                  r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1', diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index 50e47ba0a..7ac3044c7 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -2,6 +2,8 @@ from __future__ import unicode_literals  import time  import hmac +import hashlib +import re  from .common import InfoExtractor  from ..compat import ( @@ -32,6 +34,19 @@ class AtresPlayerIE(InfoExtractor):                  'duration': 5527.6,                  'thumbnail': 're:^https?://.*\.jpg$',              }, +            'skip': 'This video is only available for registered users' +        }, +        { +            'url': 'http://www.atresplayer.com/television/especial/videoencuentros/temporada-1/capitulo-112-david-bustamante_2014121600375.html', +            'md5': '0d0e918533bbd4b263f2de4d197d4aac', +            'info_dict': { +                'id': 'capitulo-112-david-bustamante', +                'ext': 'flv', +                'title': 'David Bustamante', +                'description': 'md5:f33f1c0a05be57f6708d4dd83a3b81c6', +                'duration': 1439.0, +                'thumbnail': 're:^https?://.*\.jpg$', +            },          },          {              'url': 'http://www.atresplayer.com/television/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_2014122400174.html', @@ -50,6 +65,13 @@ class AtresPlayerIE(InfoExtractor):      _LOGIN_URL = 'https://servicios.atresplayer.com/j_spring_security_check' +    _ERRORS = { +        'UNPUBLISHED': 'We\'re sorry, but this video is not yet available.', +        'DELETED': 'This video has expired and is no longer available for online streaming.', +        'GEOUNPUBLISHED': 'We\'re sorry, but this video is not available in your region due to right restrictions.', +        # 'PREMIUM': 'PREMIUM', +    } +      def _real_initialize(self):          self._login() @@ -83,58 +105,81 @@ class AtresPlayerIE(InfoExtractor):          episode_id = self._search_regex(              r'episode="([^"]+)"', webpage, 'episode id') +        request = sanitized_Request( +            self._PLAYER_URL_TEMPLATE % episode_id, +            headers={'User-Agent': self._USER_AGENT}) +        player = self._download_json(request, episode_id, 'Downloading player JSON') + +        episode_type = player.get('typeOfEpisode') +        error_message = self._ERRORS.get(episode_type) +        if error_message: +            raise ExtractorError( +                '%s returned error: %s' % (self.IE_NAME, error_message), expected=True) + +        formats = [] +        video_url = player.get('urlVideo') +        if video_url: +            format_info = { +                'url': video_url, +                'format_id': 'http', +            } +            mobj = re.search(r'(?P<bitrate>\d+)K_(?P<width>\d+)x(?P<height>\d+)', video_url) +            if mobj: +                format_info.update({ +                    'width': int_or_none(mobj.group('width')), +                    'height': int_or_none(mobj.group('height')), +                    'tbr': int_or_none(mobj.group('bitrate')), +                }) +            formats.append(format_info) + +        m3u8_url = player.get('urlVideoHls') +        if m3u8_url: +            m3u8_formats = self._extract_m3u8_formats( +                m3u8_url, episode_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) +            if m3u8_formats: +                formats.extend(m3u8_formats) +          timestamp = int_or_none(self._download_webpage(              self._TIME_API_URL,              video_id, 'Downloading timestamp', fatal=False), 1000, time.time())          timestamp_shifted = compat_str(timestamp + self._TIMESTAMP_SHIFT)          token = hmac.new(              self._MAGIC.encode('ascii'), -            (episode_id + timestamp_shifted).encode('utf-8') +            (episode_id + timestamp_shifted).encode('utf-8'), hashlib.md5          ).hexdigest() -        formats = [] -        for fmt in ['windows', 'android_tablet']: -            request = sanitized_Request( -                self._URL_VIDEO_TEMPLATE.format(fmt, episode_id, timestamp_shifted, token)) -            request.add_header('User-Agent', self._USER_AGENT) - -            fmt_json = self._download_json( -                request, video_id, 'Downloading %s video JSON' % fmt) - -            result = fmt_json.get('resultDes') -            if result.lower() != 'ok': -                raise ExtractorError( -                    '%s returned error: %s' % (self.IE_NAME, result), expected=True) - -            for format_id, video_url in fmt_json['resultObject'].items(): -                if format_id == 'token' or not video_url.startswith('http'): -                    continue -                if video_url.endswith('/Manifest'): -                    if 'geodeswowsmpra3player' in video_url: -                        f4m_path = video_url.split('smil:', 1)[-1].split('free_', 1)[0] -                        f4m_url = 'http://drg.antena3.com/{0}hds/es/sd.f4m'.format(f4m_path) -                        # this videos are protected by DRM, the f4m downloader doesn't support them -                        continue -                    else: -                        f4m_url = video_url[:-9] + '/manifest.f4m' -                    formats.extend(self._extract_f4m_formats(f4m_url, video_id)) -                else: -                    formats.append({ -                        'url': video_url, -                        'format_id': 'android-%s' % format_id, -                        'preference': 1, -                    }) -        self._sort_formats(formats) +        request = sanitized_Request( +            self._URL_VIDEO_TEMPLATE.format('windows', episode_id, timestamp_shifted, token), +            headers={'User-Agent': self._USER_AGENT}) -        player = self._download_json( -            self._PLAYER_URL_TEMPLATE % episode_id, -            episode_id) +        fmt_json = self._download_json( +            request, video_id, 'Downloading windows video JSON') + +        result = fmt_json.get('resultDes') +        if result.lower() != 'ok': +            raise ExtractorError( +                '%s returned error: %s' % (self.IE_NAME, result), expected=True) + +        for format_id, video_url in fmt_json['resultObject'].items(): +            if format_id == 'token' or not video_url.startswith('http'): +                continue +            if 'geodeswowsmpra3player' in video_url: +                f4m_path = video_url.split('smil:', 1)[-1].split('free_', 1)[0] +                f4m_url = 'http://drg.antena3.com/{0}hds/es/sd.f4m'.format(f4m_path) +                # this videos are protected by DRM, the f4m downloader doesn't support them +                continue +            else: +                f4m_url = video_url[:-9] + '/manifest.f4m' +            f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False) +            if f4m_formats: +                formats.extend(f4m_formats) +        self._sort_formats(formats)          path_data = player.get('pathData')          episode = self._download_xml( -            self._EPISODE_URL_TEMPLATE % path_data, -            video_id, 'Downloading episode XML') +            self._EPISODE_URL_TEMPLATE % path_data, video_id, +            'Downloading episode XML')          duration = float_or_none(xpath_text(              episode, './media/asset/info/technical/contentDuration', 'duration')) diff --git a/youtube_dl/extractor/audimedia.py b/youtube_dl/extractor/audimedia.py index b0b089dee..4382a302b 100644 --- a/youtube_dl/extractor/audimedia.py +++ b/youtube_dl/extractor/audimedia.py @@ -15,7 +15,7 @@ class AudiMediaIE(InfoExtractor):          'url': 'https://audimedia.tv/en/vid/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test',          'md5': '79a8b71c46d49042609795ab59779b66',          'info_dict': { -            'id': '1564', +            'id': '1565',              'ext': 'mp4',              'title': '60 Seconds of Audi Sport 104/2015 - WEC Bahrain, Rookie Test',              'description': 'md5:60e5d30a78ced725f7b8d34370762941', diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py index 693ba22c6..3eed91279 100644 --- a/youtube_dl/extractor/audiomack.py +++ b/youtube_dl/extractor/audiomack.py @@ -56,7 +56,7 @@ class AudiomackIE(InfoExtractor):          # API is inconsistent with errors          if 'url' not in api_response or not api_response['url'] or 'error' in api_response: -            raise ExtractorError('Invalid url %s', url) +            raise ExtractorError('Invalid url %s' % url)          # Audiomack wraps a lot of soundcloud tracks in their branded wrapper          # if so, pass the work off to the soundcloud extractor diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py new file mode 100644 index 000000000..38bda3af5 --- /dev/null +++ b/youtube_dl/extractor/bleacherreport.py @@ -0,0 +1,106 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .amp import AMPIE +from ..utils import ( +    ExtractorError, +    int_or_none, +    parse_iso8601, +) + + +class BleacherReportIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/articles/(?P<id>\d+)' +    _TESTS = [{ +        'url': 'http://bleacherreport.com/articles/2496438-fsu-stat-projections-is-jalen-ramsey-best-defensive-player-in-college-football', +        'md5': 'a3ffc3dc73afdbc2010f02d98f990f20', +        'info_dict': { +            'id': '2496438', +            'ext': 'mp4', +            'title': 'FSU Stat Projections: Is Jalen Ramsey Best Defensive Player in College Football?', +            'uploader_id': 3992341, +            'description': 'CFB, ACC, Florida State', +            'timestamp': 1434380212, +            'upload_date': '20150615', +            'uploader': 'Team Stream Now ', +        }, +        'add_ie': ['Ooyala'], +    }, { +        'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo', +        'md5': 'af5f90dc9c7ba1c19d0a3eac806bbf50', +        'info_dict': { +            'id': '2586817', +            'ext': 'mp4', +            'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo', +            'timestamp': 1446839961, +            'uploader': 'Sean Fay', +            'description': 'md5:825e94e0f3521df52fa83b2ed198fa20', +            'uploader_id': 6466954, +            'upload_date': '20151011', +        }, +        'add_ie': ['Youtube'], +    }] + +    def _real_extract(self, url): +        article_id = self._match_id(url) + +        article_data = self._download_json('http://api.bleacherreport.com/api/v1/articles/%s' % article_id, article_id)['article'] + +        thumbnails = [] +        primary_photo = article_data.get('primaryPhoto') +        if primary_photo: +            thumbnails = [{ +                'url': primary_photo['url'], +                'width': primary_photo.get('width'), +                'height': primary_photo.get('height'), +            }] + +        info = { +            '_type': 'url_transparent', +            'id': article_id, +            'title': article_data['title'], +            'uploader': article_data.get('author', {}).get('name'), +            'uploader_id': article_data.get('authorId'), +            'timestamp': parse_iso8601(article_data.get('createdAt')), +            'thumbnails': thumbnails, +            'comment_count': int_or_none(article_data.get('commentsCount')), +            'view_count': int_or_none(article_data.get('hitCount')), +        } + +        video = article_data.get('video') +        if video: +            video_type = video['type'] +            if video_type == 'cms.bleacherreport.com': +                info['url'] = 'http://bleacherreport.com/video_embed?id=%s' % video['id'] +            elif video_type == 'ooyala.com': +                info['url'] = 'ooyala:%s' % video['id'] +            elif video_type == 'youtube.com': +                info['url'] = video['id'] +            elif video_type == 'vine.co': +                info['url'] = 'https://vine.co/v/%s' % video['id'] +            else: +                info['url'] = video_type + video['id'] +            return info +        else: +            raise ExtractorError('no video in the article', expected=True) + + +class BleacherReportCMSIE(AMPIE): +    _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36})' +    _TESTS = [{ +        'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1', +        'md5': '8c2c12e3af7805152675446c905d159b', +        'info_dict': { +            'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', +            'ext': 'flv', +            'title': 'Cena vs. Rollins Would Expose the Heavyweight Division', +            'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e', +        }, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        info = self._extract_feed_info('http://cms.bleacherreport.com/media/items/%s/akamai.json' % video_id) +        info['id'] = video_id +        return info diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py deleted file mode 100644 index 35375f7b1..000000000 --- a/youtube_dl/extractor/bliptv.py +++ /dev/null @@ -1,290 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - -from ..compat import compat_urlparse -from ..utils import ( -    clean_html, -    int_or_none, -    parse_iso8601, -    sanitized_Request, -    unescapeHTML, -    xpath_text, -    xpath_with_ns, -) - - -class BlipTVIE(InfoExtractor): -    _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P<id>\d+)|((?:play/|api\.swf#)(?P<lookup_id>[\da-zA-Z+_]+)))' - -    _TESTS = [ -        { -            'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352', -            'md5': '80baf1ec5c3d2019037c1c707d676b9f', -            'info_dict': { -                'id': '5779306', -                'ext': 'm4v', -                'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3', -                'description': 'md5:9bc31f227219cde65e47eeec8d2dc596', -                'timestamp': 1323138843, -                'upload_date': '20111206', -                'uploader': 'cbr', -                'uploader_id': '679425', -                'duration': 81, -            } -        }, -        { -            # https://github.com/rg3/youtube-dl/pull/2274 -            'note': 'Video with subtitles', -            'url': 'http://blip.tv/play/h6Uag5OEVgI.html', -            'md5': '309f9d25b820b086ca163ffac8031806', -            'info_dict': { -                'id': '6586561', -                'ext': 'mp4', -                'title': 'Red vs. Blue Season 11 Episode 1', -                'description': 'One-Zero-One', -                'timestamp': 1371261608, -                'upload_date': '20130615', -                'uploader': 'redvsblue', -                'uploader_id': '792887', -                'duration': 279, -            } -        }, -        { -            # https://bugzilla.redhat.com/show_bug.cgi?id=967465 -            'url': 'http://a.blip.tv/api.swf#h6Uag5KbVwI', -            'md5': '314e87b1ebe7a48fcbfdd51b791ce5a6', -            'info_dict': { -                'id': '6573122', -                'ext': 'mov', -                'upload_date': '20130520', -                'description': 'Two hapless space marines argue over what to do when they realize they have an astronomically huge problem on their hands.', -                'title': 'Red vs. Blue Season 11 Trailer', -                'timestamp': 1369029609, -                'uploader': 'redvsblue', -                'uploader_id': '792887', -            } -        }, -        { -            'url': 'http://blip.tv/play/gbk766dkj4Yn', -            'md5': 'fe0a33f022d49399a241e84a8ea8b8e3', -            'info_dict': { -                'id': '1749452', -                'ext': 'mp4', -                'upload_date': '20090208', -                'description': 'Witness the first appearance of the Nostalgia Critic character, as Doug reviews the movie Transformers.', -                'title': 'Nostalgia Critic: Transformers', -                'timestamp': 1234068723, -                'uploader': 'NostalgiaCritic', -                'uploader_id': '246467', -            } -        }, -        { -            # https://github.com/rg3/youtube-dl/pull/4404 -            'note': 'Audio only', -            'url': 'http://blip.tv/hilarios-productions/weekly-manga-recap-kingdom-7119982', -            'md5': '76c0a56f24e769ceaab21fbb6416a351', -            'info_dict': { -                'id': '7103299', -                'ext': 'flv', -                'title': 'Weekly Manga Recap: Kingdom', -                'description': 'And then Shin breaks the enemy line, and he's all like HWAH! And then he slices a guy and it's all like FWASHING! And... it's really hard to describe the best parts of this series without breaking down into sound effects, okay?', -                'timestamp': 1417660321, -                'upload_date': '20141204', -                'uploader': 'The Rollo T', -                'uploader_id': '407429', -                'duration': 7251, -                'vcodec': 'none', -            } -        }, -        { -            # missing duration -            'url': 'http://blip.tv/rss/flash/6700880', -            'info_dict': { -                'id': '6684191', -                'ext': 'm4v', -                'title': 'Cowboy Bebop: Gateway Shuffle Review', -                'description': 'md5:3acc480c0f9ae157f5fe88547ecaf3f8', -                'timestamp': 1386639757, -                'upload_date': '20131210', -                'uploader': 'sfdebris', -                'uploader_id': '706520', -            } -        } -    ] - -    @staticmethod -    def _extract_url(webpage): -        mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage) -        if mobj: -            return 'http://blip.tv/a/a-' + mobj.group(1) -        mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage) -        if mobj: -            return mobj.group(1) - -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        lookup_id = mobj.group('lookup_id') - -        # See https://github.com/rg3/youtube-dl/issues/857 and -        # https://github.com/rg3/youtube-dl/issues/4197 -        if lookup_id: -            urlh = self._request_webpage( -                'http://blip.tv/play/%s' % lookup_id, lookup_id, 'Resolving lookup id') -            url = compat_urlparse.urlparse(urlh.geturl()) -            qs = compat_urlparse.parse_qs(url.query) -            mobj = re.match(self._VALID_URL, qs['file'][0]) - -        video_id = mobj.group('id') - -        rss = self._download_xml('http://blip.tv/rss/flash/%s' % video_id, video_id, 'Downloading video RSS') - -        def _x(p): -            return xpath_with_ns(p, { -                'blip': 'http://blip.tv/dtd/blip/1.0', -                'media': 'http://search.yahoo.com/mrss/', -                'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', -            }) - -        item = rss.find('channel/item') - -        video_id = xpath_text(item, _x('blip:item_id'), 'video id') or lookup_id -        title = xpath_text(item, 'title', 'title', fatal=True) -        description = clean_html(xpath_text(item, _x('blip:puredescription'), 'description')) -        timestamp = parse_iso8601(xpath_text(item, _x('blip:datestamp'), 'timestamp')) -        uploader = xpath_text(item, _x('blip:user'), 'uploader') -        uploader_id = xpath_text(item, _x('blip:userid'), 'uploader id') -        duration = int_or_none(xpath_text(item, _x('blip:runtime'), 'duration')) -        media_thumbnail = item.find(_x('media:thumbnail')) -        thumbnail = (media_thumbnail.get('url') if media_thumbnail is not None -                     else xpath_text(item, 'image', 'thumbnail')) -        categories = [category.text for category in item.findall('category') if category is not None] - -        formats = [] -        subtitles_urls = {} - -        media_group = item.find(_x('media:group')) -        for media_content in media_group.findall(_x('media:content')): -            url = media_content.get('url') -            role = media_content.get(_x('blip:role')) -            msg = self._download_webpage( -                url + '?showplayer=20140425131715&referrer=http://blip.tv&mask=7&skin=flashvars&view=url', -                video_id, 'Resolving URL for %s' % role) -            real_url = compat_urlparse.parse_qs(msg.strip())['message'][0] - -            media_type = media_content.get('type') -            if media_type == 'text/srt' or url.endswith('.srt'): -                LANGS = { -                    'english': 'en', -                } -                lang = role.rpartition('-')[-1].strip().lower() -                langcode = LANGS.get(lang, lang) -                subtitles_urls[langcode] = url -            elif media_type.startswith('video/'): -                formats.append({ -                    'url': real_url, -                    'format_id': role, -                    'format_note': media_type, -                    'vcodec': media_content.get(_x('blip:vcodec')) or 'none', -                    'acodec': media_content.get(_x('blip:acodec')), -                    'filesize': media_content.get('filesize'), -                    'width': int_or_none(media_content.get('width')), -                    'height': int_or_none(media_content.get('height')), -                }) -        self._check_formats(formats, video_id) -        self._sort_formats(formats) - -        subtitles = self.extract_subtitles(video_id, subtitles_urls) - -        return { -            'id': video_id, -            'title': title, -            'description': description, -            'timestamp': timestamp, -            'uploader': uploader, -            'uploader_id': uploader_id, -            'duration': duration, -            'thumbnail': thumbnail, -            'categories': categories, -            'formats': formats, -            'subtitles': subtitles, -        } - -    def _get_subtitles(self, video_id, subtitles_urls): -        subtitles = {} -        for lang, url in subtitles_urls.items(): -            # For some weird reason, blip.tv serves a video instead of subtitles -            # when we request with a common UA -            req = sanitized_Request(url) -            req.add_header('User-Agent', 'youtube-dl') -            subtitles[lang] = [{ -                # The extension is 'srt' but it's actually an 'ass' file -                'ext': 'ass', -                'data': self._download_webpage(req, None, note=False), -            }] -        return subtitles - - -class BlipTVUserIE(InfoExtractor): -    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?blip\.tv/)|bliptvuser:)(?!api\.swf)([^/]+)/*$' -    _PAGE_SIZE = 12 -    IE_NAME = 'blip.tv:user' -    _TEST = { -        'url': 'http://blip.tv/actone', -        'info_dict': { -            'id': 'actone', -            'title': 'Act One: The Series', -        }, -        'playlist_count': 5, -    } - -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        username = mobj.group(1) - -        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1' - -        page = self._download_webpage(url, username, 'Downloading user page') -        mobj = re.search(r'data-users-id="([^"]+)"', page) -        page_base = page_base % mobj.group(1) -        title = self._og_search_title(page) - -        # Download video ids using BlipTV Ajax calls. Result size per -        # query is limited (currently to 12 videos) so we need to query -        # page by page until there are no video ids - it means we got -        # all of them. - -        video_ids = [] -        pagenum = 1 - -        while True: -            url = page_base + "&page=" + str(pagenum) -            page = self._download_webpage( -                url, username, 'Downloading video ids from page %d' % pagenum) - -            # Extract video identifiers -            ids_in_page = [] - -            for mobj in re.finditer(r'href="/([^"]+)"', page): -                if mobj.group(1) not in ids_in_page: -                    ids_in_page.append(unescapeHTML(mobj.group(1))) - -            video_ids.extend(ids_in_page) - -            # A little optimization - if current page is not -            # "full", ie. does not contain PAGE_SIZE video ids then -            # we can assume that this page is the last one - there -            # are no more ids on further pages - no need to query -            # again. - -            if len(ids_in_page) < self._PAGE_SIZE: -                break - -            pagenum += 1 - -        urls = ['http://blip.tv/%s' % video_id for video_id in video_ids] -        url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls] -        return self.playlist_result( -            url_entries, playlist_title=title, playlist_id=username) diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 66e394e10..e66854538 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -1,18 +1,21 @@  # coding: utf-8  from __future__ import unicode_literals +import re +  from .common import InfoExtractor  from ..utils import (      ExtractorError,      int_or_none,      parse_duration, +    xpath_element, +    xpath_text,  )  class BRIE(InfoExtractor):      IE_DESC = 'Bayerischer Rundfunk Mediathek' -    _VALID_URL = r'https?://(?:www\.)?br\.de/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html' -    _BASE_URL = 'http://www.br.de' +    _VALID_URL = r'(?P<base_url>https?://(?:www\.)?br(?:-klassik)?\.de)/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html'      _TESTS = [          { @@ -22,7 +25,7 @@ class BRIE(InfoExtractor):                  'id': '48f656ef-287e-486f-be86-459122db22cc',                  'ext': 'mp4',                  'title': 'Die böse Überraschung', -                'description': 'Betriebliche Altersvorsorge: Die böse Überraschung', +                'description': 'md5:ce9ac81b466ce775b8018f6801b48ac9',                  'duration': 180,                  'uploader': 'Reinhard Weber',                  'upload_date': '20150422', @@ -30,23 +33,23 @@ class BRIE(InfoExtractor):          },          {              'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html', -            'md5': 'a44396d73ab6a68a69a568fae10705bb', +            'md5': 'af3a3a4aa43ff0ce6a89504c67f427ef',              'info_dict': {                  'id': 'a4b83e34-123d-4b81-9f4e-c0d3121a4e05', -                'ext': 'mp4', +                'ext': 'flv',                  'title': 'Manfred Schreiber ist tot', -                'description': 'Abendschau kompakt: Manfred Schreiber ist tot', +                'description': 'md5:b454d867f2a9fc524ebe88c3f5092d97',                  'duration': 26,              }          },          { -            'url': 'http://www.br.de/radio/br-klassik/sendungen/allegro/premiere-urauffuehrung-the-land-2015-dance-festival-muenchen-100.html', +            'url': 'https://www.br-klassik.de/audio/peeping-tom-premierenkritik-dance-festival-muenchen-100.html',              'md5': '8b5b27c0b090f3b35eac4ab3f7a73d3d',              'info_dict': {                  'id': '74c603c9-26d3-48bb-b85b-079aeed66e0b',                  'ext': 'aac',                  'title': 'Kurzweilig und sehr bewegend', -                'description': '"The Land" von Peeping Tom: Kurzweilig und sehr bewegend', +                'description': 'md5:0351996e3283d64adeb38ede91fac54e',                  'duration': 296,              }          }, @@ -57,7 +60,7 @@ class BRIE(InfoExtractor):                  'id': '6ba73750-d405-45d3-861d-1ce8c524e059',                  'ext': 'mp4',                  'title': 'Umweltbewusster Häuslebauer', -                'description': 'Uwe Erdelt: Umweltbewusster Häuslebauer', +                'description': 'md5:d52dae9792d00226348c1dbb13c9bae2',                  'duration': 116,              }          }, @@ -68,7 +71,7 @@ class BRIE(InfoExtractor):                  'id': 'd982c9ce-8648-4753-b358-98abb8aec43d',                  'ext': 'mp4',                  'title': 'Folge 1 - Metaphysik', -                'description': 'Kant für Anfänger: Folge 1 - Metaphysik', +                'description': 'md5:bb659990e9e59905c3d41e369db1fbe3',                  'duration': 893,                  'uploader': 'Eva Maria Steimle',                  'upload_date': '20140117', @@ -77,28 +80,31 @@ class BRIE(InfoExtractor):      ]      def _real_extract(self, url): -        display_id = self._match_id(url) +        base_url, display_id = re.search(self._VALID_URL, url).groups()          page = self._download_webpage(url, display_id)          xml_url = self._search_regex(              r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL') -        xml = self._download_xml(self._BASE_URL + xml_url, None) +        xml = self._download_xml(base_url + xml_url, display_id)          medias = []          for xml_media in xml.findall('video') + xml.findall('audio'): +            media_id = xml_media.get('externalId')              media = { -                'id': xml_media.get('externalId'), -                'title': xml_media.find('title').text, -                'duration': parse_duration(xml_media.find('duration').text), -                'formats': self._extract_formats(xml_media.find('assets')), -                'thumbnails': self._extract_thumbnails(xml_media.find('teaserImage/variants')), -                'description': ' '.join(xml_media.find('shareTitle').text.splitlines()), -                'webpage_url': xml_media.find('permalink').text +                'id': media_id, +                'title': xpath_text(xml_media, 'title', 'title', True), +                'duration': parse_duration(xpath_text(xml_media, 'duration')), +                'formats': self._extract_formats(xpath_element( +                    xml_media, 'assets'), media_id), +                'thumbnails': self._extract_thumbnails(xpath_element( +                    xml_media, 'teaserImage/variants'), base_url), +                'description': xpath_text(xml_media, 'desc'), +                'webpage_url': xpath_text(xml_media, 'permalink'), +                'uploader': xpath_text(xml_media, 'author'),              } -            if xml_media.find('author').text: -                media['uploader'] = xml_media.find('author').text -            if xml_media.find('broadcastDate').text: -                media['upload_date'] = ''.join(reversed(xml_media.find('broadcastDate').text.split('.'))) +            broadcast_date = xpath_text(xml_media, 'broadcastDate') +            if broadcast_date: +                media['upload_date'] = ''.join(reversed(broadcast_date.split('.')))              medias.append(media)          if len(medias) > 1: @@ -109,35 +115,58 @@ class BRIE(InfoExtractor):              raise ExtractorError('No media entries found')          return medias[0] -    def _extract_formats(self, assets): - -        def text_or_none(asset, tag): -            elem = asset.find(tag) -            return None if elem is None else elem.text - -        formats = [{ -            'url': text_or_none(asset, 'downloadUrl'), -            'ext': text_or_none(asset, 'mediaType'), -            'format_id': asset.get('type'), -            'width': int_or_none(text_or_none(asset, 'frameWidth')), -            'height': int_or_none(text_or_none(asset, 'frameHeight')), -            'tbr': int_or_none(text_or_none(asset, 'bitrateVideo')), -            'abr': int_or_none(text_or_none(asset, 'bitrateAudio')), -            'vcodec': text_or_none(asset, 'codecVideo'), -            'acodec': text_or_none(asset, 'codecAudio'), -            'container': text_or_none(asset, 'mediaType'), -            'filesize': int_or_none(text_or_none(asset, 'size')), -        } for asset in assets.findall('asset') -            if asset.find('downloadUrl') is not None] - +    def _extract_formats(self, assets, media_id): +        formats = [] +        for asset in assets.findall('asset'): +            format_url = xpath_text(asset, ['downloadUrl', 'url']) +            asset_type = asset.get('type') +            if asset_type == 'HDS': +                f4m_formats = self._extract_f4m_formats( +                    format_url + '?hdcore=3.2.0', media_id, f4m_id='hds', fatal=False) +                if f4m_formats: +                    formats.extend(f4m_formats) +            elif asset_type == 'HLS': +                m3u8_formats = self._extract_m3u8_formats( +                    format_url, media_id, 'mp4', 'm3u8_native', m3u8_id='hds', fatal=False) +                if m3u8_formats: +                    formats.extend(m3u8_formats) +            else: +                format_info = { +                    'ext': xpath_text(asset, 'mediaType'), +                    'width': int_or_none(xpath_text(asset, 'frameWidth')), +                    'height': int_or_none(xpath_text(asset, 'frameHeight')), +                    'tbr': int_or_none(xpath_text(asset, 'bitrateVideo')), +                    'abr': int_or_none(xpath_text(asset, 'bitrateAudio')), +                    'vcodec': xpath_text(asset, 'codecVideo'), +                    'acodec': xpath_text(asset, 'codecAudio'), +                    'container': xpath_text(asset, 'mediaType'), +                    'filesize': int_or_none(xpath_text(asset, 'size')), +                } +                format_url = self._proto_relative_url(format_url) +                if format_url: +                    http_format_info = format_info.copy() +                    http_format_info.update({ +                        'url': format_url, +                        'format_id': 'http-%s' % asset_type, +                    }) +                    formats.append(http_format_info) +                server_prefix = xpath_text(asset, 'serverPrefix') +                if server_prefix: +                    rtmp_format_info = format_info.copy() +                    rtmp_format_info.update({ +                        'url': server_prefix, +                        'play_path': xpath_text(asset, 'fileName'), +                        'format_id': 'rtmp-%s' % asset_type, +                    }) +                    formats.append(rtmp_format_info)          self._sort_formats(formats)          return formats -    def _extract_thumbnails(self, variants): +    def _extract_thumbnails(self, variants, base_url):          thumbnails = [{ -            'url': self._BASE_URL + variant.find('url').text, -            'width': int_or_none(variant.find('width').text), -            'height': int_or_none(variant.find('height').text), -        } for variant in variants.findall('variant')] +            'url': base_url + xpath_text(variant, 'url'), +            'width': int_or_none(xpath_text(variant, 'width')), +            'height': int_or_none(xpath_text(variant, 'height')), +        } for variant in variants.findall('variant') if xpath_text(variant, 'url')]          thumbnails.sort(key=lambda x: x['width'] * x['height'], reverse=True)          return thumbnails diff --git a/youtube_dl/extractor/canal13cl.py b/youtube_dl/extractor/canal13cl.py deleted file mode 100644 index 93241fefe..000000000 --- a/youtube_dl/extractor/canal13cl.py +++ /dev/null @@ -1,48 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class Canal13clIE(InfoExtractor): -    _VALID_URL = r'^http://(?:www\.)?13\.cl/(?:[^/?#]+/)*(?P<id>[^/?#]+)' -    _TEST = { -        'url': 'http://www.13.cl/t13/nacional/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', -        'md5': '4cb1fa38adcad8fea88487a078831755', -        'info_dict': { -            'id': '1403022125', -            'display_id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', -            'ext': 'mp4', -            'title': 'El "círculo de hierro" de Michelle Bachelet en su regreso a La Moneda', -            'description': '(Foto: Agencia Uno) En nueve días más, Michelle Bachelet va a asumir por segunda vez como presidenta de la República. Entre aquellos que la acompañarán hay caras que se repiten y otras que se consolidan en su entorno de colaboradores más cercanos.', -        } -    } - -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        display_id = mobj.group('id') - -        webpage = self._download_webpage(url, display_id) - -        title = self._html_search_meta( -            'twitter:title', webpage, 'title', fatal=True) -        description = self._html_search_meta( -            'twitter:description', webpage, 'description') -        url = self._html_search_regex( -            r'articuloVideo = \"(.*?)\"', webpage, 'url') -        real_id = self._search_regex( -            r'[^0-9]([0-9]{7,})[^0-9]', url, 'id', default=display_id) -        thumbnail = self._html_search_regex( -            r'articuloImagen = \"(.*?)\"', webpage, 'thumbnail') - -        return { -            'id': real_id, -            'display_id': display_id, -            'url': url, -            'title': title, -            'description': description, -            'ext': 'mp4', -            'thumbnail': thumbnail, -        } diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index fd1770dac..6d9cd8abd 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -5,7 +5,6 @@ import re  from .common import InfoExtractor  from ..utils import ExtractorError -from .bliptv import BlipTVIE  from .screenwavemedia import ScreenwaveMediaIE @@ -34,18 +33,17 @@ class CinemassacreIE(InfoExtractor):              },          },          { -            # blip.tv embedded video +            # Youtube embedded video              'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/', -            'md5': 'ca9b3c8dd5a66f9375daeb5135f5a3de', +            'md5': 'df4cf8a1dcedaec79a73d96d83b99023',              'info_dict': { -                'id': '4065369', -                'ext': 'flv', +                'id': 'OEVzPCY2T-g', +                'ext': 'mp4',                  'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles',                  'upload_date': '20061207', -                'uploader': 'cinemassacre', -                'uploader_id': '250778', -                'timestamp': 1283233867, -                'description': 'md5:0a108c78d130676b207d0f6d029ecffd', +                'uploader': 'Cinemassacre', +                'uploader_id': 'JamesNintendoNerd', +                'description': 'md5:784734696c2b8b7f4b8625cc799e07f6',              }          },          { @@ -89,8 +87,6 @@ class CinemassacreIE(InfoExtractor):              ],              webpage, 'player data URL', default=None, group='url')          if not playerdata_url: -            playerdata_url = BlipTVIE._extract_url(webpage) -        if not playerdata_url:              raise ExtractorError('Unable to find player data')          video_title = self._html_search_regex( diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py index 81f3d7697..2efa200b5 100644 --- a/youtube_dl/extractor/comcarcoff.py +++ b/youtube_dl/extractor/comcarcoff.py @@ -1,10 +1,12 @@  # encoding: utf-8  from __future__ import unicode_literals -import json -  from .common import InfoExtractor -from ..utils import parse_iso8601 +from ..utils import ( +    int_or_none, +    parse_duration, +    parse_iso8601, +)  class ComCarCoffIE(InfoExtractor): @@ -16,6 +18,7 @@ class ComCarCoffIE(InfoExtractor):              'ext': 'mp4',              'upload_date': '20141127',              'timestamp': 1417107600, +            'duration': 1232,              'title': 'Happy Thanksgiving Miranda',              'description': 'Jerry Seinfeld and his special guest Miranda Sings cruise around town in search of coffee, complaining and apologizing along the way.',              'thumbnail': 'http://ccc.crackle.com/images/s5e4_thumb.jpg', @@ -31,9 +34,10 @@ class ComCarCoffIE(InfoExtractor):              display_id = 'comediansincarsgettingcoffee.com'          webpage = self._download_webpage(url, display_id) -        full_data = json.loads(self._search_regex( -            r'<script type="application/json" id="videoData">(?P<json>.+?)</script>', -            webpage, 'full data json')) +        full_data = self._parse_json( +            self._search_regex( +                r'window\.app\s*=\s*({.+?});\n', webpage, 'full data json'), +            display_id)['videoData']          video_id = full_data['activeVideo']['video']          video_data = full_data.get('videos', {}).get(video_id) or full_data['singleshots'][video_id] @@ -45,12 +49,18 @@ class ComCarCoffIE(InfoExtractor):          formats = self._extract_m3u8_formats(              video_data['mediaUrl'], video_id, ext='mp4') +        timestamp = int_or_none(video_data.get('pubDateTime')) or parse_iso8601( +            video_data.get('pubDate')) +        duration = int_or_none(video_data.get('durationSeconds')) or parse_duration( +            video_data.get('duration')) +          return {              'id': video_id,              'display_id': display_id,              'title': video_data['title'],              'description': video_data.get('description'), -            'timestamp': parse_iso8601(video_data.get('pubDate')), +            'timestamp': timestamp, +            'duration': duration,              'thumbnails': thumbnails,              'formats': formats,              'webpage_url': 'http://comediansincarsgettingcoffee.com/%s' % (video_data.get('urlSlug', video_data.get('slug'))), diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index 934da765e..9a94cf361 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -25,6 +25,18 @@ class DaumIE(InfoExtractor):              'duration': 3868,          },      }, { +        # Test for https://github.com/rg3/youtube-dl/issues/7949 +        'url': 'http://tvpot.daum.net/mypot/View.do?ownerid=M1O35s8HPOo0&clipid=73147290', +        'md5': 'c92d78bcee4424451f1667f275c1dc97', +        'info_dict': { +            'id': '73147290', +            'ext': 'mp4', +            'title': '싸이 - 나팔바지 [유희열의 스케치북] 299회 20151218', +            'description': '싸이 - 나팔바지', +            'upload_date': '20151219', +            'duration': 232, +        }, +    }, {          'url': 'http://tvpot.daum.net/v/vab4dyeDBysyBssyukBUjBz',          'only_matching': True,      }, { @@ -37,9 +49,11 @@ class DaumIE(InfoExtractor):          video_id = mobj.group('id')          canonical_url = 'http://tvpot.daum.net/v/%s' % video_id          webpage = self._download_webpage(canonical_url, video_id) +        og_url = self._og_search_url(webpage, default=None) or self._search_regex( +            r'<link[^>]+rel=(["\'])canonical\1[^>]+href=(["\'])(?P<url>.+?)\2', +            webpage, 'canonical url', group='url')          full_id = self._search_regex( -            r'src=["\']http://videofarm\.daum\.net/controller/video/viewer/Video\.html\?.*?vid=(.+?)[&"\']', -            webpage, 'full id') +            r'tvpot\.daum\.net/v/([^/]+)', og_url, 'full id')          query = compat_urllib_parse.urlencode({'vid': full_id})          info = self._download_xml(              'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id, diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index d836c1a6c..60ed438f8 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals  import itertools -from .common import InfoExtractor +from .amp import AMPIE  from ..compat import (      compat_HTTPError,      compat_urllib_parse, @@ -12,14 +12,11 @@ from ..compat import (  from ..utils import (      ExtractorError,      clean_html, -    determine_ext, -    int_or_none, -    parse_iso8601,      sanitized_Request,  ) -class DramaFeverBaseIE(InfoExtractor): +class DramaFeverBaseIE(AMPIE):      _LOGIN_URL = 'https://www.dramafever.com/accounts/login/'      _NETRC_MACHINE = 'dramafever' @@ -80,60 +77,25 @@ class DramaFeverIE(DramaFeverBaseIE):              'timestamp': 1404336058,              'upload_date': '20140702',              'duration': 343, -        } +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        },      }      def _real_extract(self, url):          video_id = self._match_id(url).replace('/', '.')          try: -            feed = self._download_json( -                'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id, -                video_id, 'Downloading episode JSON')['channel']['item'] +            info = self._extract_feed_info( +                'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id)          except ExtractorError as e:              if isinstance(e.cause, compat_HTTPError):                  raise ExtractorError(                      'Currently unavailable in your country.', expected=True)              raise -        media_group = feed.get('media-group', {}) - -        formats = [] -        for media_content in media_group['media-content']: -            src = media_content.get('@attributes', {}).get('url') -            if not src: -                continue -            ext = determine_ext(src) -            if ext == 'f4m': -                formats.extend(self._extract_f4m_formats( -                    src, video_id, f4m_id='hds')) -            elif ext == 'm3u8': -                formats.extend(self._extract_m3u8_formats( -                    src, video_id, 'mp4', m3u8_id='hls')) -            else: -                formats.append({ -                    'url': src, -                }) -        self._sort_formats(formats) - -        title = media_group.get('media-title') -        description = media_group.get('media-description') -        duration = int_or_none(media_group['media-content'][0].get('@attributes', {}).get('duration')) -        thumbnail = self._proto_relative_url( -            media_group.get('media-thumbnail', {}).get('@attributes', {}).get('url')) -        timestamp = parse_iso8601(feed.get('pubDate'), ' ') - -        subtitles = {} -        for media_subtitle in media_group.get('media-subTitle', []): -            lang = media_subtitle.get('@attributes', {}).get('lang') -            href = media_subtitle.get('@attributes', {}).get('href') -            if not lang or not href: -                continue -            subtitles[lang] = [{ -                'ext': 'ttml', -                'url': href, -            }] -          series_id, episode_number = video_id.split('.')          episode_info = self._download_json(              # We only need a single episode info, so restricting page size to one episode @@ -146,21 +108,12 @@ class DramaFeverIE(DramaFeverBaseIE):              if value:                  subfile = value[0].get('subfile') or value[0].get('new_subfile')                  if subfile and subfile != 'http://www.dramafever.com/st/': -                    subtitles.setdefault('English', []).append({ +                    info['subtitiles'].setdefault('English', []).append({                          'ext': 'srt',                          'url': subfile,                      }) -        return { -            'id': video_id, -            'title': title, -            'description': description, -            'thumbnail': thumbnail, -            'timestamp': timestamp, -            'duration': duration, -            'formats': formats, -            'subtitles': subtitles, -        } +        return info  class DramaFeverSeriesIE(DramaFeverBaseIE): diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py index d9a868119..6f9b003c2 100644 --- a/youtube_dl/extractor/faz.py +++ b/youtube_dl/extractor/faz.py @@ -2,6 +2,11 @@  from __future__ import unicode_literals  from .common import InfoExtractor +from ..utils import ( +    xpath_element, +    xpath_text, +    int_or_none, +)  class FazIE(InfoExtractor): @@ -37,31 +42,32 @@ class FazIE(InfoExtractor):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) +        description = self._og_search_description(webpage)          config_xml_url = self._search_regex( -            r'(?:var\s+)?videoXMLURL\s*=\s*"([^"]+)', webpage, 'config xml url') +            r'videoXMLURL\s*=\s*"([^"]+)', webpage, 'config xml url')          config = self._download_xml(              config_xml_url, video_id, 'Downloading config xml') -        encodings = config.find('ENCODINGS') +        encodings = xpath_element(config, 'ENCODINGS', 'encodings', True)          formats = []          for pref, code in enumerate(['LOW', 'HIGH', 'HQ']): -            encoding = encodings.find(code) -            if encoding is None: -                continue -            encoding_url = encoding.find('FILENAME').text -            formats.append({ -                'url': encoding_url, -                'format_id': code.lower(), -                'quality': pref, -            }) +            encoding = xpath_element(encodings, code) +            if encoding: +                encoding_url = xpath_text(encoding, 'FILENAME') +                if encoding_url: +                    formats.append({ +                        'url': encoding_url, +                        'format_id': code.lower(), +                        'quality': pref, +                        'tbr': int_or_none(xpath_text(encoding, 'AVERAGEBITRATE')), +                    })          self._sort_formats(formats) -        descr = self._html_search_regex( -            r'<p class="Content Copy">(.*?)</p>', webpage, 'description', fatal=False)          return {              'id': video_id,              'title': self._og_search_title(webpage),              'formats': formats, -            'description': descr, -            'thumbnail': config.find('STILL/STILL_BIG').text, +            'description': description.strip() if description else None, +            'thumbnail': xpath_text(config, 'STILL/STILL_BIG'), +            'duration': int_or_none(xpath_text(config, 'DURATION')),          } diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 91cd46e76..18f439df9 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -1,67 +1,93 @@  from __future__ import unicode_literals -import re -  from .common import InfoExtractor +from ..compat import compat_urllib_parse  from ..utils import (      ExtractorError, -    find_xpath_attr, -    sanitized_Request, +    int_or_none, +    qualities,  )  class FlickrIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.|secure\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*' +    _VALID_URL = r'https?://(?:www\.|secure\.)?flickr\.com/photos/[\w\-_@]+/(?P<id>\d+)'      _TEST = {          'url': 'http://www.flickr.com/photos/forestwander-nature-pictures/5645318632/in/photostream/', -        'md5': '6fdc01adbc89d72fc9c4f15b4a4ba87b', +        'md5': '164fe3fa6c22e18d448d4d5af2330f31',          'info_dict': {              'id': '5645318632', -            'ext': 'mp4', -            "description": "Waterfalls in the Springtime at Dark Hollow Waterfalls. These are located just off of Skyline Drive in Virginia. They are only about 6/10 of a mile hike but it is a pretty steep hill and a good climb back up.", -            "uploader_id": "forestwander-nature-pictures", -            "title": "Dark Hollow Waterfalls" +            'ext': 'mpg', +            'description': 'Waterfalls in the Springtime at Dark Hollow Waterfalls. These are located just off of Skyline Drive in Virginia. They are only about 6/10 of a mile hike but it is a pretty steep hill and a good climb back up.', +            'title': 'Dark Hollow Waterfalls', +            'duration': 19, +            'timestamp': 1303528740, +            'upload_date': '20110423', +            'uploader_id': '10922353@N03', +            'uploader': 'Forest Wander', +            'comment_count': int, +            'view_count': int, +            'tags': list,          }      } -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) +    _API_BASE_URL = 'https://api.flickr.com/services/rest?' -        video_id = mobj.group('id') -        video_uploader_id = mobj.group('uploader_id') -        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id -        req = sanitized_Request(webpage_url) -        req.add_header( -            'User-Agent', -            # it needs a more recent version -            'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20150101 Firefox/38.0 (Chrome)') -        webpage = self._download_webpage(req, video_id) +    def _call_api(self, method, video_id, api_key, note, secret=None): +        query = { +            'photo_id': video_id, +            'method': 'flickr.%s' % method, +            'api_key': api_key, +            'format': 'json', +            'nojsoncallback': 1, +        } +        if secret: +            query['secret'] = secret +        data = self._download_json(self._API_BASE_URL + compat_urllib_parse.urlencode(query), video_id, note) +        if data['stat'] != 'ok': +            raise ExtractorError(data['message']) +        return data -        secret = self._search_regex(r'secret"\s*:\s*"(\w+)"', webpage, 'secret') +    def _real_extract(self, url): +        video_id = self._match_id(url) -        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self' -        first_xml = self._download_xml(first_url, video_id, 'Downloading first data webpage') +        api_key = self._download_json( +            'https://www.flickr.com/hermes_error_beacon.gne', video_id, +            'Downloading api key')['site_key'] -        node_id = find_xpath_attr( -            first_xml, './/{http://video.yahoo.com/YEP/1.0/}Item', 'id', -            'id').text +        video_info = self._call_api( +            'photos.getInfo', video_id, api_key, 'Downloading video info')['photo'] +        if video_info['media'] == 'video': +            streams = self._call_api( +                'video.getStreamInfo', video_id, api_key, +                'Downloading streams info', video_info['secret'])['streams'] -        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1' -        second_xml = self._download_xml(second_url, video_id, 'Downloading second data webpage') +            preference = qualities( +                ['288p', 'iphone_wifi', '100', '300', '700', '360p', 'appletv', '720p', '1080p', 'orig']) -        self.report_extraction(video_id) +            formats = [] +            for stream in streams['stream']: +                stream_type = str(stream.get('type')) +                formats.append({ +                    'format_id': stream_type, +                    'url': stream['_content'], +                    'preference': preference(stream_type), +                }) +            self._sort_formats(formats) -        stream = second_xml.find('.//STREAM') -        if stream is None: -            raise ExtractorError('Unable to extract video url') -        video_url = stream.attrib['APP'] + stream.attrib['FULLPATH'] +            owner = video_info.get('owner', {}) -        return { -            'id': video_id, -            'url': video_url, -            'ext': 'mp4', -            'title': self._og_search_title(webpage), -            'description': self._og_search_description(webpage), -            'thumbnail': self._og_search_thumbnail(webpage), -            'uploader_id': video_uploader_id, -        } +            return { +                'id': video_id, +                'title': video_info['title']['_content'], +                'description': video_info.get('description', {}).get('_content'), +                'formats': formats, +                'timestamp': int_or_none(video_info.get('dateuploaded')), +                'duration': int_or_none(video_info.get('video', {}).get('duration')), +                'uploader_id': owner.get('nsid'), +                'uploader': owner.get('realname'), +                'comment_count': int_or_none(video_info.get('comments', {}).get('_content')), +                'view_count': int_or_none(video_info.get('views')), +                'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])] +            } +        else: +            raise ExtractorError('not a video', expected=True) diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 3a4a59135..318ac013d 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -2,14 +2,10 @@ from __future__ import unicode_literals  import re -from .common import InfoExtractor -from ..utils import ( -    parse_iso8601, -    int_or_none, -) +from .amp import AMPIE -class FoxNewsIE(InfoExtractor): +class FoxNewsIE(AMPIE):      IE_DESC = 'Fox News and Fox Business Video'      _VALID_URL = r'https?://(?P<host>video\.fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)'      _TESTS = [ @@ -20,10 +16,10 @@ class FoxNewsIE(InfoExtractor):                  'id': '3937480',                  'ext': 'flv',                  'title': 'Frozen in Time', -                'description': 'Doctors baffled by 16-year-old girl that is the size of a toddler', +                'description': '16-year-old girl is size of toddler',                  'duration': 265, -                'timestamp': 1304411491, -                'upload_date': '20110503', +                # 'timestamp': 1304411491, +                # 'upload_date': '20110503',                  'thumbnail': 're:^https?://.*\.jpg$',              },          }, @@ -34,10 +30,10 @@ class FoxNewsIE(InfoExtractor):                  'id': '3922535568001',                  'ext': 'mp4',                  'title': "Rep. Luis Gutierrez on if Obama's immigration plan is legal", -                'description': "Congressman discusses the president's executive action", +                'description': "Congressman discusses president's plan",                  'duration': 292, -                'timestamp': 1417662047, -                'upload_date': '20141204', +                # 'timestamp': 1417662047, +                # 'upload_date': '20141204',                  'thumbnail': 're:^https?://.*\.jpg$',              },          }, @@ -52,52 +48,9 @@ class FoxNewsIE(InfoExtractor):      ]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') -        host = mobj.group('host') +        host, video_id = re.match(self._VALID_URL, url).groups() -        video = self._download_json( -            'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id), video_id) - -        item = video['channel']['item'] -        title = item['title'] -        description = item['description'] -        timestamp = parse_iso8601(item['dc-date']) - -        media_group = item['media-group'] -        duration = None -        formats = [] -        for media in media_group['media-content']: -            attributes = media['@attributes'] -            video_url = attributes['url'] -            if video_url.endswith('.f4m'): -                formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id)) -            elif video_url.endswith('.m3u8'): -                formats.extend(self._extract_m3u8_formats(video_url, video_id, 'flv')) -            elif not video_url.endswith('.smil'): -                duration = int_or_none(attributes.get('duration')) -                formats.append({ -                    'url': video_url, -                    'format_id': media['media-category']['@attributes']['label'], -                    'preference': 1, -                    'vbr': int_or_none(attributes.get('bitrate')), -                    'filesize': int_or_none(attributes.get('fileSize')) -                }) -        self._sort_formats(formats) - -        media_thumbnail = media_group['media-thumbnail']['@attributes'] -        thumbnails = [{ -            'url': media_thumbnail['url'], -            'width': int_or_none(media_thumbnail.get('width')), -            'height': int_or_none(media_thumbnail.get('height')), -        }] if media_thumbnail else [] - -        return { -            'id': video_id, -            'title': title, -            'description': description, -            'duration': duration, -            'timestamp': timestamp, -            'formats': formats, -            'thumbnails': thumbnails, -        } +        info = self._extract_feed_info( +            'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id)) +        info['id'] = video_id +        return info diff --git a/youtube_dl/extractor/franceinter.py b/youtube_dl/extractor/franceinter.py index 6613ee17a..fdc51f44f 100644 --- a/youtube_dl/extractor/franceinter.py +++ b/youtube_dl/extractor/franceinter.py @@ -1,8 +1,6 @@  # coding: utf-8  from __future__ import unicode_literals -import re -  from .common import InfoExtractor  from ..utils import int_or_none @@ -23,8 +21,7 @@ class FranceInterIE(InfoExtractor):      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) @@ -33,7 +30,7 @@ class FranceInterIE(InfoExtractor):          video_url = 'http://www.franceinter.fr/' + path          title = self._html_search_regex( -            r'<span class="title">(.+?)</span>', webpage, 'title') +            r'<span class="title-diffusion">(.+?)</span>', webpage, 'title')          description = self._html_search_regex(              r'<span class="description">(.*?)</span>',              webpage, 'description', fatal=False) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c2e8f9b62..3c3066e38 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -44,7 +44,6 @@ from .myvi import MyviIE  from .condenast import CondeNastIE  from .udn import UDNEmbedIE  from .senateisvp import SenateISVPIE -from .bliptv import BlipTVIE  from .svt import SVTIE  from .pornhub import PornHubIE  from .xhamster import XHamsterEmbedIE @@ -55,6 +54,8 @@ from .snagfilms import SnagFilmsEmbedIE  from .screenwavemedia import ScreenwaveMediaIE  from .mtv import MTVServicesEmbeddedIE  from .pladform import PladformIE +from .googledrive import GoogleDriveIE +from .jwplatform import JWPlatformIE  class GenericIE(InfoExtractor): @@ -1440,11 +1441,6 @@ class GenericIE(InfoExtractor):                  'id': match.group('id')              } -        # Look for embedded blip.tv player -        bliptv_url = BlipTVIE._extract_url(webpage) -        if bliptv_url: -            return self.url_result(bliptv_url, 'BlipTV') -          # Look for SVT player          svt_url = SVTIE._extract_url(webpage)          if svt_url: @@ -1769,6 +1765,11 @@ class GenericIE(InfoExtractor):          if nbc_sports_url:              return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') +        # Look for Google Drive embeds +        google_drive_url = GoogleDriveIE._extract_url(webpage) +        if google_drive_url: +            return self.url_result(google_drive_url, 'GoogleDrive') +          # Look for UDN embeds          mobj = re.search(              r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage) @@ -1796,6 +1797,11 @@ class GenericIE(InfoExtractor):          if snagfilms_url:              return self.url_result(snagfilms_url) +        # Look for JWPlatform embeds +        jwplatform_url = JWPlatformIE._extract_url(webpage) +        if jwplatform_url: +            return self.url_result(jwplatform_url, 'JWPlatform') +          # Look for ScreenwaveMedia embeds          mobj = re.search(ScreenwaveMediaIE.EMBED_PATTERN, webpage)          if mobj is not None: diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py new file mode 100644 index 000000000..f354c9c7a --- /dev/null +++ b/youtube_dl/extractor/googledrive.py @@ -0,0 +1,88 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    int_or_none, +) + + +class GoogleDriveIE(InfoExtractor): +    _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28})' +    _TEST = { +        'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', +        'md5': '881f7700aec4f538571fa1e0eed4a7b6', +        'info_dict': { +            'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ', +            'ext': 'mp4', +            'title': 'Big Buck Bunny.mp4', +            'duration': 46, +        } +    } +    _FORMATS_EXT = { +        '5': 'flv', +        '6': 'flv', +        '13': '3gp', +        '17': '3gp', +        '18': 'mp4', +        '22': 'mp4', +        '34': 'flv', +        '35': 'flv', +        '36': '3gp', +        '37': 'mp4', +        '38': 'mp4', +        '43': 'webm', +        '44': 'webm', +        '45': 'webm', +        '46': 'webm', +        '59': 'mp4', +    } + +    @staticmethod +    def _extract_url(webpage): +        mobj = re.search( +            r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})', +            webpage) +        if mobj: +            return 'https://drive.google.com/file/d/%s' % mobj.group('id') + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage( +            'http://docs.google.com/file/d/%s' % video_id, video_id, encoding='unicode_escape') + +        reason = self._search_regex(r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) +        if reason: +            raise ExtractorError(reason) + +        title = self._search_regex(r'"title"\s*,\s*"([^"]+)', webpage, 'title') +        duration = int_or_none(self._search_regex( +            r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', default=None)) +        fmt_stream_map = self._search_regex( +            r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, 'fmt stream map').split(',') +        fmt_list = self._search_regex(r'"fmt_list"\s*,\s*"([^"]+)', webpage, 'fmt_list').split(',') + +        formats = [] +        for fmt, fmt_stream in zip(fmt_list, fmt_stream_map): +            fmt_id, fmt_url = fmt_stream.split('|') +            resolution = fmt.split('/')[1] +            width, height = resolution.split('x') +            formats.append({ +                'url': fmt_url, +                'format_id': fmt_id, +                'resolution': resolution, +                'width': int_or_none(width), +                'height': int_or_none(height), +                'ext': self._FORMATS_EXT[fmt_id], +            }) +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': title, +            'thumbnail': self._og_search_thumbnail(webpage), +            'duration': duration, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index 70c8ca64e..85e9344aa 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -13,7 +13,7 @@ from ..utils import (  class ImgurIE(InfoExtractor): -    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!gallery)(?P<id>[a-zA-Z0-9]+)' +    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|topic/[^/]+)/)?(?P<id>[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z]+)?$'      _TESTS = [{          'url': 'https://i.imgur.com/A61SaA1.gifv', @@ -21,7 +21,7 @@ class ImgurIE(InfoExtractor):              'id': 'A61SaA1',              'ext': 'mp4',              'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', -            'description': 're:The origin of the Internet\'s most viral images$|The Internet\'s visual storytelling community\. Explore, share, and discuss the best visual stories the Internet has to offer\.$', +            'description': 'Imgur: The most awesome images on the Internet.',          },      }, {          'url': 'https://imgur.com/A61SaA1', @@ -29,8 +29,20 @@ class ImgurIE(InfoExtractor):              'id': 'A61SaA1',              'ext': 'mp4',              'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', -            'description': 're:The origin of the Internet\'s most viral images$|The Internet\'s visual storytelling community\. Explore, share, and discuss the best visual stories the Internet has to offer\.$', +            'description': 'Imgur: The most awesome images on the Internet.',          }, +    }, { +        'url': 'https://imgur.com/gallery/YcAQlkx', +        'info_dict': { +            'id': 'YcAQlkx', +            'ext': 'mp4', +            'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', +            'description': 'Imgur: The most awesome images on the Internet.' + +        } +    }, { +        'url': 'http://imgur.com/topic/Funny/N8rOudd', +        'only_matching': True,      }]      def _real_extract(self, url): @@ -100,25 +112,38 @@ class ImgurIE(InfoExtractor):  class ImgurAlbumIE(InfoExtractor): -    _VALID_URL = r'https?://(?:i\.)?imgur\.com/gallery/(?P<id>[a-zA-Z0-9]+)' +    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:a|gallery|topic/[^/]+)/)?(?P<id>[a-zA-Z0-9]{5})(?:[/?#&]+)?$' -    _TEST = { +    _TESTS = [{          'url': 'http://imgur.com/gallery/Q95ko',          'info_dict': {              'id': 'Q95ko',          },          'playlist_count': 25, -    } +    }, { +        'url': 'http://imgur.com/a/j6Orj', +        'only_matching': True, +    }, { +        'url': 'http://imgur.com/topic/Aww/ll5Vk', +        'only_matching': True, +    }]      def _real_extract(self, url):          album_id = self._match_id(url)          album_images = self._download_json(              'http://imgur.com/gallery/%s/album_images/hit.json?all=true' % album_id, -            album_id)['data']['images'] - -        entries = [ -            self.url_result('http://imgur.com/%s' % image['hash']) -            for image in album_images if image.get('hash')] - -        return self.playlist_result(entries, album_id) +            album_id, fatal=False) + +        if album_images: +            data = album_images.get('data') +            if data and isinstance(data, dict): +                images = data.get('images') +                if images and isinstance(images, list): +                    entries = [ +                        self.url_result('http://imgur.com/%s' % image['hash']) +                        for image in images if image.get('hash')] +                    return self.playlist_result(entries, album_id) + +        # Fallback to single video +        return self.url_result('http://imgur.com/%s' % album_id, ImgurIE.ie_key()) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index c158f2064..e5e16ca3b 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -47,7 +47,7 @@ class InstagramIE(InfoExtractor):  class InstagramUserIE(InfoExtractor): -    _VALID_URL = r'https://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])' +    _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])'      IE_DESC = 'Instagram user profile'      IE_NAME = 'instagram:user'      _TEST = { diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py new file mode 100644 index 000000000..cdc095a79 --- /dev/null +++ b/youtube_dl/extractor/jwplatform.py @@ -0,0 +1,71 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class JWPlatformIE(InfoExtractor): +    _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' +    _TEST = { +        'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', +        'md5': 'fa8899fa601eb7c83a64e9d568bdf325', +        'info_dict': { +            'id': 'nPripu9l', +            'ext': 'mov', +            'title': 'Big Buck Bunny Trailer', +            'description': 'Big Buck Bunny is a short animated film by the Blender Institute. It is made using free and open source software.', +            'upload_date': '20081127', +            'timestamp': 1227796140, +        } +    } + +    @staticmethod +    def _extract_url(webpage): +        mobj = re.search( +            r'<script[^>]+?src=["\'](?P<url>(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8})', +            webpage) +        if mobj: +            return mobj.group('url') + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id) +        video_data = json_data['playlist'][0] +        subtitles = {} +        for track in video_data['tracks']: +            if track['kind'] == 'captions': +                subtitles[track['label']] = [{'url': self._proto_relative_url(track['file'])}] + +        formats = [] +        for source in video_data['sources']: +            source_url = self._proto_relative_url(source['file']) +            source_type = source.get('type') or '' +            if source_type == 'application/vnd.apple.mpegurl': +                m3u8_formats = self._extract_m3u8_formats(source_url, video_id, 'mp4', 'm3u8_native', fatal=None) +                if m3u8_formats: +                    formats.extend(m3u8_formats) +            elif source_type.startswith('audio'): +                formats.append({ +                    'url': source_url, +                    'vcodec': 'none', +                }) +            else: +                formats.append({ +                    'url': source_url, +                    'width': int_or_none(source.get('width')), +                    'height': int_or_none(source.get('height')), +                }) +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': video_data['title'], +            'description': video_data.get('description'), +            'thumbnail': self._proto_relative_url(video_data.get('image')), +            'timestamp': int_or_none(video_data.get('pubdate')), +            'subtitles': subtitles, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/makertv.py b/youtube_dl/extractor/makertv.py new file mode 100644 index 000000000..3c34d4604 --- /dev/null +++ b/youtube_dl/extractor/makertv.py @@ -0,0 +1,32 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class MakerTVIE(InfoExtractor): +    _VALID_URL = r'https?://(?:(?:www\.)?maker\.tv/(?:[^/]+/)*video|makerplayer.com/embed/maker)/(?P<id>[a-zA-Z0-9]{12})' +    _TEST = { +        'url': 'http://www.maker.tv/video/Fh3QgymL9gsc', +        'md5': 'ca237a53a8eb20b6dc5bd60564d4ab3e', +        'info_dict': { +            'id': 'Fh3QgymL9gsc', +            'ext': 'mp4', +            'title': 'Maze Runner: The Scorch Trials Official Movie Review', +            'description': 'md5:11ff3362d7ef1d679fdb649f6413975a', +            'upload_date': '20150918', +            'timestamp': 1442549540, +        } +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) +        jwplatform_id = self._search_regex(r'jw_?id="([^"]+)"', webpage, 'jwplatform id') + +        return { +            '_type': 'url_transparent', +            'id': video_id, +            'url': 'jwplatform:%s' % jwplatform_id, +            'ie_key': 'JWPlatform', +        } diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 744e4a09a..97e8ffc97 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -16,7 +16,7 @@ from ..utils import (  class PBSIE(InfoExtractor):      _STATIONS = ( -        (r'(?:video|www)\.pbs\.org', 'PBS: Public Broadcasting Service'),  # http://www.pbs.org/ +        (r'(?:video|www|player)\.pbs\.org', 'PBS: Public Broadcasting Service'),  # http://www.pbs.org/          (r'video\.aptv\.org', 'APT - Alabama Public Television (WBIQ)'),  # http://aptv.org/          (r'video\.gpb\.org', 'GPB/Georgia Public Broadcasting (WGTV)'),  # http://www.gpb.org/          (r'video\.mpbonline\.org', 'Mississippi Public Broadcasting (WMPN)'),  # http://www.mpbonline.org diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 63cc764bb..514e9b433 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -31,9 +31,8 @@ class PeriscopeIE(InfoExtractor):      }]      def _call_api(self, method, value): -        attribute = 'token' if len(value) > 13 else 'broadcast_id'          return self._download_json( -            'https://api.periscope.tv/api/v2/%s?%s=%s' % (method, attribute, value), value) +            'https://api.periscope.tv/api/v2/%s?broadcast_id=%s' % (method, value), value)      def _real_extract(self, url):          token = self._match_id(url) diff --git a/youtube_dl/extractor/soompi.py b/youtube_dl/extractor/soompi.py deleted file mode 100644 index 5da66ca9e..000000000 --- a/youtube_dl/extractor/soompi.py +++ /dev/null @@ -1,146 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -import re - -from .crunchyroll import CrunchyrollIE - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( -    ExtractorError, -    int_or_none, -    remove_start, -    xpath_text, -) - - -class SoompiBaseIE(InfoExtractor): -    def _get_episodes(self, webpage, episode_filter=None): -        episodes = self._parse_json( -            self._search_regex( -                r'VIDEOS\s*=\s*(\[.+?\]);', webpage, 'episodes JSON'), -            None) -        return list(filter(episode_filter, episodes)) - - -class SoompiIE(SoompiBaseIE, CrunchyrollIE): -    IE_NAME = 'soompi' -    _VALID_URL = r'https?://tv\.soompi\.com/(?:en/)?watch/(?P<id>[0-9]+)' -    _TESTS = [{ -        'url': 'http://tv.soompi.com/en/watch/29235', -        'info_dict': { -            'id': '29235', -            'ext': 'mp4', -            'title': 'Episode 1096', -            'description': '2015-05-20' -        }, -        'params': { -            'skip_download': True, -        }, -    }] - -    def _get_episode(self, webpage, video_id): -        return self._get_episodes(webpage, lambda x: x['id'] == video_id)[0] - -    def _get_subtitles(self, config, video_id): -        sub_langs = {} -        for subtitle in config.findall('./{default}preload/subtitles/subtitle'): -            sub_langs[subtitle.attrib['id']] = subtitle.attrib['title'] - -        subtitles = {} -        for s in config.findall('./{default}preload/subtitle'): -            lang_code = sub_langs.get(s.attrib['id']) -            if not lang_code: -                continue -            sub_id = s.get('id') -            data = xpath_text(s, './data', 'data') -            iv = xpath_text(s, './iv', 'iv') -            if not id or not iv or not data: -                continue -            subtitle = self._decrypt_subtitles(data, iv, sub_id).decode('utf-8') -            subtitles[lang_code] = self._extract_subtitles(subtitle) -        return subtitles - -    def _real_extract(self, url): -        video_id = self._match_id(url) - -        try: -            webpage = self._download_webpage( -                url, video_id, 'Downloading episode page') -        except ExtractorError as ee: -            if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: -                webpage = ee.cause.read() -                block_message = self._html_search_regex( -                    r'(?s)<div class="block-message">(.+?)</div>', webpage, -                    'block message', default=None) -                if block_message: -                    raise ExtractorError(block_message, expected=True) -            raise - -        formats = [] -        config = None -        for format_id in re.findall(r'\?quality=([0-9a-zA-Z]+)', webpage): -            config = self._download_xml( -                'http://tv.soompi.com/en/show/_/%s-config.xml?mode=hls&quality=%s' % (video_id, format_id), -                video_id, 'Downloading %s XML' % format_id) -            m3u8_url = xpath_text( -                config, './{default}preload/stream_info/file', -                '%s m3u8 URL' % format_id) -            if not m3u8_url: -                continue -            formats.extend(self._extract_m3u8_formats( -                m3u8_url, video_id, 'mp4', m3u8_id=format_id)) -        self._sort_formats(formats) - -        episode = self._get_episode(webpage, video_id) - -        title = episode['name'] -        description = episode.get('description') -        duration = int_or_none(episode.get('duration')) - -        thumbnails = [{ -            'id': thumbnail_id, -            'url': thumbnail_url, -        } for thumbnail_id, thumbnail_url in episode.get('img_url', {}).items()] - -        subtitles = self.extract_subtitles(config, video_id) - -        return { -            'id': video_id, -            'title': title, -            'description': description, -            'thumbnails': thumbnails, -            'duration': duration, -            'formats': formats, -            'subtitles': subtitles -        } - - -class SoompiShowIE(SoompiBaseIE): -    IE_NAME = 'soompi:show' -    _VALID_URL = r'https?://tv\.soompi\.com/en/shows/(?P<id>[0-9a-zA-Z\-_]+)' -    _TESTS = [{ -        'url': 'http://tv.soompi.com/en/shows/liar-game', -        'info_dict': { -            'id': 'liar-game', -            'title': 'Liar Game', -            'description': 'md5:52c02bce0c1a622a95823591d0589b66', -        }, -        'playlist_count': 14, -    }] - -    def _real_extract(self, url): -        show_id = self._match_id(url) - -        webpage = self._download_webpage( -            url, show_id, 'Downloading show page') - -        title = remove_start(self._og_search_title(webpage), 'SoompiTV | ') -        description = self._og_search_description(webpage) - -        entries = [ -            self.url_result('http://tv.soompi.com/en/watch/%s' % episode['id'], 'Soompi') -            for episode in self._get_episodes(webpage)] - -        return self.playlist_result(entries, show_id, title, description) diff --git a/youtube_dl/extractor/tele13.py b/youtube_dl/extractor/tele13.py new file mode 100644 index 000000000..a363b4d40 --- /dev/null +++ b/youtube_dl/extractor/tele13.py @@ -0,0 +1,81 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..utils import ( +    js_to_json, +    qualities, +    determine_ext, +) + + +class Tele13IE(InfoExtractor): +    _VALID_URL = r'^http://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P<id>[\w-]+)' +    _TESTS = [ +        { +            'url': 'http://www.t13.cl/videos/actualidad/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', +            'md5': '4cb1fa38adcad8fea88487a078831755', +            'info_dict': { +                'id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', +                'ext': 'mp4', +                'title': 'El círculo de hierro de Michelle Bachelet en su regreso a La Moneda', +            }, +            'params': { +                # HTTP Error 404: Not Found +                'skip_download': True, +            }, +        }, +        { +            'url': 'http://www.t13.cl/videos/mundo/tendencias/video-captan-misteriosa-bola-fuego-cielos-bangkok', +            'md5': '867adf6a3b3fef932c68a71d70b70946', +            'info_dict': { +                'id': 'rOoKv2OMpOw', +                'ext': 'mp4', +                'title': 'Shooting star seen on 7-Sep-2015', +                'description': 'md5:7292ff2a34b2f673da77da222ae77e1e', +                'uploader': 'Porjai Jaturongkhakun', +                'upload_date': '20150906', +                'uploader_id': 'UCnLY_3ezwNcDSC_Wc6suZxw', +            }, +            'add_ie': ['Youtube'], +        } +    ] + +    def _real_extract(self, url): +        display_id = self._match_id(url) +        webpage = self._download_webpage(url, display_id) + +        setup_js = self._search_regex(r"(?s)jwplayer\('player-vivo'\).setup\((\{.*?\})\)", webpage, 'setup code') +        sources = self._parse_json(self._search_regex(r'sources\s*:\s*(\[[^\]]+\])', setup_js, 'sources'), display_id, js_to_json) + +        preference = qualities(['Móvil', 'SD', 'HD']) +        formats = [] +        urls = [] +        for f in sources: +            format_url = f['file'] +            if format_url and format_url not in urls: +                ext = determine_ext(format_url) +                if ext == 'm3u8': +                    m3u8_formats = self._extract_m3u8_formats(format_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) +                    if m3u8_formats: +                        formats.extend(m3u8_formats) +                elif YoutubeIE.suitable(format_url): +                    return self.url_result(format_url, 'Youtube') +                else: +                    formats.append({ +                        'url': format_url, +                        'format_id': f.get('label'), +                        'preference': preference(f.get('label')), +                        'ext': ext, +                    }) +                urls.append(format_url) +        self._sort_formats(formats) + +        return { +            'id': display_id, +            'title': self._search_regex(r'title\s*:\s*"([^"]+)"', setup_js, 'title'), +            'description': self._html_search_meta('description', webpage, 'description'), +            'thumbnail': self._search_regex(r'image\s*:\s*"([^"]+)"', setup_js, 'thumbnail', default=None), +            'formats': formats, +        } diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index c1ee1decc..e03e2dbaa 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -5,6 +5,8 @@ from .common import InfoExtractor  from ..utils import (      parse_iso8601,      int_or_none, +    xpath_attr, +    xpath_element,  ) @@ -15,7 +17,7 @@ class TwentyFourVideoIE(InfoExtractor):      _TESTS = [          {              'url': 'http://www.24video.net/video/view/1044982', -            'md5': 'd041af8b5b4246ea466226a0d6693345', +            'md5': 'e09fc0901d9eaeedac872f154931deeb',              'info_dict': {                  'id': '1044982',                  'ext': 'mp4', @@ -64,33 +66,24 @@ class TwentyFourVideoIE(InfoExtractor):              r'<div class="comments-title" id="comments-count">(\d+) комментари',              webpage, 'comment count', fatal=False)) -        formats = [] +        # Sets some cookies +        self._download_xml( +            r'http://www.24video.net/video/xml/%s?mode=init' % video_id, +            video_id, 'Downloading init XML') -        pc_video = self._download_xml( +        video_xml = self._download_xml(              'http://www.24video.net/video/xml/%s?mode=play' % video_id, -            video_id, 'Downloading PC video URL').find('.//video') +            video_id, 'Downloading video XML') -        formats.append({ -            'url': pc_video.attrib['url'], -            'format_id': 'pc', -            'quality': 1, -        }) +        video = xpath_element(video_xml, './/video', 'video', fatal=True) -        like_count = int_or_none(pc_video.get('ratingPlus')) -        dislike_count = int_or_none(pc_video.get('ratingMinus')) -        age_limit = 18 if pc_video.get('adult') == 'true' else 0 +        formats = [{ +            'url': xpath_attr(video, '', 'url', 'video URL', fatal=True), +        }] -        mobile_video = self._download_xml( -            'http://www.24video.net/video/xml/%s' % video_id, -            video_id, 'Downloading mobile video URL').find('.//video') - -        formats.append({ -            'url': mobile_video.attrib['url'], -            'format_id': 'mobile', -            'quality': 0, -        }) - -        self._sort_formats(formats) +        like_count = int_or_none(video.get('ratingPlus')) +        dislike_count = int_or_none(video.get('ratingMinus')) +        age_limit = 18 if video.get('adult') == 'true' else 0          return {              'id': video_id, diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index f38a72fde..811ee197d 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -4,26 +4,48 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from .xstream import XstreamIE  from ..utils import (      ExtractorError,      float_or_none,  ) -class VGTVIE(InfoExtractor): -    IE_DESC = 'VGTV and BTTV' +class VGTVIE(XstreamIE): +    IE_DESC = 'VGTV, BTTV, FTV, Aftenposten and Aftonbladet' + +    _HOST_TO_APPNAME = { +        'vgtv.no': 'vgtv', +        'bt.no/tv': 'bttv', +        'aftenbladet.no/tv': 'satv', +        'fvn.no/fvntv': 'fvntv', +        'aftenposten.no/webtv': 'aptv', +    } + +    _APP_NAME_TO_VENDOR = { +        'vgtv': 'vgtv', +        'bttv': 'bt', +        'satv': 'sa', +        'fvntv': 'fvn', +        'aptv': 'ap', +    } +      _VALID_URL = r'''(?x) -                    (?: -                        vgtv:| -                        http://(?:www\.)? +                    (?:https?://(?:www\.)? +                    (?P<host> +                        %s                      ) -                    (?P<host>vgtv|bt) +                    /                      (?: -                        :| -                        \.no/(?:tv/)?\#!/(?:video|live)/ -                    ) -                    (?P<id>[0-9]+) -                    ''' +                        \#!/(?:video|live)/| +                        embed?.*id= +                    )| +                    (?P<appname> +                        %s +                    ):) +                    (?P<id>\d+) +                    ''' % ('|'.join(_HOST_TO_APPNAME.keys()), '|'.join(_APP_NAME_TO_VENDOR.keys())) +      _TESTS = [          {              # streamType: vod @@ -59,17 +81,18 @@ class VGTVIE(InfoExtractor):                  # m3u8 download                  'skip_download': True,              }, +            'skip': 'Video is no longer available',          },          { -            # streamType: live +            # streamType: wasLive              'url': 'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla',              'info_dict': {                  'id': '113063', -                'ext': 'flv', -                'title': 're:^DIREKTE: V75 fra Solvalla [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', +                'ext': 'mp4', +                'title': 'V75 fra Solvalla 30.05.15',                  'description': 'md5:b3743425765355855f88e096acc93231',                  'thumbnail': 're:^https?://.*\.jpg', -                'duration': 0, +                'duration': 25966,                  'timestamp': 1432975582,                  'upload_date': '20150530',                  'view_count': int, @@ -80,6 +103,20 @@ class VGTVIE(InfoExtractor):              },          },          { +            'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', +            'md5': 'fd828cd29774a729bf4d4425fe192972', +            'info_dict': { +                'id': '21039', +                'ext': 'mov', +                'title': 'TRAILER: «SWEATSHOP» - I can´t take any more', +                'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238', +                'duration': 66, +                'timestamp': 1417002452, +                'upload_date': '20141126', +                'view_count': int, +            } +        }, +        {              'url': 'http://www.bt.no/tv/#!/video/100250/norling-dette-er-forskjellen-paa-1-divisjon-og-eliteserien',              'only_matching': True,          }, @@ -89,21 +126,27 @@ class VGTVIE(InfoExtractor):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id')          host = mobj.group('host') - -        HOST_WEBSITES = { -            'vgtv': 'vgtv', -            'bt': 'bttv', -        } +        appname = self._HOST_TO_APPNAME[host] if host else mobj.group('appname') +        vendor = self._APP_NAME_TO_VENDOR[appname]          data = self._download_json(              'http://svp.vg.no/svp/api/v1/%s/assets/%s?appName=%s-website' -            % (host, video_id, HOST_WEBSITES[host]), +            % (vendor, video_id, appname),              video_id, 'Downloading media JSON')          if data.get('status') == 'inactive':              raise ExtractorError(                  'Video %s is no longer available' % video_id, expected=True) +        info = { +            'formats': [], +        } +        if len(video_id) == 5: +            if appname == 'bttv': +                info = self._extract_video_info('btno', video_id) +            elif appname == 'aptv': +                info = self._extract_video_info('ap', video_id) +          streams = data['streamUrls']          stream_type = data.get('streamType') @@ -111,48 +154,53 @@ class VGTVIE(InfoExtractor):          hls_url = streams.get('hls')          if hls_url: -            formats.extend(self._extract_m3u8_formats( -                hls_url, video_id, 'mp4', m3u8_id='hls')) +            m3u8_formats = self._extract_m3u8_formats( +                hls_url, video_id, 'mp4', m3u8_id='hls', fatal=False) +            if m3u8_formats: +                formats.extend(m3u8_formats)          hds_url = streams.get('hds')          # wasLive hds are always 404          if hds_url and stream_type != 'wasLive': -            formats.extend(self._extract_f4m_formats( -                hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', -                video_id, f4m_id='hds')) +            f4m_formats = self._extract_f4m_formats( +                hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', video_id, f4m_id='hds', fatal=False) +            if f4m_formats: +                formats.extend(f4m_formats) +        mp4_urls = streams.get('pseudostreaming') or []          mp4_url = streams.get('mp4')          if mp4_url: -            _url = hls_url or hds_url -            MP4_URL_TEMPLATE = '%s/%%s.%s' % (mp4_url.rpartition('/')[0], mp4_url.rpartition('.')[-1]) -            for mp4_format in _url.split(','): -                m = re.search('(?P<width>\d+)_(?P<height>\d+)_(?P<vbr>\d+)', mp4_format) -                if not m: -                    continue -                width = int(m.group('width')) -                height = int(m.group('height')) -                vbr = int(m.group('vbr')) -                formats.append({ -                    'url': MP4_URL_TEMPLATE % mp4_format, -                    'format_id': 'mp4-%s' % vbr, -                    'width': width, -                    'height': height, -                    'vbr': vbr, -                    'preference': 1, +            mp4_urls.append(mp4_url) +        for mp4_url in mp4_urls: +            format_info = { +                'url': mp4_url, +            } +            mobj = re.search('(\d+)_(\d+)_(\d+)', mp4_url) +            if mobj: +                tbr = int(mobj.group(3)) +                format_info.update({ +                    'width': int(mobj.group(1)), +                    'height': int(mobj.group(2)), +                    'tbr': tbr, +                    'format_id': 'mp4-%s' % tbr,                  }) -        self._sort_formats(formats) +            formats.append(format_info) + +        info['formats'].extend(formats) + +        self._sort_formats(info['formats']) -        return { +        info.update({              'id': video_id, -            'title': self._live_title(data['title']), +            'title': self._live_title(data['title']) if stream_type == 'live' else data['title'],              'description': data['description'],              'thumbnail': data['images']['main'] + '?t[]=900x506q80',              'timestamp': data['published'],              'duration': float_or_none(data['duration'], 1000),              'view_count': data['displays'], -            'formats': formats,              'is_live': True if stream_type == 'live' else False, -        } +        }) +        return info  class BTArticleIE(InfoExtractor): @@ -161,7 +209,7 @@ class BTArticleIE(InfoExtractor):      _VALID_URL = 'http://(?:www\.)?bt\.no/(?:[^/]+/)+(?P<id>[^/]+)-\d+\.html'      _TEST = {          'url': 'http://www.bt.no/nyheter/lokalt/Kjemper-for-internatet-1788214.html', -        'md5': 'd055e8ee918ef2844745fcfd1a4175fb', +        'md5': '2acbe8ad129b3469d5ae51b1158878df',          'info_dict': {              'id': '23199',              'ext': 'mp4', @@ -178,15 +226,15 @@ class BTArticleIE(InfoExtractor):      def _real_extract(self, url):          webpage = self._download_webpage(url, self._match_id(url))          video_id = self._search_regex( -            r'SVP\.Player\.load\(\s*(\d+)', webpage, 'video id') -        return self.url_result('vgtv:bt:%s' % video_id, 'VGTV') +            r'<video[^>]+data-id="(\d+)"', webpage, 'video id') +        return self.url_result('bttv:%s' % video_id, 'VGTV')  class BTVestlendingenIE(InfoExtractor):      IE_NAME = 'bt:vestlendingen'      IE_DESC = 'Bergens Tidende - Vestlendingen'      _VALID_URL = 'http://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P<id>\d+)' -    _TEST = { +    _TESTS = [{          'url': 'http://www.bt.no/spesial/vestlendingen/#!/86588',          'md5': 'd7d17e3337dc80de6d3a540aefbe441b',          'info_dict': { @@ -197,7 +245,19 @@ class BTVestlendingenIE(InfoExtractor):              'timestamp': 1430473209,              'upload_date': '20150501',          }, -    } +        'skip': '404 Error', +    }, { +        'url': 'http://www.bt.no/spesial/vestlendingen/#!/86255', +        'md5': 'a2893f8632e96389f4bdf36aa9463ceb', +        'info_dict': { +            'id': '86255', +            'ext': 'mov', +            'title': 'Du må tåle å fryse og være sulten', +            'description': 'md5:b8046f4d022d5830ddab04865791d063', +            'upload_date': '20150321', +            'timestamp': 1426942023, +        }, +    }]      def _real_extract(self, url): -        return self.url_result('xstream:btno:%s' % self._match_id(url), 'Xstream') +        return self.url_result('bttv:%s' % self._match_id(url), 'VGTV') diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index a63c23617..ca3f20a3d 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -30,6 +30,12 @@ class VikiBaseIE(InfoExtractor):      _token = None +    _ERRORS = { +        'geo': 'Sorry, this content is not available in your region.', +        'upcoming': 'Sorry, this content is not yet available.', +        # 'paywall': 'paywall', +    } +      def _prepare_call(self, path, timestamp=None, post_data=None):          path += '?' if '?' not in path else '&'          if not timestamp: @@ -67,6 +73,12 @@ class VikiBaseIE(InfoExtractor):              '%s returned error: %s' % (self.IE_NAME, error),              expected=True) +    def _check_errors(self, data): +        for reason, status in data.get('blocking', {}).items(): +            if status and reason in self._ERRORS: +                raise ExtractorError('%s said: %s' % ( +                    self.IE_NAME, self._ERRORS[reason]), expected=True) +      def _real_initialize(self):          self._login() @@ -193,6 +205,7 @@ class VikiIE(VikiBaseIE):              'timestamp': 1321985454,              'description': 'md5:44b1e46619df3a072294645c770cef36',              'title': 'Love In Magic', +            'age_limit': 13,          },      }] @@ -202,6 +215,8 @@ class VikiIE(VikiBaseIE):          video = self._call_api(              'videos/%s.json' % video_id, video_id, 'Downloading video JSON') +        self._check_errors(video) +          title = self.dict_selection(video.get('titles', {}), 'en')          if not title:              title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id @@ -262,8 +277,11 @@ class VikiIE(VikiBaseIE):                  r'^(\d+)[pP]$', format_id, 'height', default=None))              for protocol, format_dict in stream_dict.items():                  if format_id == 'm3u8': -                    formats = self._extract_m3u8_formats( -                        format_dict['url'], video_id, 'mp4', m3u8_id='m3u8-%s' % protocol) +                    m3u8_formats = self._extract_m3u8_formats( +                        format_dict['url'], video_id, 'mp4', 'm3u8_native', +                        m3u8_id='m3u8-%s' % protocol, fatal=None) +                    if m3u8_formats: +                        formats.extend(m3u8_formats)                  else:                      formats.append({                          'url': format_dict['url'], @@ -315,6 +333,8 @@ class VikiChannelIE(VikiBaseIE):              'containers/%s.json' % channel_id, channel_id,              'Downloading channel JSON') +        self._check_errors(channel) +          title = self.dict_selection(channel['titles'], 'en')          description = self.dict_selection(channel['descriptions'], 'en') diff --git a/youtube_dl/extractor/xstream.py b/youtube_dl/extractor/xstream.py index 71584c291..76c91bd92 100644 --- a/youtube_dl/extractor/xstream.py +++ b/youtube_dl/extractor/xstream.py @@ -42,11 +42,7 @@ class XstreamIE(InfoExtractor):          'only_matching': True,      }] -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        partner_id = mobj.group('partner_id') -        video_id = mobj.group('id') - +    def _extract_video_info(self, partner_id, video_id):          data = self._download_xml(              'http://frontend.xstream.dk/%s/feed/video/?platform=web&id=%s'              % (partner_id, video_id), @@ -97,6 +93,7 @@ class XstreamIE(InfoExtractor):              formats.append({                  'url': link.get('href'),                  'format_id': link.get('rel'), +                'preference': 1,              })          thumbnails = [{ @@ -113,3 +110,10 @@ class XstreamIE(InfoExtractor):              'formats': formats,              'thumbnails': thumbnails,          } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        partner_id = mobj.group('partner_id') +        video_id = mobj.group('id') + +        return self._extract_video_info(partner_id, video_id)  | 
