Diffstat (limited to 'youtube_dl/extractor')
 -rw-r--r--  youtube_dl/extractor/__init__.py |   2
 -rw-r--r--  youtube_dl/extractor/common.py   |  29
 -rw-r--r--  youtube_dl/extractor/esri.py     |  74
 -rw-r--r--  youtube_dl/extractor/lynda.py    |  10
 -rw-r--r--  youtube_dl/extractor/moniker.py  |  18
 -rw-r--r--  youtube_dl/extractor/shahid.py   | 107
 -rw-r--r--  youtube_dl/extractor/youtube.py  |  53
7 files changed, 244 insertions, 49 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 5307240f8..fa9acc923 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -158,6 +158,7 @@ from .eporner import EpornerIE
 from .eroprofile import EroProfileIE
 from .escapist import EscapistIE
 from .espn import ESPNIE
+from .esri import EsriVideoIE
 from .everyonesmixtape import EveryonesMixtapeIE
 from .exfm import ExfmIE
 from .expotv import ExpoTVIE
@@ -523,6 +524,7 @@ from .senateisvp import SenateISVPIE
 from .servingsys import ServingSysIE
 from .sexu import SexuIE
 from .sexykarma import SexyKarmaIE
+from .shahid import ShahidIE
 from .shared import SharedIE
 from .sharesix import ShareSixIE
 from .sina import SinaIE
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 5982055be..65835d257 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -15,6 +15,7 @@ import xml.etree.ElementTree
 from ..compat import (
     compat_cookiejar,
     compat_cookies,
+    compat_getpass,
     compat_HTTPError,
     compat_http_client,
     compat_urllib_error,
@@ -610,7 +611,7 @@ class InfoExtractor(object):
 
         return (username, password)
 
-    def _get_tfa_info(self):
+    def _get_tfa_info(self, note='two-factor verification code'):
         """
         Get the two-factor authentication info
         TODO - asking the user will be required for sms/phone verify
@@ -624,7 +625,7 @@ class InfoExtractor(object):
         if downloader_params.get('twofactor', None) is not None:
             return downloader_params['twofactor']
 
-        return None
+        return compat_getpass('Type %s and press [Return]: ' % note)
 
     # Helper functions for extracting OpenGraph info
     @staticmethod
@@ -640,7 +641,7 @@ class InfoExtractor(object):
     @staticmethod
     def _meta_regex(prop):
         return r'''(?isx)<meta
-                    (?=[^>]+(?:itemprop|name|property|id)=(["\']?)%s\1)
+                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 
     def _og_search_property(self, prop, html, name=None, **kargs):
@@ -724,16 +725,18 @@ class InfoExtractor(object):
 
     @staticmethod
     def _hidden_inputs(html):
-        return dict([
-            (input.group('name'), input.group('value')) for input in re.finditer(
-                r'''(?x)
-                    <input\s+
-                        type=(?P<q_hidden>["\'])hidden(?P=q_hidden)\s+
-                        name=(?P<q_name>["\'])(?P<name>.+?)(?P=q_name)\s+
-                        (?:id=(?P<q_id>["\']).+?(?P=q_id)\s+)?
-                        value=(?P<q_value>["\'])(?P<value>.*?)(?P=q_value)
-                ''', html)
-        ])
+        hidden_inputs = {}
+        for input in re.findall(r'<input([^>]+)>', html):
+            if not re.search(r'type=(["\'])hidden\1', input):
+                continue
+            name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
+            if not name:
+                continue
+            value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
+            if not value:
+                continue
+            hidden_inputs[name.group('value')] = value.group('value')
+        return hidden_inputs
 
     def _form_hidden_inputs(self, form_id, html):
         form = self._search_regex(
diff --git a/youtube_dl/extractor/esri.py b/youtube_dl/extractor/esri.py
new file mode 100644
index 000000000..bf5d2019f
--- /dev/null
+++ b/youtube_dl/extractor/esri.py
@@ -0,0 +1,74 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    int_or_none,
+    parse_filesize,
+    unified_strdate,
+)
+
+
+class EsriVideoIE(InfoExtractor):
+    _VALID_URL = r'https?://video\.esri\.com/watch/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'https://video.esri.com/watch/1124/arcgis-online-_dash_-developing-applications',
+        'md5': 'd4aaf1408b221f1b38227a9bbaeb95bc',
+        'info_dict': {
+            'id': '1124',
+            'ext': 'mp4',
+            'title': 'ArcGIS Online - Developing Applications',
+            'description': 'Jeremy Bartley demonstrates how to develop applications with ArcGIS Online.',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 185,
+            'upload_date': '20120419',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        formats = []
+        for width, height, content in re.findall(
+                r'(?s)<li><strong>(\d+)x(\d+):</strong>(.+?)</li>', webpage):
+            for video_url, ext, filesize in re.findall(
+                    r'<a[^>]+href="([^"]+)">([^<]+) \(([^<]+)\)</a>', content):
+                formats.append({
+                    'url': compat_urlparse.urljoin(url, video_url),
+                    'ext': ext.lower(),
+                    'format_id': '%s-%s' % (ext.lower(), height),
+                    'width': int(width),
+                    'height': int(height),
+                    'filesize_approx': parse_filesize(filesize),
+                })
+        self._sort_formats(formats)
+
+        title = self._html_search_meta('title', webpage, 'title')
+        description = self._html_search_meta(
+            'description', webpage, 'description', fatal=False)
+
+        thumbnail = self._html_search_meta('thumbnail', webpage, 'thumbnail', fatal=False)
+        if thumbnail:
+            thumbnail = re.sub(r'_[st]\.jpg$', '_x.jpg', thumbnail)
+
+        duration = int_or_none(self._search_regex(
+            [r'var\s+videoSeconds\s*=\s*(\d+)', r"'duration'\s*:\s*(\d+)"],
+            webpage, 'duration', fatal=False))
+
+        upload_date = unified_strdate(self._html_search_meta(
+            'last-modified', webpage, 'upload date', fatal=None))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'upload_date': upload_date,
+            'formats': formats
+        }
diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py
index deead220a..5b9157ed4 100644
--- a/youtube_dl/extractor/lynda.py
+++ b/youtube_dl/extractor/lynda.py
@@ -11,6 +11,7 @@ from ..compat import (
 )
 from ..utils import (
     ExtractorError,
+    clean_html,
     int_or_none,
 )
 
@@ -70,6 +71,15 @@ class LyndaBaseIE(InfoExtractor):
                     'Confirming log in and log out from another device')
 
         if all(not re.search(p, login_page) for p in ('isLoggedIn\s*:\s*true', r'logout\.aspx', r'>Log out<')):
+            if 'login error' in login_page:
+                mobj = re.search(
+                    r'(?s)<h1[^>]+class="topmost">(?P<title>[^<]+)</h1>\s*<div>(?P<description>.+?)</div>',
+                    login_page)
+                if mobj:
+                    raise ExtractorError(
+                        'lynda returned error: %s - %s'
+                        % (mobj.group('title'), clean_html(mobj.group('description'))),
+                        expected=True)
             raise ExtractorError('Unable to log in')
diff --git a/youtube_dl/extractor/moniker.py b/youtube_dl/extractor/moniker.py
index 88dcd4f73..69e4bcd1a 100644
--- a/youtube_dl/extractor/moniker.py
+++ b/youtube_dl/extractor/moniker.py
@@ -9,7 +9,10 @@ from ..compat import (
     compat_urllib_parse,
     compat_urllib_request,
 )
-from ..utils import ExtractorError
+from ..utils import (
+    ExtractorError,
+    remove_start,
+)
 
 
 class MonikerIE(InfoExtractor):
@@ -25,6 +28,14 @@ class MonikerIE(InfoExtractor):
             'title': 'youtube-dl test video',
         },
     }, {
+        'url': 'http://allmyvideos.net/embed-jih3nce3x6wn',
+        'md5': '710883dee1bfc370ecf9fa6a89307c88',
+        'info_dict': {
+            'id': 'jih3nce3x6wn',
+            'ext': 'mp4',
+            'title': 'youtube-dl test video',
+        },
+    }, {
         'url': 'http://vidspot.net/l2ngsmhs8ci5',
         'md5': '710883dee1bfc370ecf9fa6a89307c88',
         'info_dict': {
@@ -38,7 +49,10 @@ class MonikerIE(InfoExtractor):
     }]
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        orig_video_id = self._match_id(url)
+        video_id = remove_start(orig_video_id, 'embed-')
+        url = url.replace(orig_video_id, video_id)
+        assert re.match(self._VALID_URL, url) is not None
         orig_webpage = self._download_webpage(url, video_id)
 
         if '>File Not Found<' in orig_webpage:
diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py
new file mode 100644
index 000000000..6e9903d5e
--- /dev/null
+++ b/youtube_dl/extractor/shahid.py
@@ -0,0 +1,107 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class ShahidIE(InfoExtractor):
+    _VALID_URL = r'https?://shahid\.mbc\.net/ar/episode/(?P<id>\d+)/?'
+    _TESTS = [{
+        'url': 'https://shahid.mbc.net/ar/episode/90574/%D8%A7%D9%84%D9%85%D9%84%D9%83-%D8%B9%D8%A8%D8%AF%D8%A7%D9%84%D9%84%D9%87-%D8%A7%D9%84%D8%A5%D9%86%D8%B3%D8%A7%D9%86-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-3.html',
+        'info_dict': {
+            'id': '90574',
+            'ext': 'm3u8',
+            'title': 'الملك عبدالله الإنسان الموسم 1 كليب 3',
+            'description': 'الفيلم الوثائقي - الملك عبد الله الإنسان',
+            'duration': 2972,
+            'timestamp': 1422057420,
+            'upload_date': '20150123',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }, {
+        # shahid plus subscriber only
+        'url': 'https://shahid.mbc.net/ar/episode/90511/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1.html',
+        'only_matching': True
+    }]
+
+    def _handle_error(self, response):
+        if not isinstance(response, dict):
+            return
+        error = response.get('error')
+        if error:
+            raise ExtractorError(
+                '%s returned error: %s' % (self.IE_NAME, '\n'.join(error.values())),
+                expected=True)
+
+    def _download_json(self, url, video_id, note='Downloading JSON metadata'):
+        response = super(ShahidIE, self)._download_json(url, video_id, note)['data']
+        self._handle_error(response)
+        return response
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        api_vars = {
+            'id': video_id,
+            'type': 'player',
+            'url': 'http://api.shahid.net/api/v1_1',
+            'playerType': 'episode',
+        }
+
+        flashvars = self._search_regex(
+            r'var\s+flashvars\s*=\s*({[^}]+})', webpage, 'flashvars', default=None)
+        if flashvars:
+            for key in api_vars.keys():
+                value = self._search_regex(
+                    r'\b%s\s*:\s*(?P<q>["\'])(?P<value>.+?)(?P=q)' % key,
+                    flashvars, 'type', default=None, group='value')
+                if value:
+                    api_vars[key] = value
+
+        player = self._download_json(
+            'https://shahid.mbc.net/arContent/getPlayerContent-param-.id-%s.type-%s.html'
+            % (video_id, api_vars['type']), video_id, 'Downloading player JSON')
+
+        formats = self._extract_m3u8_formats(player['url'], video_id, 'mp4')
+
+        video = self._download_json(
+            '%s/%s/%s?%s' % (
+                api_vars['url'], api_vars['playerType'], api_vars['id'],
+                compat_urllib_parse.urlencode({
+                    'apiKey': 'sh@hid0nlin3',
+                    'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=',
+                }).encode('utf-8')),
+            video_id, 'Downloading video JSON')
+
+        video = video[api_vars['playerType']]
+
+        title = video['title']
+        description = video.get('description')
+        thumbnail = video.get('thumbnailUrl')
+        duration = int_or_none(video.get('duration'))
+        timestamp = parse_iso8601(video.get('referenceDate'))
+        categories = [
+            category['name']
+            for category in video.get('genres', []) if 'name' in category]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'timestamp': timestamp,
+            'categories': categories,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index facd837ad..887c46d95 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -33,6 +33,7 @@ from ..utils import (
     int_or_none,
     orderedSet,
     parse_duration,
+    remove_start,
     smuggle_url,
     str_to_int,
     unescapeHTML,
@@ -46,7 +47,7 @@ from ..utils import (
 class YoutubeBaseInfoExtractor(InfoExtractor):
     """Provide base functions for Youtube extractors"""
     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
-    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
+    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
     _NETRC_MACHINE = 'youtube'
     # If True it will raise an error if no login info is provided
     _LOGIN_REQUIRED = False
@@ -128,40 +129,24 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
         # Two-Factor
         # TODO add SMS and phone call support - these require making a request and then prompting the user
 
-        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
-            tfa_code = self._get_tfa_info()
+        if re.search(r'(?i)<form[^>]* id="challenge"', login_results) is not None:
+            tfa_code = self._get_tfa_info('2-step verification code')
 
-            if tfa_code is None:
-                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
-                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
+            if not tfa_code:
+                self._downloader.report_warning(
+                    'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
+                    '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                 return False
 
-            # Unlike the first login form, secTok and timeStmp are both required for the TFA form
-
-            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
-            if match is None:
-                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
-            secTok = match.group(1)
-            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
-            if match is None:
-                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
-            timeStmp = match.group(1)
-
-            tfa_form_strs = {
-                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
-                'smsToken': '',
-                'smsUserPin': tfa_code,
-                'smsVerifyPin': 'Verify',
-
-                'PersistentCookie': 'yes',
-                'checkConnection': '',
-                'checkedDomains': 'youtube',
-                'pstMsg': '1',
-                'secTok': secTok,
-                'timeStmp': timeStmp,
-                'service': 'youtube',
-                'hl': 'en_US',
-            }
+            tfa_code = remove_start(tfa_code, 'G-')
+
+            tfa_form_strs = self._form_hidden_inputs('challenge', login_results)
+
+            tfa_form_strs.update({
+                'Pin': tfa_code,
+                'TrustDevice': 'on',
+            })
+
             tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
 
             tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')
@@ -173,8 +158,8 @@
             if tfa_results is False:
                 return False
 
-            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
-                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
+            if re.search(r'(?i)<form[^>]* id="challenge"', tfa_results) is not None:
+                self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')
                 return False
             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                 self._downloader.report_warning('unable to log in - did the page structure change?')
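Note on the youtube.py change above: instead of hand-building the old SecondFactor payload, the login code now re-submits whatever hidden fields Google's "challenge" form carries (via the new _form_hidden_inputs helper) and only adds the TOTP pin, with a leading "G-" stripped, plus a TrustDevice flag. A minimal standalone sketch of that form preparation follows; the hidden-field names and values are invented for illustration and are not taken from the commit.

# Sketch only: mirrors the reworked two-factor form handling in youtube.py above.
# The hidden fields below are hypothetical stand-ins for the real "challenge" form;
# remove_start is a local copy of youtube_dl.utils.remove_start.
try:
    from urllib.parse import urlencode  # Python 3
except ImportError:
    from urllib import urlencode  # Python 2 (what compat_urllib_parse wraps)


def remove_start(s, start):
    return s[len(start):] if s.startswith(start) else s


tfa_form_strs = {'challengeId': '5', 'gxf': 'token123'}  # pretend hidden inputs
tfa_code = remove_start('G-123456', 'G-')  # user-supplied TOTP code
tfa_form_strs.update({
    'Pin': tfa_code,
    'TrustDevice': 'on',
})
tfa_data = urlencode(tfa_form_strs).encode('ascii')
print(tfa_data)  # e.g. b'challengeId=5&gxf=token123&Pin=123456&TrustDevice=on'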

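That flow leans on the rewritten _hidden_inputs helper in common.py, which now scans each <input> tag and checks its attributes one by one, so hidden fields are found regardless of attribute order. A self-contained sketch of the same idea; the sample HTML is made up.

# Standalone sketch of the new _hidden_inputs logic from common.py above;
# attribute order inside the tag no longer matters. The sample HTML is invented.
import re


def hidden_inputs(html):
    found = {}
    for tag in re.findall(r'<input([^>]+)>', html):
        if not re.search(r'type=(["\'])hidden\1', tag):
            continue
        name = re.search(r'name=(["\'])(?P<value>.+?)\1', tag)
        value = re.search(r'value=(["\'])(?P<value>.*?)\1', tag)
        if name and value:
            found[name.group('value')] = value.group('value')
    return found


sample = '<input value="abc123" type="hidden" name="secTok"><input type="text" name="user">'
print(hidden_inputs(sample))  # {'secTok': 'abc123'}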