diff options
Diffstat (limited to 'youtube_dl/extractor')
| -rw-r--r-- | youtube_dl/extractor/afreecatv.py | 26 | ||||
| -rwxr-xr-x | youtube_dl/extractor/cda.py | 65 | ||||
| -rw-r--r-- | youtube_dl/extractor/common.py | 6 | ||||
| -rw-r--r-- | youtube_dl/extractor/nrk.py | 44 | ||||
| -rw-r--r-- | youtube_dl/extractor/plays.py | 34 | ||||
| -rw-r--r-- | youtube_dl/extractor/vlive.py | 19 | 
6 files changed, 145 insertions, 49 deletions
| diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index 518c61f67..75b366993 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -11,6 +11,7 @@ from ..compat import (  from ..utils import (      ExtractorError,      int_or_none, +    update_url_query,      xpath_element,      xpath_text,  ) @@ -18,12 +19,18 @@ from ..utils import (  class AfreecaTVIE(InfoExtractor):      IE_DESC = 'afreecatv.com' -    _VALID_URL = r'''(?x)^ -        https?://(?:(live|afbbs|www)\.)?afreeca(?:tv)?\.com(?::\d+)? -        (?: -            /app/(?:index|read_ucc_bbs)\.cgi| -            /player/[Pp]layer\.(?:swf|html)) -        \?.*?\bnTitleNo=(?P<id>\d+)''' +    _VALID_URL = r'''(?x) +                    https?:// +                        (?: +                            (?:(?:live|afbbs|www)\.)?afreeca(?:tv)?\.com(?::\d+)? +                            (?: +                                /app/(?:index|read_ucc_bbs)\.cgi| +                                /player/[Pp]layer\.(?:swf|html) +                            )\?.*?\bnTitleNo=| +                            vod\.afreecatv\.com/PLAYER/STATION/ +                        ) +                        (?P<id>\d+) +                    '''      _TESTS = [{          'url': 'http://live.afreecatv.com:8079/app/index.cgi?szType=read_ucc_bbs&szBjId=dailyapril&nStationNo=16711924&nBbsNo=18605867&nTitleNo=36164052&szSkin=',          'md5': 'f72c89fe7ecc14c1b5ce506c4996046e', @@ -66,6 +73,9 @@ class AfreecaTVIE(InfoExtractor):      }, {          'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652',          'only_matching': True, +    }, { +        'url': 'http://vod.afreecatv.com/PLAYER/STATION/15055030', +        'only_matching': True,      }]      @staticmethod @@ -83,7 +93,9 @@ class AfreecaTVIE(InfoExtractor):          info_url = compat_urlparse.urlunparse(parsed_url._replace(              netloc='afbbs.afreecatv.com:8080',              path='/api/video/get_video_info.php')) -        video_xml = self._download_xml(info_url, video_id) + +        video_xml = self._download_xml( +            update_url_query(info_url, {'nTitleNo': video_id}), video_id)          if xpath_element(video_xml, './track/video/file') is None:              raise ExtractorError('Specified AfreecaTV video does not exist', diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index 8af318703..e00bdaf66 100755 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -5,14 +5,16 @@ import re  from .common import InfoExtractor  from ..utils import ( -    decode_packed_codes,      ExtractorError, -    parse_duration +    float_or_none, +    int_or_none, +    parse_duration,  )  class CDAIE(InfoExtractor):      _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)' +    _BASE_URL = 'http://www.cda.pl/'      _TESTS = [{          'url': 'http://www.cda.pl/video/5749950c',          'md5': '6f844bf51b15f31fae165365707ae970', @@ -21,6 +23,9 @@ class CDAIE(InfoExtractor):              'ext': 'mp4',              'height': 720,              'title': 'Oto dlaczego przed zakrętem należy zwolnić.', +            'description': 'md5:269ccd135d550da90d1662651fcb9772', +            'thumbnail': 're:^https?://.*\.jpg$', +            'average_rating': float,              'duration': 39          }      }, { @@ -30,6 +35,11 @@ class CDAIE(InfoExtractor):              'id': '57413289',              'ext': 'mp4',              'title': 'Lądowanie na lotnisku na Maderze', +            'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a', +            'thumbnail': 're:^https?://.*\.jpg$', +            'uploader': 'crash404', +            'view_count': int, +            'average_rating': float,              'duration': 137          }      }, { @@ -39,31 +49,55 @@ class CDAIE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url) -        webpage = self._download_webpage('http://ebd.cda.pl/0x0/' + video_id, video_id) +        self._set_cookie('cda.pl', 'cda.player', 'html5') +        webpage = self._download_webpage( +            self._BASE_URL + '/video/' + video_id, video_id)          if 'Ten film jest dostępny dla użytkowników premium' in webpage:              raise ExtractorError('This video is only available for premium users.', expected=True) -        title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title') -          formats = [] +        uploader = self._search_regex(r'''(?x) +            <(span|meta)[^>]+itemprop=(["\'])author\2[^>]*> +            (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*? +            <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3> +        ''', webpage, 'uploader', default=None, group='uploader') +        view_count = self._search_regex( +            r'Odsłony:(?:\s| )*([0-9]+)', webpage, +            'view_count', default=None) +        average_rating = self._search_regex( +            r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)', +            webpage, 'rating', fatal=False, group='rating_value') +          info_dict = {              'id': video_id, -            'title': title, +            'title': self._og_search_title(webpage), +            'description': self._og_search_description(webpage), +            'uploader': uploader, +            'view_count': int_or_none(view_count), +            'average_rating': float_or_none(average_rating), +            'thumbnail': self._og_search_thumbnail(webpage),              'formats': formats,              'duration': None,          }          def extract_format(page, version): -            unpacked = decode_packed_codes(page) -            format_url = self._search_regex( -                r"(?:file|url)\s*:\s*(\\?[\"'])(?P<url>http.+?)\1", unpacked, -                '%s url' % version, fatal=False, group='url') -            if not format_url: +            json_str = self._search_regex( +                r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page, +                '%s player_json' % version, fatal=False, group='player_data') +            if not json_str: +                return +            player_data = self._parse_json( +                json_str, '%s player_data' % version, fatal=False) +            if not player_data: +                return +            video = player_data.get('video') +            if not video or 'file' not in video: +                self.report_warning('Unable to extract %s version information' % version)                  return              f = { -                'url': format_url, +                'url': video['file'],              }              m = re.search(                  r'<a[^>]+data-quality="(?P<format_id>[^"]+)"[^>]+href="[^"]+"[^>]+class="[^"]*quality-btn-active[^"]*">(?P<height>[0-9]+)p', @@ -75,9 +109,7 @@ class CDAIE(InfoExtractor):                  })              info_dict['formats'].append(f)              if not info_dict['duration']: -                info_dict['duration'] = parse_duration(self._search_regex( -                    r"duration\s*:\s*(\\?[\"'])(?P<duration>.+?)\1", -                    unpacked, 'duration', fatal=False, group='duration')) +                info_dict['duration'] = parse_duration(video.get('duration'))          extract_format(webpage, 'default') @@ -85,7 +117,8 @@ class CDAIE(InfoExtractor):                  r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',                  webpage):              webpage = self._download_webpage( -                href, video_id, 'Downloading %s version information' % resolution, fatal=False) +                self._BASE_URL + href, video_id, +                'Downloading %s version information' % resolution, fatal=False)              if not webpage:                  # Manually report warning because empty page is returned when                  # invalid version is requested. diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5f4c984a9..05c51fac9 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -886,7 +886,7 @@ class InfoExtractor(object):                          'url': e.get('contentUrl'),                          'title': unescapeHTML(e.get('name')),                          'description': unescapeHTML(e.get('description')), -                        'thumbnail': e.get('thumbnailUrl'), +                        'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),                          'duration': parse_duration(e.get('duration')),                          'timestamp': unified_timestamp(e.get('uploadDate')),                          'filesize': float_or_none(e.get('contentSize')), @@ -1703,7 +1703,7 @@ class InfoExtractor(object):                                  representation_ms_info['fragments'] = [{                                      'url': media_template % {                                          'Number': segment_number, -                                        'Bandwidth': representation_attrib.get('bandwidth'), +                                        'Bandwidth': int_or_none(representation_attrib.get('bandwidth')),                                      },                                      'duration': segment_duration,                                  } for segment_number in range( @@ -1721,7 +1721,7 @@ class InfoExtractor(object):                                  def add_segment_url():                                      segment_url = media_template % {                                          'Time': segment_time, -                                        'Bandwidth': representation_attrib.get('bandwidth'), +                                        'Bandwidth': int_or_none(representation_attrib.get('bandwidth')),                                          'Number': segment_number,                                      }                                      representation_ms_info['fragments'].append({ diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 3700b7ab2..c89aac63e 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -1,6 +1,7 @@  # coding: utf-8  from __future__ import unicode_literals +import random  import re  from .common import InfoExtractor @@ -14,6 +15,25 @@ from ..utils import (  class NRKBaseIE(InfoExtractor): +    _faked_ip = None + +    def _download_webpage_handle(self, *args, **kwargs): +        # NRK checks X-Forwarded-For HTTP header in order to figure out the +        # origin of the client behind proxy. This allows to bypass geo +        # restriction by faking this header's value to some Norway IP. +        # We will do so once we encounter any geo restriction error. +        if self._faked_ip: +            # NB: str is intentional +            kwargs.setdefault(str('headers'), {})['X-Forwarded-For'] = self._faked_ip +        return super(NRKBaseIE, self)._download_webpage_handle(*args, **kwargs) + +    def _fake_ip(self): +        # Use fake IP from 37.191.128.0/17 in order to workaround geo +        # restriction +        def octet(lb=0, ub=255): +            return random.randint(lb, ub) +        self._faked_ip = '37.191.%d.%d' % (octet(128), octet()) +      def _real_extract(self, url):          video_id = self._match_id(url) @@ -24,6 +44,8 @@ class NRKBaseIE(InfoExtractor):          title = data.get('fullTitle') or data.get('mainTitle') or data['title']          video_id = data.get('id') or video_id +        http_headers = {'X-Forwarded-For': self._faked_ip} if self._faked_ip else {} +          entries = []          media_assets = data.get('mediaAssets') @@ -54,6 +76,7 @@ class NRKBaseIE(InfoExtractor):                      'duration': duration,                      'subtitles': subtitles,                      'formats': formats, +                    'http_headers': http_headers,                  })          if not entries: @@ -70,10 +93,23 @@ class NRKBaseIE(InfoExtractor):                  }]          if not entries: -            if data.get('usageRights', {}).get('isGeoBlocked'): -                raise ExtractorError( -                    'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', -                    expected=True) +            message_type = data.get('messageType', '') +            # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* +            if 'IsGeoBlocked' in message_type and not self._faked_ip: +                self.report_warning( +                    'Video is geo restricted, trying to fake IP') +                self._fake_ip() +                return self._real_extract(url) + +            MESSAGES = { +                'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', +                'ProgramRightsHasExpired': 'Programmet har gått ut', +                'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', +            } +            raise ExtractorError( +                '%s said: %s' % (self.IE_NAME, MESSAGES.get( +                    message_type, message_type)), +                expected=True)          conviva = data.get('convivaStatistics') or {}          series = conviva.get('seriesName') or data.get('seriesTitle') diff --git a/youtube_dl/extractor/plays.py b/youtube_dl/extractor/plays.py index c3c38cf4a..ddfc6f148 100644 --- a/youtube_dl/extractor/plays.py +++ b/youtube_dl/extractor/plays.py @@ -8,30 +8,31 @@ from ..utils import int_or_none  class PlaysTVIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?plays\.tv/video/(?P<id>[0-9a-f]{18})' -    _TEST = { -        'url': 'http://plays.tv/video/56af17f56c95335490/when-you-outplay-the-azir-wall', +    _VALID_URL = r'https?://(?:www\.)?plays\.tv/(?:video|embeds)/(?P<id>[0-9a-f]{18})' +    _TESTS = [{ +        'url': 'https://plays.tv/video/56af17f56c95335490/when-you-outplay-the-azir-wall',          'md5': 'dfeac1198506652b5257a62762cec7bc',          'info_dict': {              'id': '56af17f56c95335490',              'ext': 'mp4', -            'title': 'When you outplay the Azir wall', +            'title': 'Bjergsen - When you outplay the Azir wall',              'description': 'Posted by Bjergsen',          } -    } +    }, { +        'url': 'https://plays.tv/embeds/56af17f56c95335490', +        'only_matching': True, +    }]      def _real_extract(self, url):          video_id = self._match_id(url) -        webpage = self._download_webpage(url, video_id) +        webpage = self._download_webpage( +            'https://plays.tv/video/%s' % video_id, video_id) + +        info = self._search_json_ld(webpage, video_id,) -        title = self._og_search_title(webpage) -        content = self._parse_json( -            self._search_regex( -                r'R\.bindContent\(({.+?})\);', webpage, -                'content'), video_id)['content']          mpd_url, sources = re.search(              r'(?s)<video[^>]+data-mpd="([^"]+)"[^>]*>(.+?)</video>', -            content).groups() +            webpage).groups()          formats = self._extract_mpd_formats(              self._proto_relative_url(mpd_url), video_id, mpd_id='DASH')          for format_id, height, format_url in re.findall(r'<source\s+res="((\d+)h?)"\s+src="([^"]+)"', sources): @@ -42,10 +43,11 @@ class PlaysTVIE(InfoExtractor):              })          self._sort_formats(formats) -        return { +        info.update({              'id': video_id, -            'title': title,              'description': self._og_search_description(webpage), -            'thumbnail': self._og_search_thumbnail(webpage), +            'thumbnail': info.get('thumbnail') or self._og_search_thumbnail(webpage),              'formats': formats, -        } +        }) + +        return info diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 8d671cca7..acf9fda48 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -17,7 +17,7 @@ from ..compat import compat_urllib_parse_urlencode  class VLiveIE(InfoExtractor):      IE_NAME = 'vlive'      _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<id>[0-9]+)' -    _TEST = { +    _TESTS = [{          'url': 'http://www.vlive.tv/video/1326',          'md5': 'cc7314812855ce56de70a06a27314983',          'info_dict': { @@ -27,7 +27,20 @@ class VLiveIE(InfoExtractor):              'creator': "Girl's Day",              'view_count': int,          }, -    } +    }, { +        'url': 'http://www.vlive.tv/video/16937', +        'info_dict': { +            'id': '16937', +            'ext': 'mp4', +            'title': '[V LIVE] 첸백시 걍방', +            'creator': 'EXO', +            'view_count': int, +            'subtitles': 'mincount:12', +        }, +        'params': { +            'skip_download': True, +        }, +    }]      def _real_extract(self, url):          video_id = self._match_id(url) @@ -116,7 +129,7 @@ class VLiveIE(InfoExtractor):          subtitles = {}          for caption in playinfo.get('captions', {}).get('list', []): -            lang = dict_get(caption, ('language', 'locale', 'country', 'label')) +            lang = dict_get(caption, ('locale', 'language', 'country', 'label'))              if lang and caption.get('source'):                  subtitles[lang] = [{                      'ext': 'vtt', | 
