Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r--  youtube_dl/extractor/bilibili.py  145
1 files changed, 93 insertions, 52 deletions
diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py
index beffcecd0..b898223e3 100644
--- a/youtube_dl/extractor/bilibili.py
+++ b/youtube_dl/extractor/bilibili.py
@@ -27,14 +27,14 @@ class BiliBiliIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'http://www.bilibili.tv/video/av1074402/',
-        'md5': '9fa226fe2b8a9a4d5a69b4c6a183417e',
+        'md5': '5f7d29e1a2872f3df0cf76b1f87d3788',
         'info_dict': {
             'id': '1074402',
-            'ext': 'mp4',
+            'ext': 'flv',
             'title': '【金坷垃】金泡沫',
             'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
-            'duration': 308.315,
-            'timestamp': 1398012660,
+            'duration': 308.067,
+            'timestamp': 1398012678,
             'upload_date': '20140420',
             'thumbnail': r're:^https?://.+\.jpg',
             'uploader': '菊子桑',
@@ -59,17 +59,38 @@ class BiliBiliIE(InfoExtractor):
         'url': 'http://www.bilibili.com/video/av8903802/',
         'info_dict': {
             'id': '8903802',
-            'ext': 'mp4',
             'title': '阿滴英文|英文歌分享#6 "Closer',
             'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文',
-            'uploader': '阿滴英文',
-            'uploader_id': '65880958',
-            'timestamp': 1488382620,
-            'upload_date': '20170301',
-        },
-        'params': {
-            'skip_download': True,  # Test metadata only
         },
+        'playlist': [{
+            'info_dict': {
+                'id': '8903802_part1',
+                'ext': 'flv',
+                'title': '阿滴英文|英文歌分享#6 "Closer',
+                'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
+                'uploader': '阿滴英文',
+                'uploader_id': '65880958',
+                'timestamp': 1488382634,
+                'upload_date': '20170301',
+            },
+            'params': {
+                'skip_download': True,  # Test metadata only
+            },
+        }, {
+            'info_dict': {
+                'id': '8903802_part2',
+                'ext': 'flv',
+                'title': '阿滴英文|英文歌分享#6 "Closer',
+                'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
+                'uploader': '阿滴英文',
+                'uploader_id': '65880958',
+                'timestamp': 1488382634,
+                'upload_date': '20170301',
+            },
+            'params': {
+                'skip_download': True,  # Test metadata only
+            },
+        }]
     }]
 
     _APP_KEY = '84956560bc028eb7'
@@ -92,9 +113,13 @@ class BiliBiliIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
 
         if 'anime/' not in url:
-            cid = compat_parse_qs(self._search_regex(
-                [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)',
-                 r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'],
+            cid = self._search_regex(
+                r'cid(?:["\']:|=)(\d+)', webpage, 'cid',
+                default=None
+            ) or compat_parse_qs(self._search_regex(
+                [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)',
+                 r'EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)',
+                 r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'],
                 webpage, 'player parameters'))['cid'][0]
         else:
             if 'no_bangumi_tip' not in smuggled_data:
@@ -114,53 +139,66 @@ class BiliBiliIE(InfoExtractor):
                 self._report_error(js)
             cid = js['result']['cid']
 
-        payload = 'appkey=%s&cid=%s&otype=json&quality=2&type=mp4' % (self._APP_KEY, cid)
-        sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest()
-
         headers = {
             'Referer': url
         }
         headers.update(self.geo_verification_headers())
 
-        video_info = self._download_json(
-            'http://interface.bilibili.com/playurl?%s&sign=%s' % (payload, sign),
-            video_id, note='Downloading video info page',
-            headers=headers)
-
-        if 'durl' not in video_info:
-            self._report_error(video_info)
-
         entries = []
 
-        for idx, durl in enumerate(video_info['durl']):
-            formats = [{
-                'url': durl['url'],
-                'filesize': int_or_none(durl['size']),
-            }]
-            for backup_url in durl.get('backup_url', []):
-                formats.append({
-                    'url': backup_url,
-                    # backup URLs have lower priorities
-                    'preference': -2 if 'hd.mp4' in backup_url else -3,
+        RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4')
+        for num, rendition in enumerate(RENDITIONS, start=1):
+            payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition)
+            sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest()
+
+            video_info = self._download_json(
+                'http://interface.bilibili.com/playurl?%s&sign=%s' % (payload, sign),
+                video_id, note='Downloading video info page',
+                headers=headers, fatal=num == len(RENDITIONS))
+
+            if not video_info:
+                continue
+
+            if 'durl' not in video_info:
+                if num < len(RENDITIONS):
+                    continue
+                self._report_error(video_info)
+
+            for idx, durl in enumerate(video_info['durl']):
+                formats = [{
+                    'url': durl['url'],
+                    'filesize': int_or_none(durl['size']),
+                }]
+                for backup_url in durl.get('backup_url', []):
+                    formats.append({
+                        'url': backup_url,
+                        # backup URLs have lower priorities
+                        'preference': -2 if 'hd.mp4' in backup_url else -3,
+                    })
+
+                for a_format in formats:
+                    a_format.setdefault('http_headers', {}).update({
+                        'Referer': url,
+                    })
+
+                self._sort_formats(formats)
+
+                entries.append({
+                    'id': '%s_part%s' % (video_id, idx),
+                    'duration': float_or_none(durl.get('length'), 1000),
+                    'formats': formats,
                 })
+            break
 
-            for a_format in formats:
-                a_format.setdefault('http_headers', {}).update({
-                    'Referer': url,
-                })
-
-            self._sort_formats(formats)
-
-            entries.append({
-                'id': '%s_part%s' % (video_id, idx),
-                'duration': float_or_none(durl.get('length'), 1000),
-                'formats': formats,
-            })
-
-        title = self._html_search_regex('<h1[^>]*>([^<]+)</h1>', webpage, 'title')
+        title = self._html_search_regex(
+            ('<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
+             '(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
+            group='title')
         description = self._html_search_meta('description', webpage)
         timestamp = unified_timestamp(self._html_search_regex(
-            r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', default=None))
+            r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time',
+            default=None) or self._html_search_meta(
+            'uploadDate', webpage, 'timestamp', default=None))
         thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage)
 
         # TODO 'view_count' requires deobfuscating Javascript
@@ -174,13 +212,16 @@ class BiliBiliIE(InfoExtractor):
         }
 
         uploader_mobj = re.search(
-            r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]+title="(?P<name>[^"]+)"',
+            r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>(?P<name>[^<]+)',
            webpage)
         if uploader_mobj:
             info.update({
                 'uploader': uploader_mobj.group('name'),
                 'uploader_id': uploader_mobj.group('id'),
             })
+        if not info.get('uploader'):
+            info['uploader'] = self._html_search_meta(
+                'author', webpage, 'uploader', default=None)
 
         for entry in entries:
             entry.update(info)
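For reference, the playurl request that the patch builds can be sketched outside the extractor as below. This is a minimal sketch, not part of the commit: the APP_KEY value, the payload layout, the RENDITIONS order and the MD5 signing come straight from the diff, while BILIBILI_KEY is a placeholder for the _BILIBILI_KEY secret defined elsewhere in bilibili.py, and the cid in the usage example is made up.

    import hashlib

    APP_KEY = '84956560bc028eb7'   # _APP_KEY from the patch
    BILIBILI_KEY = '<app secret>'  # placeholder; the real _BILIBILI_KEY is defined elsewhere in bilibili.py

    # Renditions tried in order, as in the patch: high quality first, legacy mp4 as fallback
    RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4')


    def build_playurl(cid, rendition):
        """Return the signed playurl API URL for a given cid and rendition query."""
        payload = 'appkey=%s&cid=%s&otype=json&%s' % (APP_KEY, cid, rendition)
        # sign is the MD5 hex digest of the query string concatenated with the app secret
        sign = hashlib.md5((payload + BILIBILI_KEY).encode('utf-8')).hexdigest()
        return 'http://interface.bilibili.com/playurl?%s&sign=%s' % (payload, sign)


    if __name__ == '__main__':
        for rendition in RENDITIONS:
            print(build_playurl('12345678', rendition))  # hypothetical cid, for illustration only

Because the loop requests the qn=80 rendition first and only treats the last attempt as fatal (fatal=num == len(RENDITIONS)), the legacy quality=2&type=mp4 query becomes a fallback, which is also why the test expectations in the patch change from mp4 to flv.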
