diff options
Diffstat (limited to 'youtube_dl/extractor/common.py')
-rw-r--r-- | youtube_dl/extractor/common.py | 117 |
1 files changed, 106 insertions, 11 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index b633ea9b9..cd155a090 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -21,6 +21,7 @@ from ..compat import ( compat_str, ) from ..utils import ( + age_restricted, clean_html, compiled_regex_type, ExtractorError, @@ -40,7 +41,7 @@ class InfoExtractor(object): information about the video (or videos) the URL refers to. This information includes the real video URL, the video title, author and others. The information is stored in a dictionary which is then - passed to the FileDownloader. The FileDownloader processes this + passed to the YoutubeDL. The YoutubeDL processes this information possibly downloading the video to the file system, among other possible outcomes. @@ -92,6 +93,8 @@ class InfoExtractor(object): by this field, regardless of all other values. -1 for default (order by other properties), -2 or smaller for less than default. + < -1000 to hide the format (if there is + another one which is strictly better) * language_preference Is this in the correct requested language? 10 if it's what the URL is about, @@ -118,6 +121,7 @@ class InfoExtractor(object): The following fields are optional: + alt_title: A secondary title of the video. display_id An alternative identifier for the video, not necessarily unique, but available before title. Typically, id is something like "4234987", title "Dancing naked mole rats", @@ -129,7 +133,7 @@ class InfoExtractor(object): * "resolution" (optional, string "{width}x{height"}, deprecated) thumbnail: Full URL to a video thumbnail image. - description: One-line video description. + description: Full video description. uploader: Full name of the video uploader. timestamp: UNIX timestamp of the moment the video became available. upload_date: Video upload date (YYYYMMDD). @@ -143,6 +147,17 @@ class InfoExtractor(object): like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video comment_count: Number of comments on the video + comments: A list of comments, each with one or more of the following + properties (all but one of text or html optional): + * "author" - human-readable name of the comment author + * "author_id" - user ID of the comment author + * "id" - Comment ID + * "html" - Comment as HTML + * "text" - Plain text of the comment + * "timestamp" - UNIX timestamp of comment + * "parent" - ID of the comment this one is replying to. + Set to "root" to indicate that this is a + comment to the original video. age_limit: Age restriction for the video, as an integer (years) webpage_url: The url to the video webpage, if given to youtube-dl it should allow to get the same result again. (It will be set @@ -158,8 +173,8 @@ class InfoExtractor(object): _type "playlist" indicates multiple videos. - There must be a key "entries", which is a list or a PagedList object, each - element of which is a valid dictionary under this specfication. + There must be a key "entries", which is a list, an iterable, or a PagedList + object, each element of which is a valid dictionary by this specification. Additionally, playlists can have "title" and "id" attributes with the same semantics as videos (see above). @@ -174,9 +189,10 @@ class InfoExtractor(object): _type "url" indicates that the video must be extracted from another location, possibly by a different extractor. Its only required key is: "url" - the next URL to extract. - - Additionally, it may have properties believed to be identical to the - resolved entity, for example "title" if the title of the referred video is + The key "ie_key" can be set to the class name (minus the trailing "IE", + e.g. "Youtube") if the extractor class is known in advance. + Additionally, the dictionary may have any properties of the resolved entity + known in advance, for example "title" if the title of the referred video is known ahead of time. @@ -400,6 +416,10 @@ class InfoExtractor(object): url_or_request, video_id, note, errnote, fatal=fatal) if (not fatal) and json_string is False: return None + return self._parse_json( + json_string, video_id, transform_source=transform_source, fatal=fatal) + + def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): if transform_source: json_string = transform_source(json_string) try: @@ -449,7 +469,7 @@ class InfoExtractor(object): return video_info @staticmethod - def playlist_result(entries, playlist_id=None, playlist_title=None): + def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None): """Returns a playlist""" video_info = {'_type': 'playlist', 'entries': entries} @@ -457,6 +477,8 @@ class InfoExtractor(object): video_info['id'] = playlist_id if playlist_title: video_info['title'] = playlist_title + if playlist_description: + video_info['description'] = playlist_description return video_info def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None): @@ -591,9 +613,9 @@ class InfoExtractor(object): if display_name is None: display_name = name return self._html_search_regex( - r'''(?ix)<meta + r'''(?isx)<meta (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1) - [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name), + [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name), html, display_name, fatal=fatal, group='content', **kwargs) def _dc_search_uploader(self, html): @@ -800,6 +822,49 @@ class InfoExtractor(object): self._sort_formats(formats) return formats + # TODO: improve extraction + def _extract_smil_formats(self, smil_url, video_id): + smil = self._download_xml( + smil_url, video_id, 'Downloading SMIL file', + 'Unable to download SMIL file') + + base = smil.find('./head/meta').get('base') + + formats = [] + rtmp_count = 0 + for video in smil.findall('./body/switch/video'): + src = video.get('src') + if not src: + continue + bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) + width = int_or_none(video.get('width')) + height = int_or_none(video.get('height')) + proto = video.get('proto') + if not proto: + if base: + if base.startswith('rtmp'): + proto = 'rtmp' + elif base.startswith('http'): + proto = 'http' + ext = video.get('ext') + if proto == 'm3u8': + formats.extend(self._extract_m3u8_formats(src, video_id, ext)) + elif proto == 'rtmp': + rtmp_count += 1 + streamer = video.get('streamer') or base + formats.append({ + 'url': streamer, + 'play_path': src, + 'ext': 'flv', + 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate), + 'tbr': bitrate, + 'width': width, + 'height': height, + }) + self._sort_formats(formats) + + return formats + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() @@ -829,10 +894,40 @@ class InfoExtractor(object): return res def _set_cookie(self, domain, name, value, expire_time=None): - cookie = compat_cookiejar.Cookie(0, name, value, None, None, domain, None, + cookie = compat_cookiejar.Cookie( + 0, name, value, None, None, domain, None, None, '/', True, False, expire_time, '', None, None, None) self._downloader.cookiejar.set_cookie(cookie) + def get_testcases(self, include_onlymatching=False): + t = getattr(self, '_TEST', None) + if t: + assert not hasattr(self, '_TESTS'), \ + '%s has _TEST and _TESTS' % type(self).__name__ + tests = [t] + else: + tests = getattr(self, '_TESTS', []) + for t in tests: + if not include_onlymatching and t.get('only_matching', False): + continue + t['name'] = type(self).__name__[:-len('IE')] + yield t + + def is_suitable(self, age_limit): + """ Test whether the extractor is generally suitable for the given + age limit (i.e. pornographic sites are not, all others usually are) """ + + any_restricted = False + for tc in self.get_testcases(include_onlymatching=False): + if 'playlist' in tc: + tc = tc['playlist'][0] + is_restricted = age_restricted( + tc.get('info_dict', {}).get('age_limit'), age_limit) + if not is_restricted: + return True + any_restricted = any_restricted or is_restricted + return not any_restricted + class SearchInfoExtractor(InfoExtractor): """ |