diff options
author | Sergey M․ <dstftw@gmail.com> | 2017-05-06 02:27:06 +0700 |
---|---|---|
committer | Sergey M․ <dstftw@gmail.com> | 2017-05-06 02:27:06 +0700 |
commit | 9cafc3fd8b54b9b91a145cddf9e4db0bd59e1b5f (patch) | |
tree | 84846091b48548204fb94f3a7cc30d11187570f7 /youtube_dl/extractor | |
parent | 329e3dd5adf52520c87ba31395d090455114783b (diff) |
[youtube] Extract chapters
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/youtube.py | 36 |
1 files changed, 34 insertions, 2 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 480f403da..ec12e313c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1257,6 +1257,35 @@ class YoutubeIE(YoutubeBaseInfoExtractor): url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.') + @staticmethod + def _extract_chapters(description, duration): + if not description: + return None + chapter_lines = re.findall( + r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)', + description) + if not chapter_lines: + return None + chapters = [] + for next_num, (chapter_line, time_point) in enumerate( + chapter_lines, start=1): + start_time = parse_duration(time_point) + if start_time is None: + continue + end_time = (duration if next_num == len(chapter_lines) + else parse_duration(chapter_lines[next_num][1])) + if end_time is None: + continue + chapter_title = re.sub( + r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-') + chapter_title = re.sub(r'\s+', ' ', chapter_title) + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': chapter_title, + }) + return chapters + def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -1399,9 +1428,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_title = '_' # description - video_description = get_element_by_id("eow-description", video_webpage) + description_original = video_description = get_element_by_id("eow-description", video_webpage) if video_description: - video_description = re.sub(r'''(?x) + description_original = video_description = re.sub(r'''(?x) <a\s+ (?:[a-zA-Z-]+="[^"]*"\s+)*? (?:title|href)="([^"]+)"\s+ @@ -1558,6 +1587,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if self._downloader.params.get('writeannotations', False): video_annotations = self._extract_annotations(video_id) + chapters = self._extract_chapters(description_original, video_duration) + if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): self.report_rtmp_download() formats = [{ @@ -1790,6 +1821,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'duration': video_duration, 'age_limit': 18 if age_gate else 0, 'annotations': video_annotations, + 'chapters': chapters, 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id, 'view_count': view_count, 'like_count': like_count, |