From ac19c3ac8035fbf9369fd4bd336c9045d4eeafa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 05:35:39 +0700 Subject: [go] Improve video id extraction (closes #25207, closes #25216, closes #26058) --- youtube_dl/extractor/go.py | 46 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 8 deletions(-) (limited to 'youtube_dl/extractor/go.py') diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index 0d731e90a..878ba14e6 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -4,10 +4,12 @@ from __future__ import unicode_literals import re from .adobepass import AdobePassIE +from ..compat import compat_str from ..utils import ( int_or_none, determine_ext, parse_age_limit, + try_get, urlencode_postdata, ExtractorError, ) @@ -116,6 +118,18 @@ class GoIE(AdobePassIE): # m3u8 download 'skip_download': True, }, + }, { + 'url': 'https://abc.com/shows/modern-family/episode-guide/season-01/101-pilot', + 'info_dict': { + 'id': 'VDKA22600213', + 'ext': 'mp4', + 'title': 'Pilot', + 'description': 'md5:74306df917cfc199d76d061d66bebdb4', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding', 'only_matching': True, @@ -149,14 +163,30 @@ class GoIE(AdobePassIE): brand = site_info.get('brand') if not video_id or not site_info: webpage = self._download_webpage(url, display_id or video_id) - video_id = self._search_regex( - ( - # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" - # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood - r'data-video-id=["\']*(VDKA\w+)', - # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet - r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)' - ), webpage, 'video id', default=video_id) + data = self._parse_json( + self._search_regex( + r'["\']__abc_com__["\']\s*\]\s*=\s*({.+?})\s*;', webpage, + 'data', default='{}'), + display_id or video_id, fatal=False) + # https://abc.com/shows/modern-family/episode-guide/season-01/101-pilot + layout = try_get(data, lambda x: x['page']['content']['video']['layout'], dict) + video_id = None + if layout: + video_id = try_get( + layout, + (lambda x: x['videoid'], lambda x: x['video']['id']), + compat_str) + if not video_id: + video_id = self._search_regex( + ( + # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" + # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood + r'data-video-id=["\']*(VDKA\w+)', + # page.analytics.videoIdCode + r'\bvideoIdCode["\']\s*:\s*["\']((?:vdka|VDKA)\w+)', + # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet + r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)' + ), webpage, 'video id', default=video_id) if not site_info: brand = self._search_regex( (r'data-brand=\s*["\']\s*(\d+)', -- cgit v1.2.3