diff options
| author | Sergey M․ <dstftw@gmail.com> | 2016-09-11 14:59:14 +0700 | 
|---|---|---|
| committer | Sergey M․ <dstftw@gmail.com> | 2016-09-11 14:59:14 +0700 | 
| commit | 2cb93afcd8a8a1f086a97ef3791fa033ddc1610a (patch) | |
| tree | 0136b0b3149ba2aafbc84ae19b5d5fb7b990c0f0 | |
| parent | bfcda07a2710738c32f63fdb4e09e177acc53df3 (diff) | |
[viafree] Improve video id extraction (Closes #10615)
| -rw-r--r-- | youtube_dl/extractor/tvplay.py | 36 | 
1 files changed, 31 insertions, 5 deletions
```diff
diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py
index c0fec2594..5548ff2ac 100644
--- a/youtube_dl/extractor/tvplay.py
+++ b/youtube_dl/extractor/tvplay.py
@@ -16,6 +16,7 @@ from ..utils import (
     parse_iso8601,
     qualities,
     try_get,
+    js_to_json,
     update_url_query,
 )
 
@@ -368,6 +369,10 @@ class ViafreeIE(InfoExtractor):
         },
         'add_ie': [TVPlayIE.ie_key()],
     }, {
+        # Different og:image URL schema
+        'url': 'www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-2',
+        'only_matching': True,
+    }, {
         'url': 'http://www.viafree.no/programmer/underholdning/det-beste-vorspielet/sesong-2/episode-1',
         'only_matching': True,
     }, {
@@ -384,14 +389,35 @@ class ViafreeIE(InfoExtractor):
 
         webpage = self._download_webpage(url, video_id)
 
+        data = self._parse_json(
+            self._search_regex(
+                r'(?s)window\.App\s*=\s*({.+?})\s*;\s*</script',
+                webpage, 'data', default='{}'),
+            video_id, transform_source=lambda x: re.sub(
+                r'(?s)function\s+[a-zA-Z_][\da-zA-Z_]*\s*\([^)]*\)\s*{[^}]*}\s*',
+                'null', x), fatal=False)
+
         video_id = None
 
-        thumbnail = self._og_search_thumbnail(webpage, default=None)
-        if thumbnail:
-            video_id = self._search_regex(
-                r'https?://[^/]+/imagecache/(?:[^/]+/)+seasons/\d+/(\d{6,})/',
-                thumbnail, 'video id', default=None)
+        if data:
+            video_id = try_get(
+                data, lambda x: x['context']['dispatcher']['stores'][
+                    'ContentPageProgramStore']['currentVideo']['id'],
+                compat_str)
+
+        # Fallback #1 (extract from og:image URL schema)
+        if not video_id:
+            thumbnail = self._og_search_thumbnail(webpage, default=None)
+            if thumbnail:
+                video_id = self._search_regex(
+                    # Patterns seen:
+                    #  http://cdn.playapi.mtgx.tv/imagecache/600x315/cloud/content-images/inbox/765166/a2e95e5f1d735bab9f309fa345cc3f25.jpg
+                    #  http://cdn.playapi.mtgx.tv/imagecache/600x315/cloud/content-images/seasons/15204/758770/4a5ba509ca8bc043e1ebd1a76131cdf2.jpg
+                    r'https?://[^/]+/imagecache/(?:[^/]+/)+(\d{6,})/',
+                    thumbnail, 'video id', default=None)
 
+        # Fallback #2. Extract from raw JSON string.
+        # May extract wrong video id if relatedClips is present.
         if not video_id:
             video_id = self._search_regex(
                 r'currentVideo["\']\s*:\s*.+?["\']id["\']\s*:\s*["\'](\d{6,})',
```
