[theplatform] Support URLs with 'guid='

author: Yen Chi Hsuan <yan12125@gmail.com> 2015-08-20 01:38:39 +0800
committer: Yen Chi Hsuan <yan12125@gmail.com> 2015-08-20 01:38:39 +0800
commit: 05fe2594e4589b4e714a423550172eeec3949a70 (patch)
tree: 2b1710e31dcf81cd491253c4a847027a359778ca
parent: 26e1c3514f4af1ed60cd1114a653fe49e1fa8d11 (diff)
1 files changed, 36 insertions, 0 deletions
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index f02e0f58d..883bf491c 100644
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -9,6 +9,10 @@ import hashlib
 
 
 from .common import InfoExtractor
+from ..compat import (
+    compat_parse_qs,
+    compat_urllib_parse_urlparse,
+)
 from ..utils import (
     determine_ext,
     ExtractorError,
@@ -120,6 +124,20 @@ class ThePlatformIE(ThePlatformBaseIE):
     }, {
         'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7',
         'only_matching': True,
+    }, {
+        'url': 'http://player.theplatform.com/p/2E2eJC/nbcNewsOffsite?guid=tdy_or_siri_150701',
+        'md5': '734f3790fb5fc4903da391beeebc4836',
+        'info_dict': {
+            'id': 'tdy_or_siri_150701',
+            'ext': 'mp4',
+            'title': 'iPhone Siri’s sassy response to a math question has people talking',
+            'description': 'md5:a565d1deadd5086f3331d57298ec6333',
+            'duration': 83.0,
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1435752600,
+            'upload_date': '20150701',
+            'categories': ['Today/Shows/Orange Room', 'Today/Sections/Money', 'Today/Topics/Tech', "Today/Topics/Editor's picks"],
+        },
     }]
 
     @staticmethod
@@ -154,6 +172,24 @@ class ThePlatformIE(ThePlatformBaseIE):
             path += '/media'
         path += '/' + video_id
 
+        qs_dict = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+        if 'guid' in qs_dict:
+            webpage = self._download_webpage(url, video_id)
+            scripts = re.findall(r'<script[^>]+src="([^"]+)"', webpage)
+            feed_id = None
+            # The feed id is usually located in the last script.
+            # There seems to be no pattern for the relevant script's filename,
+            # so try each script in turn, starting from the last one.
+            for script in reversed(scripts):
+                feed_script = self._download_webpage(script, video_id, 'Downloading feed script')
+                feed_id = self._search_regex(r'defaultFeedId\s*:\s*"([^"]+)"', feed_script, 'default feed id', default=None)
+                if feed_id is not None:
+                    break
+            if feed_id is None:
+                raise ExtractorError('Unable to find feed id')
+            return self.url_result('http://feed.theplatform.com/f/%s/%s?byGuid=%s' % (
+                provider_id, feed_id, qs_dict['guid'][0]))
+
         if smuggled_data.get('force_smil_url', False):
             smil_url = url
         elif mobj.group('config'):
author	Yen Chi Hsuan <yan12125@gmail.com>	2015-08-20 01:38:39 +0800
committer	Yen Chi Hsuan <yan12125@gmail.com>	2015-08-20 01:38:39 +0800
commit	05fe2594e4589b4e714a423550172eeec3949a70 (patch)
tree	2b1710e31dcf81cd491253c4a847027a359778ca
parent	26e1c3514f4af1ed60cd1114a653fe49e1fa8d11 (diff)