PhotobucketIE: accept new format of urls and add a test

author: Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com> 2013-05-05 13:07:00 +0200
committer: Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com> 2013-05-05 13:07:00 +0200
commit: d96680f58d918c99baab952338f547c42266bdf3 (patch)
tree: 3e7786eaa4d4da33c5909ee6856fe0da63baf315 /youtube_dl/InfoExtractors.py
parent: f8602d32429bf8a7dace35bcab91ba6be092cfde (diff)
1 files changed, 22 insertions, 9 deletions
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 8c3751d0a..a98d403b3 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -848,7 +848,10 @@ class DailymotionIE(InfoExtractor):
 class PhotobucketIE(InfoExtractor):
     """Information extractor for photobucket.com."""
 
-    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
+    # TODO: the original _VALID_URL was:
+    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
+    # Check if it's necessary to keep the old extraction process
+    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
     IE_NAME = u'photobucket'
 
     def _real_extract(self, url):
@@ -857,20 +860,30 @@ class PhotobucketIE(InfoExtractor):
         if mobj is None:
             raise ExtractorError(u'Invalid URL: %s' % url)
 
-        video_id = mobj.group(1)
+        video_id = mobj.group('id')
 
-        video_extension = 'flv'
+        video_extension = mobj.group('ext')
 
         # Retrieve video webpage to extract further information
-        request = compat_urllib_request.Request(url)
-        try:
-            self.report_download_webpage(video_id)
-            webpage = compat_urllib_request.urlopen(request).read()
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
+        webpage = self._download_webpage(url, video_id)
 
         # Extract URL, uploader, and title from webpage
         self.report_extraction(video_id)
+        # First, try to find the media info in the page's JavaScript code:
+        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
+        if mobj is not None:
+            info = json.loads(mobj.group('json'))
+            return [{
+                'id':       video_id,
+                'url':      info[u'downloadUrl'],
+                'uploader': info[u'username'],
+                'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
+                'title':    info[u'title'],
+                'ext':      video_extension,
+                'thumbnail': info[u'thumbUrl'],
+            }]
+
+        # Otherwise, fall back to the <link rel="video_src"> tag in the webpage
         mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
         if mobj is None:
             raise ExtractorError(u'Unable to extract media URL')
author	Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>	2013-05-05 13:07:00 +0200
committer	Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>	2013-05-05 13:07:00 +0200
commit	d96680f58d918c99baab952338f547c42266bdf3 (patch)
tree	3e7786eaa4d4da33c5909ee6856fe0da63baf315 /youtube_dl/InfoExtractors.py
parent	f8602d32429bf8a7dace35bcab91ba6be092cfde (diff)