From 56c7366547462ecec0536df58971249a8a870ddd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?=
 <jaime.marquinez.ferrandiz@gmail.com>
Date: Mon, 8 Jul 2013 15:14:27 +0200
Subject: YoutubeIE: reuse instances of InfoExtractors (closes #998)

When an IE is added to the list, it's also added to a dictionary. When an IE is requested, the dictionary is checked first, and a new instance is created only if none exists yet.

That way, _real_initialize is only called once for each IE, saving time if it needs to log in, for example.
---
 youtube_dl/extractor/common.py | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'youtube_dl/extractor')
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 1d98222ce..236c7b12c 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -106,6 +106,11 @@ class InfoExtractor(object):
         """Real extraction process. Redefine in subclasses."""
         pass
 
+    @classmethod
+    def ie_key(cls):
+        """A string for getting the InfoExtractor with get_info_extractor"""
+        return cls.__name__[:-2]
+
     @property
     def IE_NAME(self):
         return type(self).__name__[:-2]
-- 
cgit v1.2.3


From 6d3a7d03e14fcbc704bf30d305fb95c5829e55a6 Mon Sep 17 00:00:00 2001
From: huohuarong <huohuarong@gmail.com>
Date: Fri, 2 Aug 2013 15:26:11 +0800
Subject: fix bug: kankan extractor does not support
 http://vod.kankan.com/v/70/70309.shtml

---
 youtube_dl/extractor/kankan.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/kankan.py b/youtube_dl/extractor/kankan.py
index 8537ba584..445d46501 100644
--- a/youtube_dl/extractor/kankan.py
+++ b/youtube_dl/extractor/kankan.py
@@ -21,8 +21,10 @@ class KankanIE(InfoExtractor):
         video_id = mobj.group('id')
         webpage = self._download_webpage(url, video_id)
 
-        title = self._search_regex(r'G_TITLE=[\'"](.+?)[\'"]', webpage, u'video title')
-        gcid = self._search_regex(r'lurl:[\'"]http://.+?/.+?/(.+?)/', webpage, u'gcid')
+        title = self._search_regex(r'(?:G_TITLE=|G_MOVIE_TITLE = )[\'"](.+?)[\'"]', webpage, u'video title')
+        surls = re.search(r'surls:\[\'.+?\'\]|lurl:\'.+?\.flv\'', webpage).group(0)
+        gcids = re.findall(r"http://.+?/.+?/(.+?)/", surls)
+        gcid = gcids[-1]
 
         video_info_page = self._download_webpage('http://p2s.cl.kankan.com/getCdnresource_flv?gcid=%s' % gcid,
                                                  video_id, u'Downloading video url info')
-- 
cgit v1.2.3


From 6624a2b07dafad4de895b4e84f4595214817518d Mon Sep 17 00:00:00 2001
From: huohuarong <huohuarong@gmail.com>
Date: Fri, 2 Aug 2013 17:58:46 +0800
Subject: add an extractor for tv.sohu.com

---
 youtube_dl/extractor/__init__.py |  1 +
 youtube_dl/extractor/sohu.py     | 97 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 98 insertions(+)
 create mode 100644 youtube_dl/extractor/sohu.py

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index c20172a53..3a08d676f 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -55,6 +55,7 @@ from .redtube import RedTubeIE
 from .ringtv import RingTVIE
 from .roxwel import RoxwelIE
 from .sina import SinaIE
+from .sohu import SohuIE
 from .soundcloud import SoundcloudIE, SoundcloudSetIE
 from .spiegel import SpiegelIE
 from .stanfordoc import StanfordOpenClassroomIE
diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py
new file mode 100644
index 000000000..830814221
--- /dev/null
+++ b/youtube_dl/extractor/sohu.py
@@ -0,0 +1,97 @@
+# encoding: utf-8
+
+import re
+import json
+import time
+import logging
+import urllib2
+
+from .common import InfoExtractor
+from ..utils import compat_urllib_request
+
+
+class SohuIE(InfoExtractor):
+    _VALID_URL = r'https?://tv\.sohu\.com/\d+?/n(?P<id>\d+)\.shtml.*?'
+
+    _TEST = {
+        u'url': u'http://tv.sohu.com/20130724/n382479172.shtml#super',
+        u'file': u'382479172.flv',
+        u'md5': u'cc84eed6b6fbf0f2f9a8d3cb9da1939b',
+        u'info_dict': {
+            u'title': u'The Illest - Far East Movement Riff Raff',
+        },
+    }
+
+    def _clearn_html(self, string):
+        tags = re.findall(r'<.+?>', string)
+        for t in tags:
+            string = string.replace(t, ' ')
+        for i in range(2):
+            spaces = re.findall(r'\s+', string)
+            for s in spaces:
+                string = string.replace(s, ' ')
+        string = string.strip()
+        return string
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+        pattern = r'<h1 id="video-title">\n*?(.+?)\n*?</h1>'
+        compiled = re.compile(pattern, re.DOTALL)
+        title = self._search_regex(compiled, webpage, u'video title').strip('\t\n')
+        title = self._clearn_html(title)
+        pattern = re.compile(r'var vid="(\d+)"')
+        result = re.search(pattern, webpage)
+        if not result:
+            logging.info('[Sohu] could not get vid')
+            return None
+        vid = result.group(1)
+        logging.info('vid: %s' % vid)
+        base_url_1 = 'http://hot.vrs.sohu.com/vrs_flash.action?vid='
+        url_1 = base_url_1 + vid
+        logging.info('json url: %s' % url_1)
+        json_1 = json.loads(urllib2.urlopen(url_1).read())
+        # get the highest definition video vid and json infomation.
+        vids = []
+        qualities = ('oriVid', 'superVid', 'highVid', 'norVid')
+        for vid_name in qualities:
+            vids.append(json_1['data'][vid_name])
+        clearest_vid = 0
+        for i, v in enumerate(vids):
+            if v != 0:
+                clearest_vid = v
+                logging.info('quality definition: %s' % qualities[i][:-3])
+                break
+        if not clearest_vid:
+            logging.warning('could not find valid clearest_vid')
+            return None
+        if vid != clearest_vid:
+            url_1 = '%s%d' % (base_url_1, clearest_vid)
+            logging.info('highest definition json url: %s' % url_1)
+            json_1 = json.loads(urllib2.urlopen(url_1).read())
+        allot = json_1['allot']
+        prot = json_1['prot']
+        clipsURL = json_1['data']['clipsURL']
+        su = json_1['data']['su']
+        num_of_parts = json_1['data']['totalBlocks']
+        logging.info('Total parts: %d' % num_of_parts)
+        base_url_3 = 'http://allot/?prot=prot&file=clipsURL[i]&new=su[i]'
+        files_info = []
+        for i in range(num_of_parts):
+            middle_url = 'http://%s/?prot=%s&file=%s&new=%s' % (allot, prot, clipsURL[i], su[i])
+            logging.info('middle url part %d: %s' % (i, middle_url))
+            middle_info = urllib2.urlopen(middle_url).read().split('|')
+            middle_part_1 = middle_info[0]
+            download_url = '%s%s?key=%s' % (middle_info[0], su[i], middle_info[3])
+
+            info = {
+                'id': '%s_part%02d' % (video_id, i + 1),
+                'title': title,
+                'url': download_url,
+                'ext': 'mp4',
+            }
+            files_info.append(info)
+            time.sleep(1)
+
+        return files_info
-- 
cgit v1.2.3


From 4ec929dc9b55a2588b4a27e64871c5bfa900bf37 Mon Sep 17 00:00:00 2001
From: huohuarong <huohuarong@gmail.com>
Date: Sat, 3 Aug 2013 10:29:58 +0800
Subject: use ..utils/clean_html()

---
 youtube_dl/extractor/sohu.py | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py
index 830814221..cf0ab5478 100644
--- a/youtube_dl/extractor/sohu.py
+++ b/youtube_dl/extractor/sohu.py
@@ -7,7 +7,7 @@ import logging
 import urllib2
 
 from .common import InfoExtractor
-from ..utils import compat_urllib_request
+from ..utils import compat_urllib_request, clean_html
 
 
 class SohuIE(InfoExtractor):
@@ -22,16 +22,6 @@ class SohuIE(InfoExtractor):
         },
     }
 
-    def _clearn_html(self, string):
-        tags = re.findall(r'<.+?>', string)
-        for t in tags:
-            string = string.replace(t, ' ')
-        for i in range(2):
-            spaces = re.findall(r'\s+', string)
-            for s in spaces:
-                string = string.replace(s, ' ')
-        string = string.strip()
-        return string
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -40,7 +30,7 @@ class SohuIE(InfoExtractor):
         pattern = r'<h1 id="video-title">\n*?(.+?)\n*?</h1>'
         compiled = re.compile(pattern, re.DOTALL)
         title = self._search_regex(compiled, webpage, u'video title').strip('\t\n')
-        title = self._clearn_html(title)
+        title = clean_html(title)
         pattern = re.compile(r'var vid="(\d+)"')
         result = re.search(pattern, webpage)
         if not result:
@@ -93,5 +83,8 @@ class SohuIE(InfoExtractor):
             }
             files_info.append(info)
             time.sleep(1)
-
+        if num_of_parts == 1:
+            info =  files_info[0]
+            info['id'] = video_id
+            return info
         return files_info
-- 
cgit v1.2.3


From b5a6d408181c118bf51382f486a2492643ed74ec Mon Sep 17 00:00:00 2001
From: huohuarong <huohuarong@gmail.com>
Date: Mon, 5 Aug 2013 22:51:54 +0800
Subject: fix title parsing bug

---
 youtube_dl/extractor/sohu.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py
index cf0ab5478..cd049b6f0 100644
--- a/youtube_dl/extractor/sohu.py
+++ b/youtube_dl/extractor/sohu.py
@@ -27,10 +27,10 @@ class SohuIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
         webpage = self._download_webpage(url, video_id)
-        pattern = r'<h1 id="video-title">\n*?(.+?)\n*?</h1>'
+        pattern = r'<title>(.+?)</title>'
         compiled = re.compile(pattern, re.DOTALL)
-        title = self._search_regex(compiled, webpage, u'video title').strip('\t\n')
-        title = clean_html(title)
+        title = self._search_regex(compiled, webpage, u'video title')
+        title = clean_html(title).split('-')[0].strip()
         pattern = re.compile(r'var vid="(\d+)"')
         result = re.search(pattern, webpage)
         if not result:
@@ -41,7 +41,8 @@ class SohuIE(InfoExtractor):
         base_url_1 = 'http://hot.vrs.sohu.com/vrs_flash.action?vid='
         url_1 = base_url_1 + vid
         logging.info('json url: %s' % url_1)
-        json_1 = json.loads(urllib2.urlopen(url_1).read())
+        webpage = self._download_webpage(url_1, vid)
+        json_1 = json.loads(webpage)
         # get the highest definition video vid and json infomation.
         vids = []
         qualities = ('oriVid', 'superVid', 'highVid', 'norVid')
-- 
cgit v1.2.3


From 461cead4f788f6a69902f350b9143a5e1588b57d Mon Sep 17 00:00:00 2001
From: tsantala <tsantala@tsantala-desktop.(none)>
Date: Tue, 6 Aug 2013 04:34:24 +0300
Subject: Add AddAnime extractor for add-anime.net

---
 youtube_dl/extractor/AddAnime.py | 54 ++++++++++++++++++++++++++++++++++++++++
 youtube_dl/extractor/__init__.py |  2 ++
 2 files changed, 56 insertions(+)
 create mode 100644 youtube_dl/extractor/AddAnime.py

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/AddAnime.py b/youtube_dl/extractor/AddAnime.py
new file mode 100644
index 000000000..43b0b24fe
--- /dev/null
+++ b/youtube_dl/extractor/AddAnime.py
@@ -0,0 +1,54 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+)
+from bs4 import BeautifulSoup
+
+
+class AddAnimeIE(InfoExtractor):
+
+    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?add-anime\.net/watch_video.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)'
+    IE_NAME = u'AddAnime'
+    _TEST = {
+        u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9',
+        u'file': u'137499050692ced.flv',
+        u'md5': u'0813c2430bea7a46bf13acf3406992f4',
+        u'info_dict': {
+            u"description": u"One Piece 606", 
+            u"uploader": u"mugiwaraQ8", 
+            u"title": u"One Piece 606"
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            raise ExtractorError(u'Invalid URL: %s' % url)
+
+        video_id = mobj.group('video_id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._search_regex(r'var normal_video_file = "(.*?)",',
+            webpage, u'video URL')
+
+        video_title = self._og_search_title(webpage)
+
+        video_description = self._og_search_description(webpage)
+        
+        soup = BeautifulSoup(webpage)
+        
+        video_uploader= soup.find("meta", {"author":""})['content']
+
+        info = {
+            'id':  video_id,
+            'url': video_url,
+            'ext': 'flv',
+            'title': video_title,
+            'description': video_description,
+            'uploader': video_uploader
+        }
+
+        return [info]
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 84c02c2ed..28dcb2cc4 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -1,3 +1,5 @@
+
+from .AddAnime import AddAnimeIE
 from .archiveorg import ArchiveOrgIE
 from .ard import ARDIE
 from .arte import ArteTvIE
-- 
cgit v1.2.3


From d5b00ee6e0ba70fd5d87752e8772fc1c39e4bd59 Mon Sep 17 00:00:00 2001
From: huohuarong <huohuarong@gmail.com>
Date: Tue, 6 Aug 2013 10:26:57 +0800
Subject: improve sohu extractor

---
 youtube_dl/extractor/sohu.py | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py
index cd049b6f0..24fc3a5d7 100644
--- a/youtube_dl/extractor/sohu.py
+++ b/youtube_dl/extractor/sohu.py
@@ -31,6 +31,7 @@ class SohuIE(InfoExtractor):
         compiled = re.compile(pattern, re.DOTALL)
         title = self._search_regex(compiled, webpage, u'video title')
         title = clean_html(title).split('-')[0].strip()
+        self.to_screen('Title: %s' % title)
         pattern = re.compile(r'var vid="(\d+)"')
         result = re.search(pattern, webpage)
         if not result:
@@ -70,6 +71,7 @@ class SohuIE(InfoExtractor):
         base_url_3 = 'http://allot/?prot=prot&file=clipsURL[i]&new=su[i]'
         files_info = []
         for i in range(num_of_parts):
+            self.to_screen('Geting json infomation of part %s/%s' % (i + 1, num_of_parts))
             middle_url = 'http://%s/?prot=%s&file=%s&new=%s' % (allot, prot, clipsURL[i], su[i])
             logging.info('middle url part %d: %s' % (i, middle_url))
             middle_info = urllib2.urlopen(middle_url).read().split('|')
-- 
cgit v1.2.3


From 97b3656c2e37e45d556816b8f1f15c20d14f1acd Mon Sep 17 00:00:00 2001
From: rzhxeo <rzhxeot7z81b4700@mailcatch.com>
Date: Fri, 9 Aug 2013 18:37:33 +0200
Subject:  YoupornIE: Add support for hd videos and update Test

---
 youtube_dl/extractor/youporn.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py
index d1156bf42..cc9c37027 100644
--- a/youtube_dl/extractor/youporn.py
+++ b/youtube_dl/extractor/youporn.py
@@ -12,14 +12,16 @@ from ..utils import (
     unescapeHTML,
     unified_strdate,
 )
-
+from ..aes import (
+    aes_decrypt_text
+)
 
 class YouPornIE(InfoExtractor):
     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
     _TEST = {
         u'url': u'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
         u'file': u'505835.mp4',
-        u'md5': u'c37ddbaaa39058c76a7e86c6813423c1',
+        u'md5': u'71ec5fcfddacf80f495efa8b6a8d9a89',
         u'info_dict': {
             u"upload_date": u"20101221", 
             u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?", 
@@ -75,6 +77,14 @@ class YouPornIE(InfoExtractor):
         # Get all of the links from the page
         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
         links = re.findall(LINK_RE, download_list_html)
+        
+        # Get link of hd video
+        encrypted_video_url = self._html_search_regex(r'var encryptedURL = \'(?P<encrypted_video_url>[a-zA-Z0-9+/]+={0,2})\';',
+            webpage, u'encrypted_video_url')
+        video_url = unicode( aes_decrypt_text(encrypted_video_url, video_title, 32), 'utf-8')
+        if video_url.split('/')[6].split('_')[0] == u'720p': # only add if 720p to avoid duplicates
+            links = [video_url] + links
+        
         if(len(links) == 0):
             raise ExtractorError(u'ERROR: no known formats available for video')
 
-- 
cgit v1.2.3


From 5a27ecdd2ec83ba6e1069428c4c0fb3bd61f638c Mon Sep 17 00:00:00 2001
From: kkalpakloglou <kkalpakloglou@yahoo.com>
Date: Fri, 16 Aug 2013 23:54:09 +0300
Subject: Update AddAnime.py

---
 youtube_dl/extractor/AddAnime.py | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/AddAnime.py b/youtube_dl/extractor/AddAnime.py
index 43b0b24fe..a312fa97e 100644
--- a/youtube_dl/extractor/AddAnime.py
+++ b/youtube_dl/extractor/AddAnime.py
@@ -1,11 +1,6 @@
 import re
 
 from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-)
-from bs4 import BeautifulSoup
-
 
 class AddAnimeIE(InfoExtractor):
 
@@ -17,7 +12,6 @@ class AddAnimeIE(InfoExtractor):
         u'md5': u'0813c2430bea7a46bf13acf3406992f4',
         u'info_dict': {
             u"description": u"One Piece 606", 
-            u"uploader": u"mugiwaraQ8", 
             u"title": u"One Piece 606"
         }
     }
@@ -31,24 +25,27 @@ class AddAnimeIE(InfoExtractor):
 
         webpage = self._download_webpage(url, video_id)
 
-        video_url = self._search_regex(r'var normal_video_file = "(.*?)",',
-            webpage, u'video URL')
+
+	def find_between( webpage, first, last ):
+  	    try:
+        	start = webpage.index( first ) + len( first )
+        	end = webpage.index( last, start )
+        	return webpage[start:end]
+    	    except ValueError:
+       		return ""
+
+	video_url = find_between( webpage, "var normal_video_file = '", "';" )
 
         video_title = self._og_search_title(webpage)
 
         video_description = self._og_search_description(webpage)
-        
-        soup = BeautifulSoup(webpage)
-        
-        video_uploader= soup.find("meta", {"author":""})['content']
 
         info = {
             'id':  video_id,
             'url': video_url,
             'ext': 'flv',
             'title': video_title,
-            'description': video_description,
-            'uploader': video_uploader
+            'description': video_description
         }
 
         return [info]
-- 
cgit v1.2.3


From 943f7f7a399c6fb3006eb2bd68070f28a272171f Mon Sep 17 00:00:00 2001
From: Pierre Rudloff <pierre@rudloff.pro>
Date: Sun, 18 Aug 2013 16:11:47 +0200
Subject: Download videos from jeuxvideo.com

---
 youtube_dl/extractor/__init__.py  |  1 +
 youtube_dl/extractor/jeuxvideo.py | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)
 create mode 100644 youtube_dl/extractor/jeuxvideo.py

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 84c02c2ed..b9bd3a429 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -36,6 +36,7 @@ from .ign import IGNIE, OneUPIE
 from .ina import InaIE
 from .infoq import InfoQIE
 from .instagram import InstagramIE
+from .jeuxvideo import JeuxVideoIE
 from .jukebox import JukeboxIE
 from .justintv import JustinTVIE
 from .kankan import KankanIE
diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py
new file mode 100644
index 000000000..d74a1c9b4
--- /dev/null
+++ b/youtube_dl/extractor/jeuxvideo.py
@@ -0,0 +1,33 @@
+import json
+import re
+
+from .common import InfoExtractor
+
+class JeuxVideoIE(InfoExtractor):
+    _VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)-\d+\.htm'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        title = re.match(self._VALID_URL, url).group(1)
+        webpage = self._download_webpage(url, title)
+        m_download = re.search(r'<param name="flashvars" value="config=(.*?)" />', webpage)
+
+        xml_link = m_download.group(1)
+        
+        id = re.search(r'http://www.jeuxvideo.com/config/\w+/0011/(.*?)/\d+_player\.xml', xml_link).group(1)
+
+        xml_config = self._download_webpage(xml_link, title,
+                                                  'Downloading XML config')
+        info = re.search(r'<format\.json>(.*?)</format\.json>',
+                         xml_config, re.MULTILINE|re.DOTALL).group(1)
+        info = json.loads(info)['versions'][0]
+        
+        video_url = 'http://video720.jeuxvideo.com/' + info['file']
+
+        track_info = {'id':id,
+                      'title' : title,
+                      'ext' :   'mp4',
+                      'url' :   video_url
+                      }
+
+        return [track_info]
-- 
cgit v1.2.3


From 7070b83687ed134af6d9a71bbf2ec759a56965d5 Mon Sep 17 00:00:00 2001
From: Pierre Rudloff <pierre@rudloff.pro>
Date: Thu, 22 Aug 2013 12:54:17 +0200
Subject: Merge remote-tracking branch 'upstream/master'

---
 youtube_dl/extractor/jeuxvideo.py | 1 -
 1 file changed, 1 deletion(-)

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py
index c8a8ae1b3..4327bc13d 100644
--- a/youtube_dl/extractor/jeuxvideo.py
+++ b/youtube_dl/extractor/jeuxvideo.py
@@ -31,7 +31,6 @@ class JeuxVideoIE(InfoExtractor):
 
         xml_config = self._download_webpage(xml_link, title,
                                                   'Downloading XML config')
-
         config = xml.etree.ElementTree.fromstring(xml_config.encode('utf-8'))
         info = re.search(r'<format\.json>(.*?)</format\.json>',
                          xml_config, re.MULTILINE|re.DOTALL).group(1)
-- 
cgit v1.2.3


From cd0abcc0bb4c218fd02850a139b626d252e22599 Mon Sep 17 00:00:00 2001
From: Pierre Rudloff <contact@rudloff.pro>
Date: Thu, 22 Aug 2013 13:54:23 +0200
Subject: Extractor for canalc2.tv

---
 youtube_dl/extractor/__init__.py |  1 +
 youtube_dl/extractor/canalc2.py  | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)
 create mode 100644 youtube_dl/extractor/canalc2.py

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 9d12608e1..576b8433a 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -7,6 +7,7 @@ from .bliptv import BlipTVIE, BlipTVUserIE
 from .breakcom import BreakIE
 from .brightcove import BrightcoveIE
 from .canalplus import CanalplusIE
+from .canalc2 import Canalc2IE
 from .collegehumor import CollegeHumorIE
 from .comedycentral import ComedyCentralIE
 from .condenast import CondeNastIE
diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py
new file mode 100644
index 000000000..d0e2ed536
--- /dev/null
+++ b/youtube_dl/extractor/canalc2.py
@@ -0,0 +1,37 @@
+# coding: utf-8
+"""Extractor for canalc2.tv"""
+import re
+import lxml.html
+
+from .common import InfoExtractor
+
+class Canalc2IE(InfoExtractor):
+    """Extractor for canalc2.tv"""
+    _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?idVideo=(\d+)&voir=oui'
+
+    _TEST = {
+        u'url': u'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui',
+        u'file': u'12163.mp4',
+        u'md5': u'c00fa80517373764ff5c0b5eb5a58780',
+        u'info_dict': {
+            u'title': u'Terrasses du Numérique'
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = re.match(self._VALID_URL, url).group(1)
+        webpage = self._download_webpage(url, video_id)
+        file_name = re.search(r"so\.addVariable\('file','(.*?)'\);",
+            webpage).group(1)
+        
+        video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name
+        
+        html   = lxml.html.fromstring(webpage)
+        
+        title = html.cssselect('.evenement8')[0].text_content()
+        
+        return {'id': video_id,
+                'ext' : 'mp4',
+                'url' : video_url,
+                'title' : title
+                }
-- 
cgit v1.2.3


From ff2424595adf02cbe5d1f1071e53c3b2e5f32c9e Mon Sep 17 00:00:00 2001
From: Pierre Rudloff <contact@rudloff.pro>
Date: Thu, 22 Aug 2013 14:47:51 +0200
Subject: lxml is not part of the standard library.

---
 youtube_dl/extractor/canalc2.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py
index d0e2ed536..215abf537 100644
--- a/youtube_dl/extractor/canalc2.py
+++ b/youtube_dl/extractor/canalc2.py
@@ -1,7 +1,6 @@
 # coding: utf-8
 """Extractor for canalc2.tv"""
 import re
-import lxml.html
 
 from .common import InfoExtractor
 
@@ -25,10 +24,9 @@ class Canalc2IE(InfoExtractor):
             webpage).group(1)
         
         video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name
-        
-        html   = lxml.html.fromstring(webpage)
-        
-        title = html.cssselect('.evenement8')[0].text_content()
+
+        title = self._html_search_regex(r'class="evenement8">(.*?)</a>',
+            webpage, u'title')
         
         return {'id': video_id,
                 'ext' : 'mp4',
-- 
cgit v1.2.3


From 341ca8d74c8f090bd696111353400f0cef2ba9bc Mon Sep 17 00:00:00 2001
From: Philipp Hagemeister <phihag@phihag.de>
Date: Tue, 27 Aug 2013 01:59:00 +0200
Subject: [trilulilu] Add support for trilulilu.ro

Fun fact: The ads (not yet supported) are loaded from youtube ;)
---
 youtube_dl/extractor/__init__.py  |  5 +--
 youtube_dl/extractor/trilulilu.py | 76 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+), 2 deletions(-)
 create mode 100644 youtube_dl/extractor/trilulilu.py

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index f71ae2713..fa53d9af9 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -6,6 +6,7 @@ from .bandcamp import BandcampIE
 from .bliptv import BlipTVIE, BlipTVUserIE
 from .breakcom import BreakIE
 from .brightcove import BrightcoveIE
+from .c56 import C56IE
 from .canalplus import CanalplusIE
 from .collegehumor import CollegeHumorIE
 from .comedycentral import ComedyCentralIE
@@ -73,18 +74,18 @@ from .ted import TEDIE
 from .tf1 import TF1IE
 from .thisav import ThisAVIE
 from .traileraddict import TrailerAddictIE
+from .trilulilu import TriluliluIE
 from .tudou import TudouIE
 from .tumblr import TumblrIE
 from .tutv import TutvIE
-from .ustream import UstreamIE
 from .unistra import UnistraIE
+from .ustream import UstreamIE
 from .vbox7 import Vbox7IE
 from .veoh import VeohIE
 from .vevo import VevoIE
 from .videofyme import VideofyMeIE
 from .vimeo import VimeoIE, VimeoChannelIE
 from .vine import VineIE
-from .c56 import C56IE
 from .wat import WatIE
 from .weibo import WeiboIE
 from .wimp import WimpIE
diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py
new file mode 100644
index 000000000..1c46156c7
--- /dev/null
+++ b/youtube_dl/extractor/trilulilu.py
@@ -0,0 +1,76 @@
+import json
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+)
+
+
+class TriluliluIE(InfoExtractor):
+    _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?trilulilu\.ro/video-(?P<category>[^/]+)/(?P<video_id>[^/]+)'
+    _TEST = {
+        u"url": u"http://www.trilulilu.ro/video-animatie/big-buck-bunny-1",
+        u'file': u"big-buck-bunny-1.mp4",
+        u'info_dict': {
+            u"title": u"Big Buck Bunny",
+            u"description": u":) pentru copilul din noi",
+        },
+        # Server ignores Range headers (--test)
+        u"params": {
+            u"skip_download": True
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('video_id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._og_search_title(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+        description = self._og_search_description(webpage)
+
+        log_str = self._search_regex(
+            r'block_flash_vars[ ]=[ ]({[^}]+})', webpage, u'log info')
+        log = json.loads(log_str)
+
+        format_url = (u'http://fs%(server)s.trilulilu.ro/%(hash)s/'
+                      u'video-formats2' % log)
+        format_str = self._download_webpage(
+            format_url, video_id,
+            note=u'Downloading formats',
+            errnote=u'Error while downloading formats')
+
+        format_doc = xml.etree.ElementTree.fromstring(format_str)
+ 
+        video_url_template = (
+            u'http://fs%(server)s.trilulilu.ro/stream.php?type=video'
+            u'&source=site&hash=%(hash)s&username=%(userid)s&'
+            u'key=ministhebest&format=%%s&sig=&exp=' %
+            log)
+        formats = [
+            {
+                'format': fnode.text,
+                'url': video_url_template % fnode.text,
+            }
+
+            for fnode in format_doc.findall('./formats/format')
+        ]
+
+        info = {
+            '_type': 'video',
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+        }
+
+        # TODO: Remove when #980 has been merged
+        info['url'] = formats[-1]['url']
+        info['ext'] = formats[-1]['format'].partition('-')[0]
+
+        return info
-- 
cgit v1.2.3


From 069d098f846ca53073ec646f335f77dac4439844 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?=
 <jaime.marquinez.ferrandiz@gmail.com>
Date: Tue, 27 Aug 2013 10:21:57 +0200
Subject: [canalplus] Accept player.canalplus.fr urls

---
 youtube_dl/extractor/canalplus.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py
index 3b1c88876..1f02519a0 100644
--- a/youtube_dl/extractor/canalplus.py
+++ b/youtube_dl/extractor/canalplus.py
@@ -5,7 +5,7 @@ from .common import InfoExtractor
 from ..utils import unified_strdate
 
 class CanalplusIE(InfoExtractor):
-    _VALID_URL = r'https?://www\.canalplus\.fr/.*?\?vid=(?P<id>\d+)'
+    _VALID_URL = r'https?://(www\.canalplus\.fr/.*?\?vid=|player\.canalplus\.fr/#/)(?P<id>\d+)'
     _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s'
     IE_NAME = u'canalplus.fr'
 
-- 
cgit v1.2.3


From 2a7b4da9b2ee11e88976e0e93796fd8460aa053d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?=
 <jaime.marquinez.ferrandiz@gmail.com>
Date: Tue, 27 Aug 2013 10:25:38 +0200
Subject: [hark] get the song info in JSON and extract more information.

---
 youtube_dl/extractor/hark.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/hark.py b/youtube_dl/extractor/hark.py
index ab0a69697..5bdd08afa 100644
--- a/youtube_dl/extractor/hark.py
+++ b/youtube_dl/extractor/hark.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 
 import re
+import json
 
 from .common import InfoExtractor
 from ..utils import determine_ext
@@ -12,24 +13,25 @@ class HarkIE(InfoExtractor):
         u'file': u'mmbzyhkgny.mp3',
         u'md5': u'6783a58491b47b92c7c1af5a77d4cbee',
         u'info_dict': {
-            u"title": u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' On May 23, 2013 ",
+            u'title': u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' on May 23, 2013",
+            u'description': u'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.',
+            u'duration': 11,
         }
     }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group(1)
-        embed_url = "http://www.hark.com/clips/%s/homepage_embed" %(video_id)
-        webpage = self._download_webpage(embed_url, video_id)
-
-        final_url = self._search_regex(r'src="(.+?).mp3"',
-                                webpage, 'video url')+'.mp3'
-        title = self._html_search_regex(r'<title>(.+?)</title>',
-                                webpage, 'video title').replace(' Sound Clip and Quote - Hark','').replace(
-                                'Sound Clip , Quote, MP3, and Ringtone - Hark','')
+        json_url = "http://www.hark.com/clips/%s.json" %(video_id)
+        info_json = self._download_webpage(json_url, video_id)
+        info = json.loads(info_json)
+        final_url = info['url']
 
         return {'id': video_id,
                 'url' : final_url,
-                'title': title,
+                'title': info['name'],
                 'ext': determine_ext(final_url),
+                'description': info['description'],
+                'thumbnail': info['image_original'],
+                'duration': info['duration'],
                 }
-- 
cgit v1.2.3


From e86ea47c029c1f95a696e43df7bea2e3e617fbc3 Mon Sep 17 00:00:00 2001
From: Philipp Hagemeister <phihag@phihag.de>
Date: Tue, 27 Aug 2013 10:35:20 +0200
Subject: [canalc2] Small improvements

---
 youtube_dl/extractor/canalc2.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py
index 215abf537..50832217a 100644
--- a/youtube_dl/extractor/canalc2.py
+++ b/youtube_dl/extractor/canalc2.py
@@ -1,17 +1,17 @@
 # coding: utf-8
-"""Extractor for canalc2.tv"""
 import re
 
 from .common import InfoExtractor
 
+
 class Canalc2IE(InfoExtractor):
-    """Extractor for canalc2.tv"""
+    _IE_NAME = 'canalc2.tv'
     _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?idVideo=(\d+)&voir=oui'
 
     _TEST = {
         u'url': u'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui',
         u'file': u'12163.mp4',
-        u'md5': u'c00fa80517373764ff5c0b5eb5a58780',
+        u'md5': u'060158428b650f896c542dfbb3d6487f',
         u'info_dict': {
             u'title': u'Terrasses du Numérique'
         }
@@ -20,16 +20,16 @@ class Canalc2IE(InfoExtractor):
     def _real_extract(self, url):
         video_id = re.match(self._VALID_URL, url).group(1)
         webpage = self._download_webpage(url, video_id)
-        file_name = re.search(r"so\.addVariable\('file','(.*?)'\);",
-            webpage).group(1)
-        
+        file_name = self._search_regex(
+            r"so\.addVariable\('file','(.*?)'\);",
+            webpage, 'file name')
         video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name
 
-        title = self._html_search_regex(r'class="evenement8">(.*?)</a>',
-            webpage, u'title')
+        title = self._html_search_regex(
+            r'class="evenement8">(.*?)</a>', webpage, u'title')
         
         return {'id': video_id,
-                'ext' : 'mp4',
-                'url' : video_url,
-                'title' : title
+                'ext': 'mp4',
+                'url': video_url,
+                'title': title,
                 }
-- 
cgit v1.2.3


From 1a582dd49d628914fa6a056b490914738f15c56d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?=
 <jaime.marquinez.ferrandiz@gmail.com>
Date: Tue, 27 Aug 2013 11:56:48 +0200
Subject: Add an extractor for CNN (closes #1318)

---
 youtube_dl/extractor/__init__.py |  1 +
 youtube_dl/extractor/cnn.py      | 47 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+)
 create mode 100644 youtube_dl/extractor/cnn.py

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index eeeb3db50..ea2af0d0e 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -9,6 +9,7 @@ from .brightcove import BrightcoveIE
 from .c56 import C56IE
 from .canalplus import CanalplusIE
 from .canalc2 import Canalc2IE
+from .cnn import CNNIE
 from .collegehumor import CollegeHumorIE
 from .comedycentral import ComedyCentralIE
 from .condenast import CondeNastIE
diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py
new file mode 100644
index 000000000..cee78765b
--- /dev/null
+++ b/youtube_dl/extractor/cnn.py
@@ -0,0 +1,47 @@
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import determine_ext
+
+class CNNIE(InfoExtractor):
+    _VALID_URL = r'https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/(?P<path>.+?/(?P<title>[^/]+?)\.cnn)'
+
+    _TEST = {
+        u'url': u'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
+        u'file': u'sports_2013_06_09_nadal-1-on-1.cnn.mp4',
+        u'md5': u'3e6121ea48df7e2259fe73a0628605c4',
+        u'info_dict': {
+            u'title': u'Nadal wins 8th French Open title',
+            u'description': u'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        path = mobj.group('path')
+        page_title = mobj.group('title')
+        info_xml = self._download_webpage(
+            'http://cnn.com/video/data/3.0/%s/index.xml' % path, page_title)
+        info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
+
+        formats = []
+        for f in info.findall('files/file'):
+            mf = re.match(r'(\d+)x(\d+)(?:_(.*)k)?',f.attrib['bitrate'])
+            if mf is not None:
+                formats.append((int(mf.group(1)), int(mf.group(2)), int(mf.group(3) or 0), f.text))
+        formats = sorted(formats)
+        (_,_,_, video_path) = formats[-1]
+        video_url = 'http://ht.cdn.turner.com/cnn/big%s' % video_path
+
+        thumbnails = sorted([((int(t.attrib['height']),int(t.attrib['width'])), t.text) for t in info.findall('images/image')])
+        thumbs_dict = [{'resolution': res, 'url': t_url} for (res, t_url) in thumbnails]
+
+        return {'id': info.attrib['id'],
+                'title': info.find('headline').text,
+                'url': video_url,
+                'ext': determine_ext(video_url),
+                'thumbnail': thumbnails[-1][1],
+                'thumbnails': thumbs_dict,
+                'description': info.find('description').text,
+                }
-- 
cgit v1.2.3


From 0bc56fa66a4b0f1b6bf827bd3550a119d3e3b231 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?=
 <jaime.marquinez.ferrandiz@gmail.com>
Date: Tue, 27 Aug 2013 12:38:30 +0200
Subject: Add an extractor for NBC News (closes #1320)

---
 youtube_dl/extractor/__init__.py |  1 +
 youtube_dl/extractor/nbc.py      | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)
 create mode 100644 youtube_dl/extractor/nbc.py

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index ea2af0d0e..27bbcc0f7 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -54,6 +54,7 @@ from .muzu import MuzuTVIE
 from .myspass import MySpassIE
 from .myvideo import MyVideoIE
 from .nba import NBAIE
+from .nbc import NBCNewsIE
 from .ooyala import OoyalaIE
 from .pbs import PBSIE
 from .photobucket import PhotobucketIE
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
new file mode 100644
index 000000000..3bc9dae6d
--- /dev/null
+++ b/youtube_dl/extractor/nbc.py
@@ -0,0 +1,33 @@
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import find_xpath_attr, compat_str
+
+
+class NBCNewsIE(InfoExtractor):
+    _VALID_URL = r'https?://www\.nbcnews\.com/video/.+?/(?P<id>\d+)'
+
+    _TEST = {
+        u'url': u'http://www.nbcnews.com/video/nbc-news/52753292',
+        u'file': u'52753292.flv',
+        u'md5': u'47abaac93c6eaf9ad37ee6c4463a5179',
+        u'info_dict': {
+            u'title': u'Crew emerges after four-month Mars food study',
+            u'description': u'md5:24e632ffac72b35f8b67a12d1b6ddfc1',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        info_xml = self._download_webpage('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
+        info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')).find('video')
+
+        return {'id': video_id,
+                'title': info.find('headline').text,
+                'ext': 'flv',
+                'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text,
+                'description': compat_str(info.find('caption').text),
+                'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text,
+                }
-- 
cgit v1.2.3


From 7f3c4f4f65ddb4f8374b31b74428780e60a373de Mon Sep 17 00:00:00 2001
From: Jeff Smith <whydoubt@yahoo.com>
Date: Tue, 27 Aug 2013 14:38:50 -0500
Subject: Initial slash in Google+ photos link was removed

---
 youtube_dl/extractor/googleplus.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py
index 9f7fc19a4..f1cd88983 100644
--- a/youtube_dl/extractor/googleplus.py
+++ b/youtube_dl/extractor/googleplus.py
@@ -57,8 +57,8 @@ class GooglePlusIE(InfoExtractor):
             webpage, 'title', default=u'NA')
 
         # Step 2, Simulate clicking the image box to launch video
-        DOMAIN = 'https://plus.google.com'
-        video_page = self._search_regex(r'<a href="((?:%s)?/photos/.*?)"' % re.escape(DOMAIN),
+        DOMAIN = 'https://plus.google.com/'
+        video_page = self._search_regex(r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN),
             webpage, u'video page URL')
         if not video_page.startswith(DOMAIN):
             video_page = DOMAIN + video_page
-- 
cgit v1.2.3


From 273f603efb2028a54e04cca314b72bc2a9d767ef Mon Sep 17 00:00:00 2001
From: Philipp Hagemeister <phihag@phihag.de>
Date: Wed, 28 Aug 2013 00:14:19 +0200
Subject: [cnn] Allow more URLs

---
 youtube_dl/extractor/cnn.py | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py
index cee78765b..4338bd180 100644
--- a/youtube_dl/extractor/cnn.py
+++ b/youtube_dl/extractor/cnn.py
@@ -4,10 +4,12 @@ import xml.etree.ElementTree
 from .common import InfoExtractor
 from ..utils import determine_ext
 
+
 class CNNIE(InfoExtractor):
-    _VALID_URL = r'https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/(?P<path>.+?/(?P<title>[^/]+?)\.cnn)'
+    _VALID_URL = r'''(?x)https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/
+        (?P<path>.+?/(?P<title>[^/]+?)(?:\.cnn|(?=&)))'''
 
-    _TEST = {
+    _TESTS = [{
         u'url': u'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
         u'file': u'sports_2013_06_09_nadal-1-on-1.cnn.mp4',
         u'md5': u'3e6121ea48df7e2259fe73a0628605c4',
@@ -15,14 +17,24 @@ class CNNIE(InfoExtractor):
             u'title': u'Nadal wins 8th French Open title',
             u'description': u'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.',
         },
-    }
+    },
+    {
+        u"url": u"http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29",
+        u"file": u"us_2013_08_21_sot-student-gives-epic-speech.georgia-institute-of-technology.mp4",
+        u"md5": u"b5cc60c60a3477d185af8f19a2a26f4e",
+        u"info_dict": {
+            u"title": "Student's epic speech stuns new freshmen",
+            u"description": "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\""
+        }
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         path = mobj.group('path')
         page_title = mobj.group('title')
-        info_xml = self._download_webpage(
-            'http://cnn.com/video/data/3.0/%s/index.xml' % path, page_title)
+        info_url = u'http://cnn.com/video/data/3.0/%s/index.xml' % path
+        print(info_url)
+        info_xml = self._download_webpage(info_url, page_title)
         info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
 
         formats = []
-- 
cgit v1.2.3


From 44586389e4676dfd926255cf76e36684dcf4742d Mon Sep 17 00:00:00 2001
From: Philipp Hagemeister <phihag@phihag.de>
Date: Wed, 28 Aug 2013 02:18:44 +0200
Subject: [appletrailers] Add support

---
 youtube_dl/extractor/__init__.py      |   1 +
 youtube_dl/extractor/appletrailers.py | 167 ++++++++++++++++++++++++++++++++++
 2 files changed, 168 insertions(+)
 create mode 100644 youtube_dl/extractor/appletrailers.py

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 27bbcc0f7..2f86f2aca 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -1,3 +1,4 @@
+from .appletrailers import AppleTrailersIE
 from .archiveorg import ArchiveOrgIE
 from .ard import ARDIE
 from .arte import ArteTvIE
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
new file mode 100644
index 000000000..7d126e2d2
--- /dev/null
+++ b/youtube_dl/extractor/appletrailers.py
@@ -0,0 +1,167 @@
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+)
+
+
+class AppleTrailersIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?trailers.apple.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)'
+    _TEST = {
+        u"url": u"http://trailers.apple.com/trailers/wb/manofsteel/",
+        u"playlist": [
+            {
+                u"file": u"manofsteel-trailer4.mov",
+                u"md5": u"11874af099d480cc09e103b189805d5f",
+                u"info_dict": {
+                    u"duration": 111,
+                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_11624.jpg",
+                    u"title": u"Trailer 4",
+                    u"upload_date": u"20130523",
+                    u"uploader_id": u"wb",
+                },
+            },
+            {
+                u"file": u"manofsteel-trailer3.mov",
+                u"md5": u"07a0a262aae5afe68120eed61137ab34",
+                u"info_dict": {
+                    u"duration": 182,
+                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_10793.jpg",
+                    u"title": u"Trailer 3",
+                    u"upload_date": u"20130417",
+                    u"uploader_id": u"wb",
+                },
+            },
+            {
+                u"file": u"manofsteel-trailer.mov",
+                u"md5": u"e401fde0813008e3307e54b6f384cff1",
+                u"info_dict": {
+                    u"duration": 148,
+                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_8703.jpg",
+                    u"title": u"Trailer",
+                    u"upload_date": u"20121212",
+                    u"uploader_id": u"wb",
+                },
+            },
+            {
+                u"file": u"manofsteel-teaser.mov",
+                u"md5": u"76b392f2ae9e7c98b22913c10a639c97",
+                u"info_dict": {
+                    u"duration": 93,
+                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_6899.jpg",
+                    u"title": u"Teaser",
+                    u"upload_date": u"20120721",
+                    u"uploader_id": u"wb",
+                },
+            }
+        ]
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        movie = mobj.group('movie')
+        uploader_id = mobj.group('company')
+
+        playlist_url = url.partition(u'?')[0] + u'/includes/playlists/web.inc'
+        playlist_snippet = self._download_webpage(playlist_url, movie)
+        playlist_cleaned = re.sub(r'(?s)<script>.*?</script>', u'', playlist_snippet)
+        playlist_html = u'<html>' + playlist_cleaned + u'</html>'
+
+        size_cache = {}
+
+        doc = xml.etree.ElementTree.fromstring(playlist_html)
+        playlist = []
+        for li in doc.findall('./div/ul/li'):
+            title = li.find('.//h3').text
+            video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()
+            thumbnail = li.find('.//img').attrib['src']
+
+            date_el = li.find('.//p')
+            upload_date = None
+            m = re.search(r':\s?(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<year>[0-9]{2})', date_el.text)
+            if m:
+                upload_date = u'20' + m.group('year') + m.group('month') + m.group('day')
+            runtime_el = date_el.find('./br')
+            m = re.search(r':\s?(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime_el.tail)
+            duration = None
+            if m:
+                duration = 60 * int(m.group('minutes')) + int(m.group('seconds'))
+
+            formats = []
+            for formats_el in li.findall('.//li/a'):
+                if formats_el.attrib['class'] != 'OverlayPanel':
+                    continue
+                target = formats_el.attrib['target']
+
+                format_code = formats_el.text
+                if 'Automatic' in format_code:
+                    continue
+
+                size_q = formats_el.attrib['href']
+                size_id = size_q.rpartition('#videos-')[2]
+                if size_id not in size_cache:
+                    size_url = url + size_q
+                    sizepage_html = self._download_webpage(
+                        size_url, movie,
+                        note=u'Downloading size info %s' % size_id,
+                        errnote=u'Error while downloading size info %s' % size_id,
+                    )
+                    _doc = xml.etree.ElementTree.fromstring(sizepage_html)
+                    size_cache[size_id] = _doc
+
+                sizepage_doc = size_cache[size_id]
+                links = sizepage_doc.findall('.//{http://www.w3.org/1999/xhtml}ul/{http://www.w3.org/1999/xhtml}li/{http://www.w3.org/1999/xhtml}a')
+                for vid_a in links:
+                    href = vid_a.get('href')
+                    if not href.endswith(target):
+                        continue
+                    detail_q = href.partition('#')[0]
+                    detail_url = url + '/' + detail_q
+
+                    m = re.match(r'includes/(?P<detail_id>[^/]+)/', detail_q)
+                    detail_id = m.group('detail_id')
+
+                    detail_html = self._download_webpage(
+                        detail_url, movie,
+                        note=u'Downloading detail %s %s' % (detail_id, size_id),
+                        errnote=u'Error while downloading detail %s %s' % (detail_id, size_id)
+                    )
+                    detail_doc = xml.etree.ElementTree.fromstring(detail_html)
+                    movie_link_el = detail_doc.find('.//{http://www.w3.org/1999/xhtml}a')
+                    assert movie_link_el.get('class') == 'movieLink'
+                    movie_link = movie_link_el.get('href').partition('?')[0].replace('_', '_h')
+                    ext = determine_ext(movie_link)
+                    assert ext == 'mov'
+
+                    formats.append({
+                        'format': format_code,
+                        'ext': ext,
+                        'url': movie_link,
+                    })
+
+            info = {
+                '_type': 'video',
+                'id': video_id,
+                'title': title,
+                'formats': formats,
+                'title': title,
+                'duration': duration,
+                'thumbnail': thumbnail,
+                'upload_date': upload_date,
+                'uploader_id': uploader_id,
+                'user_agent': 'QuickTime compatible (youtube-dl)',
+            }
+            # TODO: Remove when #980 has been merged
+            info['url'] = formats[-1]['url']
+            info['ext'] = formats[-1]['ext']
+
+            playlist.append(info)
+
+        return {
+            '_type': 'playlist',
+            'id': movie,
+            'entries': playlist,
+        }
-- 
cgit v1.2.3


From a1bb0f8773e0fff787ffe7bd1729073f3385d2ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?=
 <jaime.marquinez.ferrandiz@gmail.com>
Date: Wed, 28 Aug 2013 10:20:37 +0200
Subject: [cnn] remove debug print call.

---
 youtube_dl/extractor/cnn.py | 1 -
 1 file changed, 1 deletion(-)

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py
index 4338bd180..a79f881cd 100644
--- a/youtube_dl/extractor/cnn.py
+++ b/youtube_dl/extractor/cnn.py
@@ -33,7 +33,6 @@ class CNNIE(InfoExtractor):
         path = mobj.group('path')
         page_title = mobj.group('title')
         info_url = u'http://cnn.com/video/data/3.0/%s/index.xml' % path
-        print(info_url)
         info_xml = self._download_webpage(info_url, page_title)
         info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
 
-- 
cgit v1.2.3


From 3e223834d9f358bc7cb1c3748dc63d1ab40d9b87 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?=
 <jaime.marquinez.ferrandiz@gmail.com>
Date: Wed, 28 Aug 2013 10:26:44 +0200
Subject: [youtube] update algo for length 88, thanks to @Ramhack (fixes #1328)

---
 youtube_dl/extractor/youtube.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index af01c9da0..8e486afd0 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -419,7 +419,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         elif len(s) == 89:
             return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
         elif len(s) == 88:
-            return s[48] + s[81:67:-1] + s[82] + s[66:62:-1] + s[85] + s[61:48:-1] + s[67] + s[47:12:-1] + s[3] + s[11:3:-1] + s[2] + s[12]
+            return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
         elif len(s) == 87:
             return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
         elif len(s) == 86:
-- 
cgit v1.2.3


From 4f5f18acb93ea2bf70f80c7f76e6bb6b8dee3fbf Mon Sep 17 00:00:00 2001
From: Philipp Hagemeister <phihag@phihag.de>
Date: Wed, 28 Aug 2013 10:28:16 +0200
Subject: [addanime] add file

---
 youtube_dl/extractor/addanime.py | 76 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 youtube_dl/extractor/addanime.py

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py
new file mode 100644
index 000000000..46db8262f
--- /dev/null
+++ b/youtube_dl/extractor/addanime.py
@@ -0,0 +1,76 @@
+import ast
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_HTTPError,
+    compat_str,
+    compat_urllib_parse,
+    compat_urllib_parse_urlparse,
+
+    ExtractorError,
+)
+
+
+class AddAnimeIE(InfoExtractor):
+
+    _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)'
+    IE_NAME = u'AddAnime'
+    _TEST = {
+        u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9',
+        u'file': u'24MR3YO5SAS9.flv',
+        u'md5': u'1036a0e0cd307b95bd8a8c3a5c8cfaf1',
+        u'info_dict': {
+            u"description": u"One Piece 606",
+            u"title": u"One Piece 606"
+        }
+    }
+
+    def _real_extract(self, url):
+        try:
+            mobj = re.match(self._VALID_URL, url)
+            video_id = mobj.group('video_id')
+            webpage = self._download_webpage(url, video_id)
+        except ExtractorError as ee:
+            if not isinstance(ee.cause, compat_HTTPError):
+                raise
+
+            redir_webpage = ee.cause.read().decode('utf-8')
+            action = self._search_regex(
+                r'<form id="challenge-form" action="([^"]+)"',
+                redir_webpage, u'Redirect form')
+            vc = self._search_regex(
+                r'<input type="hidden" name="jschl_vc" value="([^"]+)"/>',
+                redir_webpage, u'redirect vc value')
+            av = re.search(
+                r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);',
+                redir_webpage)
+            if av is None:
+                raise ExtractorError(u'Cannot find redirect math task')
+            av_res = int(av.group(1)) + int(av.group(2)) * int(av.group(3))
+
+            parsed_url = compat_urllib_parse_urlparse(url)
+            av_val = av_res + len(parsed_url.netloc)
+            confirm_url = (
+                parsed_url.scheme + u'://' + parsed_url.netloc +
+                action + '?' +
+                compat_urllib_parse.urlencode({
+                    'jschl_vc': vc, 'jschl_answer': compat_str(av_val)}))
+            self._download_webpage(
+                confirm_url, video_id,
+                note=u'Confirming after redirect')
+            webpage = self._download_webpage(url, video_id)
+
+        video_url = self._search_regex(r"var normal_video_file = '(.*?)';",
+                                       webpage, u'video file URL')
+        video_title = self._og_search_title(webpage)
+        video_description = self._og_search_description(webpage)
+
+        return {
+            '_type': 'video',
+            'id':  video_id,
+            'url': video_url,
+            'ext': 'flv',
+            'title': video_title,
+            'description': video_description
+        }
-- 
cgit v1.2.3


From ae3531adf926998d42d1fb52453491c85e33b5f0 Mon Sep 17 00:00:00 2001
From: Philipp Hagemeister <phihag@phihag.de>
Date: Wed, 28 Aug 2013 12:04:44 +0200
Subject: [generic] Fix URL concatenation

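When the page URL is something like http://example.org/foo/bar?x=y and the extracted video path is the relative file/video.mp4, the result should be http://example.org/foo/file/video.mp4 (resolved against the page's directory rather than appended to the full URL)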
Fixes #1268.
---
 youtube_dl/extractor/generic.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index d034a11bb..bfc9bff49 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -166,7 +166,12 @@ class GenericIE(InfoExtractor):
         if video_url.startswith('//'):
             video_url = compat_urllib_parse_urlparse(url).scheme + ':' + video_url
         if '://' not in video_url:
-            video_url = url + ('' if url.endswith('/') else '/') + video_url
+            up = compat_urllib_parse_urlparse(url)
+            if video_url.startswith('/'):
+                video_url = up.scheme + '://' + up.netloc + video_url
+            else:  # relative path
+                video_url = (up.scheme + '://' + up.netloc +
+                             up.path.rpartition('/')[0] + '/' + video_url)
         video_id = os.path.basename(video_url)
 
         # here's a fun little line of code for you:
-- 
cgit v1.2.3


From a5caba1eb02665cdc982d6be4a933aafd79243de Mon Sep 17 00:00:00 2001
From: Philipp Hagemeister <phihag@phihag.de>
Date: Wed, 28 Aug 2013 12:47:27 +0200
Subject: [generic] simply use urljoin

---
 youtube_dl/extractor/generic.py | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index bfc9bff49..dc4dea4ad 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -7,8 +7,8 @@ from .common import InfoExtractor
 from ..utils import (
     compat_urllib_error,
     compat_urllib_parse,
-    compat_urllib_parse_urlparse,
     compat_urllib_request,
+    compat_urlparse,
 
     ExtractorError,
 )
@@ -163,15 +163,7 @@ class GenericIE(InfoExtractor):
             raise ExtractorError(u'Invalid URL: %s' % url)
 
         video_url = compat_urllib_parse.unquote(mobj.group(1))
-        if video_url.startswith('//'):
-            video_url = compat_urllib_parse_urlparse(url).scheme + ':' + video_url
-        if '://' not in video_url:
-            up = compat_urllib_parse_urlparse(url)
-            if video_url.startswith('/'):
-                video_url = up.scheme + '://' + up.netloc + video_url
-            else:  # relative path
-                video_url = (up.scheme + '://' + up.netloc +
-                             up.path.rpartition('/')[0] + '/' + video_url)
+        video_url = compat_urlparse.urljoin(url, video_url)
         video_id = os.path.basename(video_url)
 
         # here's a fun little line of code for you:
-- 
cgit v1.2.3


From ce6a696e4d964aeb27de46a31a899b28d7ca7754 Mon Sep 17 00:00:00 2001
From: Philipp Hagemeister <phihag@phihag.de>
Date: Wed, 28 Aug 2013 12:47:38 +0200
Subject: Remove unused imports

---
 youtube_dl/extractor/addanime.py      | 1 -
 youtube_dl/extractor/appletrailers.py | 1 -
 youtube_dl/extractor/trilulilu.py     | 3 ---
 youtube_dl/extractor/wat.py           | 1 -
 4 files changed, 6 deletions(-)

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py
index 46db8262f..82a785a19 100644
--- a/youtube_dl/extractor/addanime.py
+++ b/youtube_dl/extractor/addanime.py
@@ -1,4 +1,3 @@
-import ast
 import re
 
 from .common import InfoExtractor
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
index b3bdb2955..8b191c196 100644
--- a/youtube_dl/extractor/appletrailers.py
+++ b/youtube_dl/extractor/appletrailers.py
@@ -4,7 +4,6 @@ import xml.etree.ElementTree
 from .common import InfoExtractor
 from ..utils import (
     determine_ext,
-    ExtractorError,
 )
 
 
diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py
index 1c46156c7..f278951ba 100644
--- a/youtube_dl/extractor/trilulilu.py
+++ b/youtube_dl/extractor/trilulilu.py
@@ -3,9 +3,6 @@ import re
 import xml.etree.ElementTree
 
 from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-)
 
 
 class TriluliluIE(InfoExtractor):
diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py
index 7d228edac..29c25f0e3 100644
--- a/youtube_dl/extractor/wat.py
+++ b/youtube_dl/extractor/wat.py
@@ -6,7 +6,6 @@ import re
 from .common import InfoExtractor
 
 from ..utils import (
-    compat_urllib_parse,
     unified_strdate,
 )
 
-- 
cgit v1.2.3


From 67b22dd03686d9e360d87a7751de74b321d3f231 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?=
 <jaime.marquinez.ferrandiz@gmail.com>
Date: Wed, 28 Aug 2013 12:51:22 +0200
Subject: Add extractors for video.mit.edu and techtv.mit.edu (closes #1327)

video.mit.edu just embeds the videos from techtv.mit.edu
---
 youtube_dl/extractor/__init__.py |  1 +
 youtube_dl/extractor/mit.py      | 76 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+)
 create mode 100644 youtube_dl/extractor/mit.py

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index c76b99a81..21e9e5d37 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -50,6 +50,7 @@ from .keek import KeekIE
 from .liveleak import LiveLeakIE
 from .livestream import LivestreamIE
 from .metacafe import MetacafeIE
+from .mit import TechTVMITIE, MITIE
 from .mixcloud import MixcloudIE
 from .mtv import MTVIE
 from .muzu import MuzuTVIE
diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py
new file mode 100644
index 000000000..d09d03e36
--- /dev/null
+++ b/youtube_dl/extractor/mit.py
@@ -0,0 +1,76 @@
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    get_element_by_id,
+)
+
+
+class TechTVMITIE(InfoExtractor):
+    IE_NAME = u'techtv.mit.edu'
+    _VALID_URL = r'https?://techtv\.mit\.edu/(videos|embeds)/(?P<id>\d+)'
+
+    _TEST = {
+        u'url': u'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
+        u'file': u'25418.mp4',
+        u'md5': u'1f8cb3e170d41fd74add04d3c9330e5f',
+        u'info_dict': {
+            u'title': u'MIT DNA Learning Center Set',
+            u'description': u'md5:82313335e8a8a3f243351ba55bc1b474',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(
+            'http://techtv.mit.edu/videos/%s' % video_id, video_id)
+        embed_page = self._download_webpage(
+            'http://techtv.mit.edu/embeds/%s/' % video_id, video_id,
+            note=u'Downloading embed page')
+
+        base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)',
+            embed_page, u'base url')
+        formats_json = self._search_regex(r'bitrates: (\[.+?\])', embed_page,
+            u'video formats')
+        formats = json.loads(formats_json)
+        formats = sorted(formats, key=lambda f: f['bitrate'])
+
+        title = get_element_by_id('edit-title', webpage)
+        description = clean_html(get_element_by_id('edit-description', webpage))
+        thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'',
+            embed_page, u'thumbnail', flags=re.DOTALL)
+
+        return {'id': video_id,
+                'title': title,
+                'url': base_url + formats[-1]['url'].replace('mp4:', ''),
+                'ext': 'mp4',
+                'description': description,
+                'thumbnail': thumbnail,
+                }
+
+
+class MITIE(TechTVMITIE):
+    IE_NAME = u'video.mit.edu'
+    _VALID_URL = r'https?://video\.mit\.edu/watch/(?P<title>[^/]+)'
+
+    _TEST = {
+        u'url': u'http://video.mit.edu/watch/the-government-is-profiling-you-13222/',
+        u'file': u'21783.mp4',
+        u'md5': u'7db01d5ccc1895fc5010e9c9e13648da',
+        u'info_dict': {
+            u'title': u'The Government is Profiling You',
+            u'description': u'md5:ad5795fe1e1623b73620dbfd47df9afd',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        page_title = mobj.group('title')
+        webpage = self._download_webpage(url, page_title)
+        self.to_screen('%s: Extracting %s url' % (page_title, TechTVMITIE.IE_NAME))
+        embed_url = self._search_regex(r'<iframe .*?src="(.+?)"', webpage,
+            u'embed url')
+        return self.url_result(embed_url, ie='TechTVMIT')
-- 
cgit v1.2.3


From f143d86ad2fc0633d8e2da598cf21e73ff0f2872 Mon Sep 17 00:00:00 2001
From: Philipp Hagemeister <phihag@phihag.de>
Date: Wed, 28 Aug 2013 13:59:08 +0200
Subject: [sohu] Handle encoding, and fix tests

---
 youtube_dl/extractor/common.py |   9 ++-
 youtube_dl/extractor/sohu.py   | 131 ++++++++++++++++++++---------------------
 2 files changed, 71 insertions(+), 69 deletions(-)

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 77a13aea5..a2986cebe 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -145,12 +145,17 @@ class InfoExtractor(object):
 
         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
         content_type = urlh.headers.get('Content-Type', '')
+        webpage_bytes = urlh.read()
         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
         if m:
             encoding = m.group(1)
         else:
-            encoding = 'utf-8'
-        webpage_bytes = urlh.read()
+            m = re.search(br'<meta[^>]+charset="?([^"]+)[ /">]',
+                          webpage_bytes[:1024])
+            if m:
+                encoding = m.group(1).decode('ascii')
+            else:
+                encoding = 'utf-8'
         if self._downloader.params.get('dump_intermediate_pages', False):
             try:
                 url = url_or_request.get_full_url()
diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py
index 24fc3a5d7..77bb0a8dc 100644
--- a/youtube_dl/extractor/sohu.py
+++ b/youtube_dl/extractor/sohu.py
@@ -1,13 +1,10 @@
 # encoding: utf-8
 
-import re
 import json
-import time
-import logging
-import urllib2
+import re
 
 from .common import InfoExtractor
-from ..utils import compat_urllib_request, clean_html
+from ..utils import ExtractorError
 
 
 class SohuIE(InfoExtractor):
@@ -15,79 +12,79 @@ class SohuIE(InfoExtractor):
 
     _TEST = {
         u'url': u'http://tv.sohu.com/20130724/n382479172.shtml#super',
-        u'file': u'382479172.flv',
-        u'md5': u'cc84eed6b6fbf0f2f9a8d3cb9da1939b',
+        u'file': u'382479172.mp4',
+        u'md5': u'bde8d9a6ffd82c63a1eefaef4eeefec7',
         u'info_dict': {
-            u'title': u'The Illest - Far East Movement Riff Raff',
+            u'title': u'MV：Far East Movement《The Illest》',
         },
     }
 
-
     def _real_extract(self, url):
+
+        def _fetch_data(vid_id):
+            base_data_url = u'http://hot.vrs.sohu.com/vrs_flash.action?vid='
+            data_url = base_data_url + str(vid_id)
+            data_json = self._download_webpage(
+                data_url, video_id,
+                note=u'Downloading JSON data for ' + str(vid_id))
+            return json.loads(data_json)
+
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
+
         webpage = self._download_webpage(url, video_id)
-        pattern = r'<title>(.+?)</title>'
-        compiled = re.compile(pattern, re.DOTALL)
-        title = self._search_regex(compiled, webpage, u'video title')
-        title = clean_html(title).split('-')[0].strip()
-        self.to_screen('Title: %s' % title)
-        pattern = re.compile(r'var vid="(\d+)"')
-        result = re.search(pattern, webpage)
-        if not result:
-            logging.info('[Sohu] could not get vid')
-            return None
-        vid = result.group(1)
-        logging.info('vid: %s' % vid)
-        base_url_1 = 'http://hot.vrs.sohu.com/vrs_flash.action?vid='
-        url_1 = base_url_1 + vid
-        logging.info('json url: %s' % url_1)
-        webpage = self._download_webpage(url_1, vid)
-        json_1 = json.loads(webpage)
-        # get the highest definition video vid and json infomation.
-        vids = []
-        qualities = ('oriVid', 'superVid', 'highVid', 'norVid')
-        for vid_name in qualities:
-            vids.append(json_1['data'][vid_name])
-        clearest_vid = 0
-        for i, v in enumerate(vids):
-            if v != 0:
-                clearest_vid = v
-                logging.info('quality definition: %s' % qualities[i][:-3])
-                break
-        if not clearest_vid:
-            logging.warning('could not find valid clearest_vid')
-            return None
-        if vid != clearest_vid:
-            url_1 = '%s%d' % (base_url_1, clearest_vid)
-            logging.info('highest definition json url: %s' % url_1)
-            json_1 = json.loads(urllib2.urlopen(url_1).read())
-        allot = json_1['allot']
-        prot = json_1['prot']
-        clipsURL = json_1['data']['clipsURL']
-        su = json_1['data']['su']
-        num_of_parts = json_1['data']['totalBlocks']
-        logging.info('Total parts: %d' % num_of_parts)
-        base_url_3 = 'http://allot/?prot=prot&file=clipsURL[i]&new=su[i]'
-        files_info = []
-        for i in range(num_of_parts):
-            self.to_screen('Geting json infomation of part %s/%s' % (i + 1, num_of_parts))
-            middle_url = 'http://%s/?prot=%s&file=%s&new=%s' % (allot, prot, clipsURL[i], su[i])
-            logging.info('middle url part %d: %s' % (i, middle_url))
-            middle_info = urllib2.urlopen(middle_url).read().split('|')
-            middle_part_1 = middle_info[0]
-            download_url = '%s%s?key=%s' % (middle_info[0], su[i], middle_info[3])
+        raw_title = self._html_search_regex(r'(?s)<title>(.+?)</title>',
+                                            webpage, u'video title')
+        title = raw_title.partition('-')[0].strip()
 
-            info = {
+        vid = self._html_search_regex(r'var vid="(\d+)"', webpage,
+                                      u'video path')
+        data = _fetch_data(vid)
+
+        QUALITIES = ('ori', 'super', 'high', 'nor')
+        vid_ids = [data['data'][q + 'Vid']
+                   for q in QUALITIES
+                   if data['data'][q + 'Vid'] != 0]
+        if not vid_ids:
+            raise ExtractorError(u'No formats available for this video')
+
+        # For now, we just pick the highest available quality
+        vid_id = vid_ids[-1]
+
+        format_data = data if vid == vid_id else _fetch_data(vid_id)
+        part_count = format_data['data']['totalBlocks']
+        allot = format_data['allot']
+        prot = format_data['prot']
+        clipsURL = format_data['data']['clipsURL']
+        su = format_data['data']['su']
+
+        playlist = []
+        for i in range(part_count):
+            part_url = ('http://%s/?prot=%s&file=%s&new=%s' %
+                        (allot, prot, clipsURL[i], su[i]))
+            part_str = self._download_webpage(
+                part_url, video_id,
+                note=u'Downloading part %d of %d' % (i+1, part_count))
+
+            part_info = part_str.split('|')
+            video_url = '%s%s?key=%s' % (part_info[0], su[i], part_info[3])
+
+            video_info = {
                 'id': '%s_part%02d' % (video_id, i + 1),
                 'title': title,
-                'url': download_url,
+                'url': video_url,
                 'ext': 'mp4',
             }
-            files_info.append(info)
-            time.sleep(1)
-        if num_of_parts == 1:
-            info =  files_info[0]
+            playlist.append(video_info)
+
+        if len(playlist) == 1:
+            info = playlist[0]
             info['id'] = video_id
-            return info
-        return files_info
+        else:
+            info = {
+                '_type': 'playlist',
+                'entries': playlist,
+                'id': video_id,
+            }
+
+        return info
-- 
cgit v1.2.3


From 48ea9cea77e7ea24ee867027f03ca37dd1b935d8 Mon Sep 17 00:00:00 2001
From: Philipp Hagemeister <phihag@phihag.de>
Date: Wed, 28 Aug 2013 14:28:55 +0200
Subject: Allow changes to run under Python 3

---
 youtube_dl/extractor/youporn.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py
index cc9c37027..19360e273 100644
--- a/youtube_dl/extractor/youporn.py
+++ b/youtube_dl/extractor/youporn.py
@@ -5,6 +5,7 @@ import sys
 
 from .common import InfoExtractor
 from ..utils import (
+    compat_str,
     compat_urllib_parse_urlparse,
     compat_urllib_request,
 
@@ -79,13 +80,16 @@ class YouPornIE(InfoExtractor):
         links = re.findall(LINK_RE, download_list_html)
         
         # Get link of hd video
-        encrypted_video_url = self._html_search_regex(r'var encryptedURL = \'(?P<encrypted_video_url>[a-zA-Z0-9+/]+={0,2})\';',
+        encrypted_video_url = self._html_search_regex(
+            r'var encrypted(?:Quality[0-9]+)?URL = \'(?P<encrypted_video_url>[a-zA-Z0-9+/]+={0,2})\';',
             webpage, u'encrypted_video_url')
-        video_url = unicode( aes_decrypt_text(encrypted_video_url, video_title, 32), 'utf-8')
+        video_url = aes_decrypt_text(encrypted_video_url, video_title, 32)
+        print(video_url)
+        assert isinstance(video_url, compat_str)
         if video_url.split('/')[6].split('_')[0] == u'720p': # only add if 720p to avoid duplicates
             links = [video_url] + links
         
-        if(len(links) == 0):
+        if not links:
             raise ExtractorError(u'ERROR: no known formats available for video')
 
         self.to_screen(u'Links found: %d' % len(links))
@@ -122,7 +126,7 @@ class YouPornIE(InfoExtractor):
             self._print_formats(formats)
             return
 
-        req_format = self._downloader.params.get('format', None)
+        req_format = self._downloader.params.get('format', 'best')
         self.to_screen(u'Format: %s' % req_format)
 
         if req_format is None or req_format == 'best':
-- 
cgit v1.2.3


From 878e83c5a4c84c7abbf3484366e76fbe906c8947 Mon Sep 17 00:00:00 2001
From: rzhxeo <rzhxeo@users.noreply.github.com>
Date: Wed, 28 Aug 2013 16:04:48 +0200
Subject: YoupornIE: Clean up extraction of hd video

---
 youtube_dl/extractor/youporn.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'youtube_dl/extractor')

diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py
index 19360e273..c85fd4b5a 100644
--- a/youtube_dl/extractor/youporn.py
+++ b/youtube_dl/extractor/youporn.py
@@ -5,7 +5,6 @@ import sys
 
 from .common import InfoExtractor
 from ..utils import (
-    compat_str,
     compat_urllib_parse_urlparse,
     compat_urllib_request,
 
@@ -79,14 +78,11 @@ class YouPornIE(InfoExtractor):
         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
         links = re.findall(LINK_RE, download_list_html)
         
-        # Get link of hd video
-        encrypted_video_url = self._html_search_regex(
-            r'var encrypted(?:Quality[0-9]+)?URL = \'(?P<encrypted_video_url>[a-zA-Z0-9+/]+={0,2})\';',
-            webpage, u'encrypted_video_url')
-        video_url = aes_decrypt_text(encrypted_video_url, video_title, 32)
-        print(video_url)
-        assert isinstance(video_url, compat_str)
-        if video_url.split('/')[6].split('_')[0] == u'720p': # only add if 720p to avoid duplicates
+        # Get link of hd video if available
+        mobj = re.search(r'var encryptedQuality720URL = \'(?P<encrypted_video_url>[a-zA-Z0-9+/]+={0,2})\';', webpage)
+        if mobj != None:
+            encrypted_video_url = mobj.group(u'encrypted_video_url')
+            video_url = aes_decrypt_text(encrypted_video_url, video_title, 32).decode('utf-8')
             links = [video_url] + links
         
         if not links:
-- 
cgit v1.2.3