Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r--  youtube_dl/extractor/__init__.py  |  3
-rw-r--r--  youtube_dl/extractor/arte.py      | 63
-rw-r--r--  youtube_dl/extractor/common.py    |  1
-rw-r--r--  youtube_dl/extractor/statigram.py |  4
-rw-r--r--  youtube_dl/extractor/ted.py       |  2
-rw-r--r--  youtube_dl/extractor/tf1.py       | 35
-rw-r--r--  youtube_dl/extractor/tumblr.py    |  8
-rw-r--r--  youtube_dl/extractor/tutv.py      | 41
-rw-r--r--  youtube_dl/extractor/vimeo.py     |  2
-rw-r--r--  youtube_dl/extractor/wat.py       | 84
-rw-r--r--  youtube_dl/extractor/youtube.py   |  5
11 files changed, 231 insertions, 17 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index ec800d9fb..ba0e86713 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -47,14 +47,17 @@ from .statigram import StatigramIE
from .steam import SteamIE
from .teamcoco import TeamcocoIE
from .ted import TEDIE
+from .tf1 import TF1IE
from .traileraddict import TrailerAddictIE
from .tudou import TudouIE
from .tumblr import TumblrIE
+from .tutv import TutvIE
from .ustream import UstreamIE
from .vbox7 import Vbox7IE
from .vevo import VevoIE
from .vimeo import VimeoIE
from .vine import VineIE
+from .wat import WatIE
from .wimp import WimpIE
from .worldstarhiphop import WorldStarHipHopIE
from .xhamster import XHamsterIE
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index b061b9566..183274eb7 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -11,11 +11,21 @@ from ..utils import (
)
class ArteTvIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?www\.arte.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
+ """
+ There are two sources of videos on arte.tv: videos.arte.tv and
+ www.arte.tv/guide; the extraction process is different for each one.
+ The videos expire after 7 days, so we can't add tests.
+ """
+ _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
+ _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?:fr|de)/.*-(?P<id>.*?).html'
_LIVE_URL = r'index-[0-9]+\.html$'
IE_NAME = u'arte.tv'
+ @classmethod
+ def suitable(cls, url):
+ return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL))
+
# TODO implement Live Stream
# def extractLiveStream(self, url):
# video_lang = url.split('/')[-4]
@@ -44,17 +54,26 @@ class ArteTvIE(InfoExtractor):
# video_url = u'%s/%s' % (info.get('url'), info.get('path'))
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- name = mobj.group('name')
- # This is not a real id, it can be for example AJT for the news
- # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
- video_id = mobj.group('id')
+ mobj = re.match(self._EMISSION_URL, url)
+ if mobj is not None:
+ name = mobj.group('name')
+ # This is not a real id; it can be, for example, AJT for the news
+ # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
+ video_id = mobj.group('id')
+ return self._extract_emission(url, video_id)
+
+ mobj = re.match(self._VIDEOS_URL, url)
+ if mobj is not None:
+ id = mobj.group('id')
+ return self._extract_video(url, id)
if re.search(self._LIVE_URL, video_id) is not None:
raise ExtractorError(u'Arte live streams are not yet supported, sorry')
# self.extractLiveStream(url)
# return
+ def _extract_emission(self, url, video_id):
+ """Extract from www.arte.tv/guide"""
webpage = self._download_webpage(url, video_id)
json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
@@ -68,6 +87,7 @@ class ArteTvIE(InfoExtractor):
'description': player_info['VDE'],
'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]),
'thumbnail': player_info['programImage'],
+ 'ext': 'flv',
}
formats = player_info['VSR'].values()
@@ -78,9 +98,36 @@ class ArteTvIE(InfoExtractor):
if format_info['mediaType'] == u'rtmp':
info_dict['url'] = format_info['streamer']
info_dict['play_path'] = 'mp4:' + format_info['url']
- info_dict['ext'] = 'mp4'
else:
info_dict['url'] = format_info['url']
- info_dict['ext'] = 'mp4'
return info_dict
+
+ def _extract_video(self, url, video_id):
+ """Extract from videos.arte.tv"""
+ config_xml_url = url.replace('/videos/', '/do_delegate/videos/')
+ config_xml_url = config_xml_url.replace('.html', ',view,asPlayerXml.xml')
+ config_xml = self._download_webpage(config_xml_url, video_id)
+ config_xml_url = self._html_search_regex(r'<video lang=".*?" ref="(.*?)"', config_xml, 'config xml url')
+ config_xml = self._download_webpage(config_xml_url, video_id)
+
+ video_urls = list(re.finditer(r'<url quality="(?P<quality>.*?)">(?P<url>.*?)</url>', config_xml))
+ def _key(m):
+ quality = m.group('quality')
+ if quality == 'hd':
+ return 2
+ else:
+ return 1
+ # We pick the best quality
+ video_urls = sorted(video_urls, key=_key)
+ video_url = list(video_urls)[-1].group('url')
+
+ title = self._html_search_regex(r'<name>(.*?)</name>', config_xml, 'title')
+ thumbnail = self._html_search_regex(r'<firstThumbnailUrl>(.*?)</firstThumbnailUrl>',
+ config_xml, 'thumbnail')
+ return {'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'url': video_url,
+ 'ext': 'flv',
+ }
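
The videos.arte.tv branch above builds the player-config URL purely by string substitution on the page URL and then prefers the 'hd' quality entry. A minimal standalone sketch of that rewrite and of the quality sort, using a hypothetical page URL and XML fragment (real pages expire, as the docstring notes):

    import re

    # Hypothetical videos.arte.tv page URL; real ones follow the same pattern
    # but expire after about a week.
    page_url = 'http://videos.arte.tv/fr/videos/some-documentary-1234567.html'

    # The same substitutions _extract_video performs to reach the asPlayerXml config.
    config_xml_url = page_url.replace('/videos/', '/do_delegate/videos/')
    config_xml_url = config_xml_url.replace('.html', ',view,asPlayerXml.xml')
    print(config_xml_url)
    # http://videos.arte.tv/fr/do_delegate/videos/some-documentary-1234567,view,asPlayerXml.xml

    # Quality selection: 'hd' entries sort last, so [-1] picks the best one.
    sample_xml = '''
    <url quality="sd">rtmp://example/sd/stream</url>
    <url quality="hd">rtmp://example/hd/stream</url>
    '''
    matches = list(re.finditer(r'<url quality="(?P<quality>.*?)">(?P<url>.*?)</url>', sample_xml))
    best = sorted(matches, key=lambda m: 2 if m.group('quality') == 'hd' else 1)[-1]
    print(best.group('url'))  # rtmp://example/hd/stream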
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 64d63e109..5c6fd7945 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -44,6 +44,7 @@ class InfoExtractor(object):
location: Physical location of the video.
player_url: SWF Player URL (used for rtmpdump).
subtitles: The subtitle file contents.
+ view_count: How many users have watched the video on the platform.
urlhandle: [internal] The urlHandle to be used to download the file,
like returned by urllib.request.urlopen
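
The new view_count field is just another optional key in the info dict an extractor may return. A purely illustrative sketch of a result carrying it (all values made up):

    # Illustrative only: an info dict using the newly documented, optional
    # view_count key next to the usual required fields.
    info = {
        'id': '10631273',
        'url': 'http://example.com/video.mp4',  # hypothetical media URL
        'ext': 'mp4',
        'title': 'Some title',
        'view_count': 54321,  # how many users have watched the video on the platform
    }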
diff --git a/youtube_dl/extractor/statigram.py b/youtube_dl/extractor/statigram.py
index 25bf5b85c..ae9a63e8b 100644
--- a/youtube_dl/extractor/statigram.py
+++ b/youtube_dl/extractor/statigram.py
@@ -10,7 +10,7 @@ class StatigramIE(InfoExtractor):
u'md5': u'deda4ff333abe2e118740321e992605b',
u'info_dict': {
u"uploader_id": u"videoseconds",
- u"title": u"Instagram photo by @videoseconds (Videos)"
+ u"title": u"Instagram photo by @videoseconds"
}
}
@@ -27,7 +27,7 @@ class StatigramIE(InfoExtractor):
html_title = self._html_search_regex(
r'<title>(.+?)</title>',
webpage, u'title')
- title = html_title.rpartition(u' | Statigram')[0]
+ title = re.sub(r'(?: *\(Videos?\))? \| Statigram$', '', html_title)
uploader_id = self._html_search_regex(
r'@([^ ]+)', title, u'uploader name', fatal=False)
ext = 'mp4'
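
The old rpartition() call only stripped the literal ' | Statigram' suffix; the new re.sub() also drops the optional '(Videos)' marker, which is what the updated test title reflects. A quick sketch of the substitution on two hypothetical page titles:

    import re

    def clean_title(html_title):
        # Strip an optional "(Video)"/"(Videos)" marker plus the " | Statigram" suffix.
        return re.sub(r'(?: *\(Videos?\))? \| Statigram$', '', html_title)

    print(clean_title(u'Instagram photo by @videoseconds (Videos) | Statigram'))
    # Instagram photo by @videoseconds
    print(clean_title(u'Instagram photo by @someuser | Statigram'))
    # Instagram photo by @someuser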
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index 46b66582c..8b73b8340 100644
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -17,7 +17,7 @@ class TEDIE(InfoExtractor):
_TEST = {
u'url': u'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
u'file': u'102.mp4',
- u'md5': u'8cd9dfa41ee000ce658fd48fb5d89a61',
+ u'md5': u'2d76ee1576672e0bd8f187513267adf6',
u'info_dict': {
u"description": u"md5:c6fa72e6eedbd938c9caf6b2702f5922",
u"title": u"Dan Dennett: The illusion of consciousness"
diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py
new file mode 100644
index 000000000..e0ffeced5
--- /dev/null
+++ b/youtube_dl/extractor/tf1.py
@@ -0,0 +1,35 @@
+# coding: utf-8
+
+import json
+import re
+
+from .common import InfoExtractor
+
+class TF1IE(InfoExtractor):
+ """
+ TF1 uses the wat.tv player. Currently it can only download videos with the
+ HTML5 player enabled; it cannot download HD videos.
+ """
+ _VALID_URL = r'http://videos.tf1.fr/.*-(.*?).html'
+ _TEST = {
+ u'url': u'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
+ u'file': u'10635995.mp4',
+ u'md5': u'66789d3e91278d332f75e1feb7aea327',
+ u'info_dict': {
+ u'title': u'Citroën Grand C4 Picasso 2013 : présentation officielle',
+ u'description': u'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ id = mobj.group(1)
+ webpage = self._download_webpage(url, id)
+ embed_url = self._html_search_regex(r'"(https://www.wat.tv/embedframe/.*?)"',
+ webpage, 'embed url')
+ embed_page = self._download_webpage(embed_url, id, u'Downloading embed player page')
+ wat_id = self._search_regex(r'UVID=(.*?)&', embed_page, 'wat id')
+ wat_info = self._download_webpage('http://www.wat.tv/interface/contentv3/%s' % wat_id, id, u'Downloading Wat info')
+ wat_info = json.loads(wat_info)['media']
+ wat_url = wat_info['url']
+ return self.url_result(wat_url, 'Wat')
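
TF1IE never downloads the media itself: it digs the wat.tv id out of the embed frame and hands off via url_result(), which is why the test above expects a wat.tv id (10635995.mp4) rather than a TF1 one. A rough sketch of that hand-off, with an invented embed-page fragment standing in for the real response:

    import re

    # Invented fragment of the embed player page; the real page carries the
    # wat.tv id in a UVID query parameter like this.
    embed_page = '... /web/embedPlayer?UVID=10635995&autoStart=false ...'

    wat_id = re.search(r'UVID=(.*?)&', embed_page).group(1)
    print(wat_id)  # 10635995

    # The contentv3 JSON for that id exposes the wat.tv page URL under
    # media['url']; url_result() wraps it so WatIE finishes the job, roughly:
    wat_url = 'http://www.wat.tv/video/some-title-xxxxx_yyyyy_.html'  # placeholder
    result = {'_type': 'url', 'url': wat_url, 'ie_key': 'Wat'}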
diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py
index 243f04bff..ad5840ca2 100644
--- a/youtube_dl/extractor/tumblr.py
+++ b/youtube_dl/extractor/tumblr.py
@@ -9,11 +9,11 @@ from ..utils import (
class TumblrIE(InfoExtractor):
_VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
_TEST = {
- u'url': u'http://resigno.tumblr.com/post/53364321212/e-de-extrema-importancia-que-esse-video-seja',
- u'file': u'53364321212.mp4',
- u'md5': u'0716d3dd51baf68a28b40fdf1251494e',
+ u'url': u'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes',
+ u'file': u'54196191430.mp4',
+ u'md5': u'479bb068e5b16462f5176a6828829767',
u'info_dict': {
- u"title": u"Rafael Lemos"
+ u"title": u"tatiana maslany news"
}
}
diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py
new file mode 100644
index 000000000..fcaa6ac01
--- /dev/null
+++ b/youtube_dl/extractor/tutv.py
@@ -0,0 +1,41 @@
+import base64
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_parse_qs,
+)
+
+class TutvIE(InfoExtractor):
+ _VALID_URL=r'https?://(?:www\.)?tu\.tv/videos/(?P<id>[^/?]+)'
+ _TEST = {
+ u'url': u'http://tu.tv/videos/noah-en-pabellon-cuahutemoc',
+ u'file': u'2742556.flv',
+ u'md5': u'5eb766671f69b82e528dc1e7769c5cb2',
+ u'info_dict': {
+ u"title": u"Noah en pabellon cuahutemoc"
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ title = self._html_search_regex(
+ r'<meta property="og:title" content="(.*?)">', webpage, u'title')
+ internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, u'internal video ID')
+
+ data_url = u'http://tu.tv/flvurl.php?codVideo=' + str(internal_id)
+ data_content = self._download_webpage(data_url, video_id, note=u'Downloading video info')
+ data = compat_parse_qs(data_content)
+ video_url = base64.b64decode(data['kpt'][0]).decode('utf-8')
+ ext = video_url.partition(u'?')[0].rpartition(u'.')[2]
+
+ info = {
+ 'id': internal_id,
+ 'url': video_url,
+ 'ext': ext,
+ 'title': title,
+ }
+ return [info]
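
The interesting step in TutvIE is decoding the flvurl.php response, whose kpt field is the base64-encoded media URL (the real extractor pulls it out with compat_parse_qs first). A standalone sketch of the decode and the extension guess, with an invented media URL:

    import base64

    # Invented value: what the 'kpt' field of the flvurl.php response carries,
    # i.e. the base64-encoded video URL.
    media_url = 'http://example.com/videos/noah.flv?r=token'
    kpt = base64.b64encode(media_url.encode('utf-8')).decode('ascii')

    video_url = base64.b64decode(kpt).decode('utf-8')
    # Extension = whatever follows the last '.' once the query string is dropped.
    ext = video_url.partition('?')[0].rpartition('.')[2]

    print(video_url)  # http://example.com/videos/noah.flv?r=token
    print(ext)        # flv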
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index 11741e27d..7c4562790 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -16,7 +16,7 @@ class VimeoIE(InfoExtractor):
"""Information extractor for vimeo.com."""
# _VALID_URL matches Vimeo URLs
- _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
+ _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)(?:[?].*)?$'
IE_NAME = u'vimeo'
_TEST = {
u'url': u'http://vimeo.com/56015672',
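
The only change to the Vimeo pattern is the anchored tail (?:[?].*)?$: a query string after the numeric id is still accepted, while anything else trailing the id no longer matches. A quick check against two hypothetical URLs:

    import re

    _VALID_URL = (r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/'
                  r'(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?'
                  r'(?P<direct_link>play_redirect_hls\?clip_id=)?'
                  r'(?:videos?/)?(?P<id>[0-9]+)(?:[?].*)?$')

    # A query string after the id still matches, and the id group stays clean ...
    m = re.match(_VALID_URL, 'http://vimeo.com/56015672?some_tracking_param=1')
    print(m.group('id'))  # 56015672

    # ... while trailing path segments after the id are now rejected.
    print(re.match(_VALID_URL, 'http://vimeo.com/56015672/extra') is None)  # True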
diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py
new file mode 100644
index 000000000..0d1302cd2
--- /dev/null
+++ b/youtube_dl/extractor/wat.py
@@ -0,0 +1,84 @@
+# coding: utf-8
+
+import json
+import re
+
+from .common import InfoExtractor
+
+from ..utils import (
+ compat_urllib_parse,
+ unified_strdate,
+)
+
+
+class WatIE(InfoExtractor):
+ _VALID_URL=r'http://www.wat.tv/.*-(?P<shortID>.*?)_.*?.html'
+ IE_NAME = 'wat.tv'
+ _TEST = {
+ u'url': u'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html',
+ u'file': u'10631273.mp4',
+ u'md5': u'0a4fe7870f31eaeabb5e25fd8da8414a',
+ u'info_dict': {
+ u'title': u'World War Z - Philadelphia VOST',
+ u'description': u'La menace est partout. Que se passe-t-il à Philadelphia ?\r\nWORLD WAR Z, avec Brad Pitt, au cinéma le 3 juillet.\r\nhttp://www.worldwarz.fr',
+ }
+ }
+
+ def download_video_info(self, real_id):
+ # 'contentv4' is used on the website, but it also returns the related
+ # videos, which we don't need
+ info = self._download_webpage('http://www.wat.tv/interface/contentv3/' + real_id, real_id, 'Downloading video info')
+ info = json.loads(info)
+ return info['media']
+
+
+ def _real_extract(self, url):
+ def real_id_for_chapter(chapter):
+ return chapter['tc_start'].split('-')[0]
+ mobj = re.match(self._VALID_URL, url)
+ short_id = mobj.group('shortID')
+ webpage = self._download_webpage(url, short_id)
+ real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id')
+
+ video_info = self.download_video_info(real_id)
+ chapters = video_info['chapters']
+ first_chapter = chapters[0]
+
+ if real_id_for_chapter(first_chapter) != real_id:
+ self.to_screen('Multipart video detected')
+ chapter_urls = []
+ for chapter in chapters:
+ chapter_id = real_id_for_chapter(chapter)
+ # Yes, when this chapter is processed by WatIE,
+ # it will download the info again
+ chapter_info = self.download_video_info(chapter_id)
+ chapter_urls.append(chapter_info['url'])
+ entries = [self.url_result(chapter_url) for chapter_url in chapter_urls]
+ return self.playlist_result(entries, real_id, video_info['title'])
+
+ # Otherwise we can continue and extract just one part; we have to use
+ # the short id to get the video url
+ player_data = compat_urllib_parse.urlencode({'shortVideoId': short_id,
+ 'html5': '1'})
+ player_info = self._download_webpage('http://www.wat.tv/player?' + player_data,
+ real_id, u'Downloading player info')
+ player = json.loads(player_info)['player']
+ html5_player = self._html_search_regex(r'iframe src="(.*?)"', player,
+ 'html5 player')
+ player_webpage = self._download_webpage(html5_player, real_id,
+ u'Downloading player webpage')
+
+ video_url = self._search_regex(r'urlhtml5 : "(.*?)"', player_webpage,
+ 'video url')
+ info = {'id': real_id,
+ 'url': video_url,
+ 'ext': 'mp4',
+ 'title': first_chapter['title'],
+ 'thumbnail': first_chapter['preview'],
+ 'description': first_chapter['description'],
+ 'view_count': video_info['views'],
+ }
+ if 'date_diffusion' in first_chapter:
+ info['upload_date'] = unified_strdate(first_chapter['date_diffusion'])
+
+ return info
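
The multipart check in WatIE hinges on each chapter's tc_start field: the part before the first '-' is that chapter's own wat.tv id, so when the first chapter's id differs from the page's id, the page is really a playlist of parts. A small sketch with invented contentv3-style data (the exact tc_start layout after the dash is a guess):

    def real_id_for_chapter(chapter):
        # A chapter's id is the part of tc_start before the first '-'.
        return chapter['tc_start'].split('-')[0]

    # Invented data shaped like the 'media' object of the contentv3 response.
    video_info = {
        'title': 'Some multipart programme',
        'chapters': [
            {'tc_start': '10631200-00:00:00'},
            {'tc_start': '10631201-00:42:10'},
        ],
    }
    real_id = '10631273'  # id scraped from the page's xtpage variable

    if real_id_for_chapter(video_info['chapters'][0]) != real_id:
        # Multipart: each chapter becomes its own entry in a playlist result.
        print([real_id_for_chapter(c) for c in video_info['chapters']])
        # ['10631200', '10631201']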
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 96d8257d9..109c8a93f 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -168,7 +168,7 @@ class YoutubeIE(InfoExtractor):
self.to_screen(u'RTMP download detected')
def _decrypt_signature(self, s):
- """Decrypt the key"""
+ """Turn the encrypted s field into a working signature"""
if len(s) == 88:
return s[48] + s[81:67:-1] + s[82] + s[66:62:-1] + s[85] + s[61:48:-1] + s[67] + s[47:12:-1] + s[3] + s[11:3:-1] + s[2] + s[12]
@@ -402,6 +402,9 @@ class YoutubeIE(InfoExtractor):
return video_id
def _real_extract(self, url):
+ if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
+ self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
+
# Extract original video URL from URL with redirection, like age verification, using next_url parameter
mobj = re.search(self._NEXT_URL_RE, url)
if mobj:
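
The new warning targets a common shell mistake: an unquoted URL is cut at the '&', so youtube-dl only ever sees the feature= part and no video id. A quick check of which URLs the guard catches:

    import re

    _TRUNCATED = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'

    # What the shell passes along when an unquoted '&v=...' is eaten:
    print(bool(re.match(_TRUNCATED, 'http://www.youtube.com/watch?feature=player_embedded')))
    # True

    # A properly quoted URL still carries the v= parameter and is not flagged:
    print(bool(re.match(_TRUNCATED,
                        'http://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc')))
    # False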