diff options
33 files changed, 777 insertions, 638 deletions
| diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 75547f42a..94cbce6e8 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -120,5 +120,9 @@ class TestAllURLsMatching(unittest.TestCase):      def test_soundcloud_not_matching_sets(self):          self.assertMatch('http://soundcloud.com/floex/sets/gone-ep', ['soundcloud:set']) +    def test_tumblr(self): +        self.assertMatch('http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', ['Tumblr']) +        self.assertMatch('http://tatianamaslanydaily.tumblr.com/post/54196191430', ['Tumblr']) +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_playlists.py b/test/test_playlists.py index 5eeba091e..b3ce6f71e 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -33,6 +33,7 @@ from youtube_dl.extractor import (      ImdbListIE,      KhanAcademyIE,      EveryonesMixtapeIE, +    RutubeChannelIE,  ) @@ -195,11 +196,11 @@ class TestPlaylists(unittest.TestCase):      def test_imdb_list(self):          dl = FakeYDL()          ie = ImdbListIE(dl) -        result = ie.extract('http://www.imdb.com/list/sMjedvGDd8U') +        result = ie.extract('http://www.imdb.com/list/JFs9NWw6XI0')          self.assertIsPlaylist(result) -        self.assertEqual(result['id'], 'sMjedvGDd8U') -        self.assertEqual(result['title'], 'Animated and Family Films') -        self.assertTrue(len(result['entries']) >= 48) +        self.assertEqual(result['id'], 'JFs9NWw6XI0') +        self.assertEqual(result['title'], 'March 23, 2012 Releases') +        self.assertEqual(len(result['entries']), 7)      def test_khanacademy_topic(self):          dl = FakeYDL() @@ -219,6 +220,14 @@ class TestPlaylists(unittest.TestCase):          self.assertEqual(result['id'], 'm7m0jJAbMQi')          self.assertEqual(result['title'], 'Driving')          self.assertEqual(len(result['entries']), 24) +         +    def test_rutube_channel(self): +        dl = FakeYDL() +        ie = RutubeChannelIE(dl) +        result = ie.extract('http://rutube.ru/tags/video/1409') +        self.assertIsPlaylist(result) +        self.assertEqual(result['id'], '1409') +        self.assertTrue(len(result['entries']) >= 34)  if __name__ == '__main__': diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1e8556124..e89b5cf9d 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -161,7 +161,12 @@ from .ro220 import Ro220IE  from .rottentomatoes import RottenTomatoesIE  from .roxwel import RoxwelIE  from .rtlnow import RTLnowIE -from .rutube import RutubeIE +from .rutube import ( +    RutubeIE, +    RutubeChannelIE, +    RutubeMovieIE, +    RutubePersonIE, +)  from .servingsys import ServingSysIE  from .sina import SinaIE  from .slashdot import SlashdotIE diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index dbf8eed99..b88f71bc4 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -1,22 +1,28 @@ +# coding: utf-8 +from __future__ import unicode_literals +  import re  from .common import InfoExtractor  from ..utils import ( +    determine_ext,      ExtractorError,  ) +  class ARDIE(InfoExtractor): -    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?' -    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>' -    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)' +    _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?' +      _TEST = { -        u'url': u'http://www.ardmediathek.de/das-erste/tagesschau-in-100-sek?documentId=14077640', -        u'file': u'14077640.mp4', -        u'md5': u'6ca8824255460c787376353f9e20bbd8', -        u'info_dict': { -            u"title": u"11.04.2013 09:23 Uhr - Tagesschau in 100 Sekunden" +        'url': 'http://www.ardmediathek.de/das-erste/guenther-jauch/edward-snowden-im-interview-held-oder-verraeter?documentId=19288786', +        'file': '19288786.mp4', +        'md5': '515bf47ce209fb3f5a61b7aad364634c', +        'info_dict': { +            'title': 'Edward Snowden im Interview - Held oder Verräter?', +            'description': 'Edward Snowden hat alles aufs Spiel gesetzt, um die weltweite \xdcberwachung durch die Geheimdienste zu enttarnen. Nun stellt sich der ehemalige NSA-Mitarbeiter erstmals weltweit in einem TV-Interview den Fragen eines NDR-Journalisten. Die Sendung vom Sonntagabend.', +            'thumbnail': 'http://www.ardmediathek.de/ard/servlet/contentblob/19/28/87/90/19288790/bild/2250037',          }, -        u'skip': u'Requires rtmpdump' +        'skip': 'Blocked outside of Germany',      }      def _real_extract(self, url): @@ -29,26 +35,49 @@ class ARDIE(InfoExtractor):          else:              video_id = m.group('video_id') -        # determine title and media streams from webpage -        html = self._download_webpage(url, video_id) -        title = re.search(self._TITLE, html).group('title') -        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)] +        webpage = self._download_webpage(url, video_id) + +        title = self._html_search_regex( +            r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>', webpage, 'title') +        description = self._html_search_meta( +            'dcterms.abstract', webpage, 'description') +        thumbnail = self._og_search_thumbnail(webpage) + +        streams = [ +            mo.groupdict() +            for mo in re.finditer( +                r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)', webpage)]          if not streams: -            assert '"fsk"' in html -            raise ExtractorError(u'This video is only available after 8:00 pm') - -        # choose default media type and highest quality for now -        stream = max([s for s in streams if int(s["media_type"]) == 0], -                     key=lambda s: int(s["quality"])) - -        # there's two possibilities: RTMP stream or HTTP download -        info = {'id': video_id, 'title': title, 'ext': 'mp4'} -        if stream['rtmp_url']: -            self.to_screen(u'RTMP download detected') -            assert stream['video_url'].startswith('mp4:') -            info["url"] = stream["rtmp_url"] -            info["play_path"] = stream['video_url'] -        else: -            assert stream["video_url"].endswith('.mp4') -            info["url"] = stream["video_url"] -        return [info] +            if '"fsk"' in webpage: +                raise ExtractorError('This video is only available after 20:00') + +        formats = [] +        for s in streams: +            format = { +                'quality': int(s['quality']), +            } +            if s.get('rtmp_url'): +                format['protocol'] = 'rtmp' +                format['url'] = s['rtmp_url'] +                format['playpath'] = s['video_url'] +            else: +                format['url'] = s['video_url'] + +            quality_name = self._search_regex( +                r'[,.]([a-zA-Z0-9_-]+),?\.mp4', format['url'], +                'quality name', default='NA') +            format['format_id'] = '%s-%s-%s-%s' % ( +                determine_ext(format['url']), quality_name, s['media_type'], +                s['quality']) + +            formats.append(format) + +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'formats': formats, +            'thumbnail': thumbnail, +        } diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index d18bc7e0c..df2cff81c 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -24,5 +24,5 @@ class BloombergIE(InfoExtractor):          mobj = re.match(self._VALID_URL, url)          name = mobj.group('name')          webpage = self._download_webpage(url, name) -        ooyala_code = self._search_regex(r'<source src="http://player.ooyala.com/player/[^/]+/([^".]+)', webpage, u'ooyala url') -        return OoyalaIE._build_url_result(ooyala_code) +        ooyala_url = self._twitter_search_player(webpage) +        return self.url_result(ooyala_url, OoyalaIE.ie_key()) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 443294e6f..9ccf923a6 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -23,7 +23,6 @@ from ..utils import (  class BrightcoveIE(InfoExtractor):      _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)'      _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' -    _PLAYLIST_URL_TEMPLATE = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s'      _TESTS = [          { @@ -70,7 +69,7 @@ class BrightcoveIE(InfoExtractor):                  'description': 'md5:363109c02998fee92ec02211bd8000df',                  'uploader': 'National Ballet of Canada',              }, -        }, +        }      ]      @classmethod @@ -131,6 +130,11 @@ class BrightcoveIE(InfoExtractor):          """Try to extract the brightcove url from the wepbage, returns None          if it can't be found          """ + +        url_m = re.search(r'<meta\s+property="og:video"\s+content="(http://c.brightcove.com/[^"]+)"', webpage) +        if url_m: +            return url_m.group(1) +          m_brightcove = re.search(              r'''(?sx)<object              (?: @@ -183,8 +187,9 @@ class BrightcoveIE(InfoExtractor):          return self._extract_video_info(video_info)      def _get_playlist_info(self, player_key): -        playlist_info = self._download_webpage(self._PLAYLIST_URL_TEMPLATE % player_key, -                                               player_key, 'Downloading playlist information') +        info_url = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s' % player_key +        playlist_info = self._download_webpage( +            info_url, player_key, 'Downloading playlist information')          json_data = json.loads(playlist_info)          if 'videoList' not in json_data: diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 574881b70..3867d7850 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +from __future__ import unicode_literals  import re @@ -11,38 +11,38 @@ class Channel9IE(InfoExtractor):      The type of provided URL (video or playlist) is determined according to      meta Search.PageType from web page HTML rather than URL itself, as it is -    not always possible to do.     +    not always possible to do.      ''' -    IE_DESC = u'Channel 9' -    IE_NAME = u'channel9' +    IE_DESC = 'Channel 9' +    IE_NAME = 'channel9'      _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'      _TESTS = [          { -            u'url': u'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', -            u'file': u'Events_TechEd_Australia_2013_KOS002.mp4', -            u'md5': u'bbd75296ba47916b754e73c3a4bbdf10', -            u'info_dict': { -                u'title': u'Developer Kick-Off Session: Stuff We Love', -                u'description': u'md5:c08d72240b7c87fcecafe2692f80e35f', -                u'duration': 4576, -                u'thumbnail': u'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg', -                u'session_code': u'KOS002', -                u'session_day': u'Day 1', -                u'session_room': u'Arena 1A', -                u'session_speakers': [ u'Ed Blankenship', u'Andrew Coates', u'Brady Gaster', u'Patrick Klug', u'Mads Kristensen' ], +            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', +            'file': 'Events_TechEd_Australia_2013_KOS002.mp4', +            'md5': 'bbd75296ba47916b754e73c3a4bbdf10', +            'info_dict': { +                'title': 'Developer Kick-Off Session: Stuff We Love', +                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f', +                'duration': 4576, +                'thumbnail': 'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg', +                'session_code': 'KOS002', +                'session_day': 'Day 1', +                'session_room': 'Arena 1A', +                'session_speakers': [ 'Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen' ],              },          },          { -            u'url': u'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', -            u'file': u'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4', -            u'md5': u'b43ee4529d111bc37ba7ee4f34813e68', -            u'info_dict': { -                u'title': u'Self-service BI with Power BI - nuclear testing', -                u'description': u'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', -                u'duration': 1540, -                u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg', -                u'authors': [ u'Mike Wilmot' ], +            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', +            'file': 'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4', +            'md5': 'b43ee4529d111bc37ba7ee4f34813e68', +            'info_dict': { +                'title': 'Self-service BI with Power BI - nuclear testing', +                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', +                'duration': 1540, +                'thumbnail': 'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg', +                'authors': [ 'Mike Wilmot' ],              },          }      ] @@ -60,7 +60,7 @@ class Channel9IE(InfoExtractor):              return 0          units = m.group('units')          try: -            exponent = [u'B', u'KB', u'MB', u'GB', u'TB', u'PB', u'EB', u'ZB', u'YB'].index(units.upper()) +            exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())          except ValueError:              return 0          size = float(m.group('size')) @@ -80,7 +80,7 @@ class Channel9IE(InfoExtractor):              'url': x.group('url'),              'format_id': x.group('quality'),              'format_note': x.group('note'), -            'format': u'%s (%s)' % (x.group('quality'), x.group('note')), +            'format': '%s (%s)' % (x.group('quality'), x.group('note')),              'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate              'preference': self._known_formats.index(x.group('quality')),              'vcodec': 'none' if x.group('note') == 'Audio only' else None, @@ -91,10 +91,10 @@ class Channel9IE(InfoExtractor):          return formats      def _extract_title(self, html): -        title = self._html_search_meta(u'title', html, u'title') +        title = self._html_search_meta('title', html, 'title')          if title is None:                         title = self._og_search_title(html) -            TITLE_SUFFIX = u' (Channel 9)' +            TITLE_SUFFIX = ' (Channel 9)'              if title is not None and title.endswith(TITLE_SUFFIX):                  title = title[:-len(TITLE_SUFFIX)]          return title @@ -110,7 +110,7 @@ class Channel9IE(InfoExtractor):          m = re.search(DESCRIPTION_REGEX, html)          if m is not None:              return m.group('description') -        return self._html_search_meta(u'description', html, u'description') +        return self._html_search_meta('description', html, 'description')      def _extract_duration(self, html):          m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html) @@ -172,7 +172,7 @@ class Channel9IE(InfoExtractor):          # Nothing to download          if len(formats) == 0 and slides is None and zip_ is None: -            self._downloader.report_warning(u'None of recording, slides or zip are available for %s' % content_path) +            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)              return          # Extract meta @@ -244,7 +244,7 @@ class Channel9IE(InfoExtractor):          return contents      def _extract_list(self, content_path): -        rss = self._download_xml(self._RSS_URL % content_path, content_path, u'Downloading RSS') +        rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')          entries = [self.url_result(session_url.text, 'Channel9')                     for session_url in rss.findall('./channel/item/link')]          title_text = rss.find('./channel/title').text @@ -254,11 +254,11 @@ class Channel9IE(InfoExtractor):          mobj = re.match(self._VALID_URL, url)          content_path = mobj.group('contentpath') -        webpage = self._download_webpage(url, content_path, u'Downloading web page') +        webpage = self._download_webpage(url, content_path, 'Downloading web page')          page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)          if page_type_m is None: -            raise ExtractorError(u'Search.PageType not found, don\'t know how to process this page', expected=True) +            raise ExtractorError('Search.PageType not found, don\'t know how to process this page', expected=True)          page_type = page_type_m.group('pagetype')          if page_type == 'List':         # List page, may contain list of 'item'-like objects @@ -268,4 +268,4 @@ class Channel9IE(InfoExtractor):          elif page_type == 'Session':    # Event session page, may contain downloadable content              return self._extract_session(webpage, content_path)          else: -            raise ExtractorError(u'Unexpected Search.PageType %s' % page_type, expected=True)
\ No newline at end of file +            raise ExtractorError('Unexpected Search.PageType %s' % page_type, expected=True)
\ No newline at end of file diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 3333d433b..ed3986f31 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -14,7 +14,7 @@ from ..utils import (  class ComedyCentralIE(MTVServicesInfoExtractor): -    _VALID_URL = r'''(?x)https?://(?:www.)?comedycentral.com/ +    _VALID_URL = r'''(?x)https?://(?:www\.)?comedycentral\.com/          (video-clips|episodes|cc-studios|video-collections)          /(?P<title>.*)'''      _FEED_URL = 'http://comedycentral.com/feeds/mrss/' @@ -86,7 +86,7 @@ class ComedyCentralShowsIE(InfoExtractor):      @staticmethod      def _transform_rtmp_url(rtmp_video_url): -        m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url) +        m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\.comedystor/.*)$', rtmp_video_url)          if not m:              raise ExtractorError('Cannot transform RTMP url')          base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/' diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index f7478d459..70ba9eaba 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -465,6 +465,10 @@ class InfoExtractor(object):          }          return RATING_TABLE.get(rating.lower(), None) +    def _twitter_search_player(self, html): +        return self._html_search_meta('twitter:player', html, +            'twitter card player') +      def _sort_formats(self, formats):          if not formats:              raise ExtractorError(u'No video formats found') diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 2b66bddbb..920728e01 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -1,4 +1,6 @@  # encoding: utf-8 +from __future__ import unicode_literals +  import re, base64, zlib  from hashlib import sha1  from math import pow, sqrt, floor @@ -18,29 +20,29 @@ from ..aes import (  )  class CrunchyrollIE(InfoExtractor): -    _VALID_URL = r'(?:https?://)?(?:www\.)?(?P<url>crunchyroll\.com/[^/]*/[^/?&]*?(?P<video_id>[0-9]+))(?:[/?&]|$)' +    _VALID_URL = r'(?:https?://)?(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'      _TESTS = [{ -        u'url': u'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', -        u'file': u'645513.flv', -        #u'md5': u'b1639fd6ddfaa43788c85f6d1dddd412', -        u'info_dict': { -            u'title': u'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!', -            u'description': u'md5:2d17137920c64f2f49981a7797d275ef', -            u'thumbnail': u'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg', -            u'uploader': u'Yomiuri Telecasting Corporation (YTV)', -            u'upload_date': u'20131013', +        'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', +        'file': '645513.flv', +        #'md5': 'b1639fd6ddfaa43788c85f6d1dddd412', +        'info_dict': { +            'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!', +            'description': 'md5:2d17137920c64f2f49981a7797d275ef', +            'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg', +            'uploader': 'Yomiuri Telecasting Corporation (YTV)', +            'upload_date': '20131013',          }, -        u'params': { +        'params': {              # rtmp -            u'skip_download': True, +            'skip_download': True,          },      }]      _FORMAT_IDS = { -        u'360': (u'60', u'106'), -        u'480': (u'61', u'106'), -        u'720': (u'62', u'106'), -        u'1080': (u'80', u'108'), +        '360': ('60', '106'), +        '480': ('61', '106'), +        '720': ('62', '106'), +        '1080': ('80', '108'),      }      def _decrypt_subtitles(self, data, iv, id): @@ -63,7 +65,7 @@ class CrunchyrollIE(InfoExtractor):              num3 = key ^ num1              num4 = num3 ^ (num3 >> 3) ^ num2              prefix = intlist_to_bytes(obfuscate_key_aux(20, 97, (1, 2))) -            shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode(u'ascii')).digest()) +            shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode('ascii')).digest())              # Extend 160 Bit hash to 256 Bit              return shaHash + [0] * 12 @@ -79,93 +81,98 @@ class CrunchyrollIE(InfoExtractor):      def _convert_subtitles_to_srt(self, subtitles):          i=1 -        output = u'' +        output = ''          for start, end, text in re.findall(r'<event [^>]*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles): -            start = start.replace(u'.', u',') -            end = end.replace(u'.', u',') +            start = start.replace('.', ',') +            end = end.replace('.', ',')              text = clean_html(text) -            text = text.replace(u'\\N', u'\n') +            text = text.replace('\\N', '\n')              if not text:                  continue -            output += u'%d\n%s --> %s\n%s\n\n' % (i, start, end, text) +            output += '%d\n%s --> %s\n%s\n\n' % (i, start, end, text)              i+=1          return output      def _real_extract(self,url):          mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('video_id') + +        if mobj.group('prefix') == 'm': +            mobile_webpage = self._download_webpage(url, video_id, 'Downloading mobile webpage') +            webpage_url = self._search_regex(r'<link rel="canonical" href="([^"]+)" />', mobile_webpage, 'webpage_url') +        else: +            webpage_url = 'http://www.' + mobj.group('url') -        webpage_url = u'http://www.' + mobj.group('url') -        video_id = mobj.group(u'video_id') -        webpage = self._download_webpage(webpage_url, video_id) -        note_m = self._html_search_regex(r'<div class="showmedia-trailer-notice">(.+?)</div>', webpage, u'trailer-notice', default=u'') +        webpage = self._download_webpage(webpage_url, video_id, 'Downloading webpage') +        note_m = self._html_search_regex(r'<div class="showmedia-trailer-notice">(.+?)</div>', webpage, 'trailer-notice', default='')          if note_m:              raise ExtractorError(note_m) -        video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, u'video_title', flags=re.DOTALL) -        video_title = re.sub(r' {2,}', u' ', video_title) -        video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, u'video_description', default=u'') +        video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, 'video_title', flags=re.DOTALL) +        video_title = re.sub(r' {2,}', ' ', video_title) +        video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='')          if not video_description:              video_description = None -        video_upload_date = self._html_search_regex(r'<div>Availability for free users:(.+?)</div>', webpage, u'video_upload_date', fatal=False, flags=re.DOTALL) +        video_upload_date = self._html_search_regex(r'<div>Availability for free users:(.+?)</div>', webpage, 'video_upload_date', fatal=False, flags=re.DOTALL)          if video_upload_date:              video_upload_date = unified_strdate(video_upload_date) -        video_uploader = self._html_search_regex(r'<div>\s*Publisher:(.+?)</div>', webpage, u'video_uploader', fatal=False, flags=re.DOTALL) +        video_uploader = self._html_search_regex(r'<div>\s*Publisher:(.+?)</div>', webpage, 'video_uploader', fatal=False, flags=re.DOTALL) -        playerdata_url = compat_urllib_parse.unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, u'playerdata_url')) +        playerdata_url = compat_urllib_parse.unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url'))          playerdata_req = compat_urllib_request.Request(playerdata_url) -        playerdata_req.data = compat_urllib_parse.urlencode({u'current_page': webpage_url}) -        playerdata_req.add_header(u'Content-Type', u'application/x-www-form-urlencoded') -        playerdata = self._download_webpage(playerdata_req, video_id, note=u'Downloading media info') +        playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url}) +        playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') +        playerdata = self._download_webpage(playerdata_req, video_id, note='Downloading media info') -        stream_id = self._search_regex(r'<media_id>([^<]+)', playerdata, u'stream_id') -        video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, u'thumbnail', fatal=False) +        stream_id = self._search_regex(r'<media_id>([^<]+)', playerdata, 'stream_id') +        video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, 'thumbnail', fatal=False)          formats = []          for fmt in re.findall(r'\?p([0-9]{3,4})=1', webpage):              stream_quality, stream_format = self._FORMAT_IDS[fmt] -            video_format = fmt+u'p' -            streamdata_req = compat_urllib_request.Request(u'http://www.crunchyroll.com/xml/') +            video_format = fmt+'p' +            streamdata_req = compat_urllib_request.Request('http://www.crunchyroll.com/xml/')              # urlencode doesn't work! -            streamdata_req.data = u'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality='+stream_quality+u'&media%5Fid='+stream_id+u'&video%5Fformat='+stream_format -            streamdata_req.add_header(u'Content-Type', u'application/x-www-form-urlencoded') -            streamdata_req.add_header(u'Content-Length', str(len(streamdata_req.data))) -            streamdata = self._download_webpage(streamdata_req, video_id, note=u'Downloading media info for '+video_format) -            video_url = self._search_regex(r'<host>([^<]+)', streamdata, u'video_url') -            video_play_path = self._search_regex(r'<file>([^<]+)', streamdata, u'video_play_path') +            streamdata_req.data = 'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality='+stream_quality+'&media%5Fid='+stream_id+'&video%5Fformat='+stream_format +            streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') +            streamdata_req.add_header('Content-Length', str(len(streamdata_req.data))) +            streamdata = self._download_webpage(streamdata_req, video_id, note='Downloading media info for '+video_format) +            video_url = self._search_regex(r'<host>([^<]+)', streamdata, 'video_url') +            video_play_path = self._search_regex(r'<file>([^<]+)', streamdata, 'video_play_path')              formats.append({ -                u'url': video_url, -                u'play_path':   video_play_path, -                u'ext': 'flv', -                u'format': video_format, -                u'format_id': video_format, +                'url': video_url, +                'play_path':   video_play_path, +                'ext': 'flv', +                'format': video_format, +                'format_id': video_format,              })          subtitles = {}          for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage): -            sub_page = self._download_webpage(u'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id='+sub_id,\ -                                              video_id, note=u'Downloading subtitles for '+sub_name) -            id = self._search_regex(r'id=\'([0-9]+)', sub_page, u'subtitle_id', fatal=False) -            iv = self._search_regex(r'<iv>([^<]+)', sub_page, u'subtitle_iv', fatal=False) -            data = self._search_regex(r'<data>([^<]+)', sub_page, u'subtitle_data', fatal=False) +            sub_page = self._download_webpage('http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id='+sub_id,\ +                                              video_id, note='Downloading subtitles for '+sub_name) +            id = self._search_regex(r'id=\'([0-9]+)', sub_page, 'subtitle_id', fatal=False) +            iv = self._search_regex(r'<iv>([^<]+)', sub_page, 'subtitle_iv', fatal=False) +            data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False)              if not id or not iv or not data:                  continue              id = int(id)              iv = base64.b64decode(iv)              data = base64.b64decode(data) -            subtitle = self._decrypt_subtitles(data, iv, id).decode(u'utf-8') -            lang_code = self._search_regex(r'lang_code=\'([^\']+)', subtitle, u'subtitle_lang_code', fatal=False) +            subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8') +            lang_code = self._search_regex(r'lang_code=\'([^\']+)', subtitle, 'subtitle_lang_code', fatal=False)              if not lang_code:                  continue              subtitles[lang_code] = self._convert_subtitles_to_srt(subtitle)          return { -            u'id':          video_id, -            u'title':       video_title, -            u'description': video_description, -            u'thumbnail':   video_thumbnail, -            u'uploader':    video_uploader, -            u'upload_date': video_upload_date, -            u'subtitles':   subtitles, -            u'formats':     formats, +            'id':          video_id, +            'title':       video_title, +            'description': video_description, +            'thumbnail':   video_thumbnail, +            'uploader':    video_uploader, +            'upload_date': video_upload_date, +            'subtitles':   subtitles, +            'formats':     formats,          } diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index b32ff9f86..ae342341c 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -1,4 +1,7 @@  # encoding: utf-8 + +from __future__ import unicode_literals +  import re  import json @@ -30,7 +33,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor):  class PluzzIE(FranceTVBaseInfoExtractor): -    IE_NAME = u'pluzz.francetv.fr' +    IE_NAME = 'pluzz.francetv.fr'      _VALID_URL = r'https?://pluzz\.francetv\.fr/videos/(.*?)\.html'      # Can't use tests, videos expire in 7 days @@ -44,17 +47,17 @@ class PluzzIE(FranceTVBaseInfoExtractor):  class FranceTvInfoIE(FranceTVBaseInfoExtractor): -    IE_NAME = u'francetvinfo.fr' +    IE_NAME = 'francetvinfo.fr'      _VALID_URL = r'https?://www\.francetvinfo\.fr/replay.*/(?P<title>.+)\.html'      _TEST = { -        u'url': u'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', -        u'file': u'84981923.mp4', -        u'info_dict': { -            u'title': u'Soir 3', +        'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', +        'file': '84981923.mp4', +        'info_dict': { +            'title': 'Soir 3',          }, -        u'params': { -            u'skip_download': True, +        'params': { +            'skip_download': True,          },      } @@ -62,13 +65,13 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):          mobj = re.match(self._VALID_URL, url)          page_title = mobj.group('title')          webpage = self._download_webpage(url, page_title) -        video_id = self._search_regex(r'id-video=(\d+?)"', webpage, u'video id') +        video_id = self._search_regex(r'id-video=(\d+?)[@"]', webpage, 'video id')          return self._extract_video(video_id)  class FranceTVIE(FranceTVBaseInfoExtractor): -    IE_NAME = u'francetv' -    IE_DESC = u'France 2, 3, 4, 5 and Ô' +    IE_NAME = 'francetv' +    IE_DESC = 'France 2, 3, 4, 5 and Ô'      _VALID_URL = r'''(?x)https?://www\.france[2345o]\.fr/          (?:              emissions/.*?/(videos|emissions)/(?P<id>[^/?]+) @@ -78,73 +81,73 @@ class FranceTVIE(FranceTVBaseInfoExtractor):      _TESTS = [          # france2          { -            u'url': u'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104', -            u'file': u'75540104.mp4', -            u'info_dict': { -                u'title': u'13h15, le samedi...', -                u'description': u'md5:2e5b58ba7a2d3692b35c792be081a03d', +            'url': 'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104', +            'file': '75540104.mp4', +            'info_dict': { +                'title': '13h15, le samedi...', +                'description': 'md5:2e5b58ba7a2d3692b35c792be081a03d',              }, -            u'params': { +            'params': {                  # m3u8 download -                u'skip_download': True, +                'skip_download': True,              },          },          # france3          { -            u'url': u'http://www.france3.fr/emissions/pieces-a-conviction/diffusions/13-11-2013_145575', -            u'info_dict': { -                u'id': u'000702326_CAPP_PicesconvictionExtrait313022013_120220131722_Au', -                u'ext': u'flv', -                u'title': u'Le scandale du prix des médicaments', -                u'description': u'md5:1384089fbee2f04fc6c9de025ee2e9ce', +            'url': 'http://www.france3.fr/emissions/pieces-a-conviction/diffusions/13-11-2013_145575', +            'info_dict': { +                'id': '000702326_CAPP_PicesconvictionExtrait313022013_120220131722_Au', +                'ext': 'flv', +                'title': 'Le scandale du prix des médicaments', +                'description': 'md5:1384089fbee2f04fc6c9de025ee2e9ce',              }, -            u'params': { +            'params': {                  # rtmp download -                u'skip_download': True, +                'skip_download': True,              },          },          # france4          { -            u'url': u'http://www.france4.fr/emissions/hero-corp/videos/rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', -            u'info_dict': { -                u'id': u'rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', -                u'ext': u'flv', -                u'title': u'Hero Corp Making of - Extrait 1', -                u'description': u'md5:c87d54871b1790679aec1197e73d650a', +            'url': 'http://www.france4.fr/emissions/hero-corp/videos/rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', +            'info_dict': { +                'id': 'rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', +                'ext': 'flv', +                'title': 'Hero Corp Making of - Extrait 1', +                'description': 'md5:c87d54871b1790679aec1197e73d650a',              }, -            u'params': { +            'params': {                  # rtmp download -                u'skip_download': True, +                'skip_download': True,              },          },          # france5          { -            u'url': u'http://www.france5.fr/emissions/c-a-dire/videos/92837968', -            u'info_dict': { -                u'id': u'92837968', -                u'ext': u'mp4', -                u'title': u'C à dire ?!', -                u'description': u'md5:fb1db1cbad784dcce7c7a7bd177c8e2f', +            'url': 'http://www.france5.fr/emissions/c-a-dire/videos/92837968', +            'info_dict': { +                'id': '92837968', +                'ext': 'mp4', +                'title': 'C à dire ?!', +                'description': 'md5:fb1db1cbad784dcce7c7a7bd177c8e2f',              }, -            u'params': { +            'params': {                  # m3u8 download -                u'skip_download': True, +                'skip_download': True,              },          },          # franceo          { -            u'url': u'http://www.franceo.fr/jt/info-afrique/04-12-2013', -            u'info_dict': { -                u'id': u'92327925', -                u'ext': u'mp4', -                u'title': u'Infô-Afrique', -                u'description': u'md5:ebf346da789428841bee0fd2a935ea55', +            'url': 'http://www.franceo.fr/jt/info-afrique/04-12-2013', +            'info_dict': { +                'id': '92327925', +                'ext': 'mp4', +                'title': 'Infô-Afrique', +                'description': 'md5:ebf346da789428841bee0fd2a935ea55',              }, -            u'params': { +            'params': {                  # m3u8 download -                u'skip_download': True, +                'skip_download': True,              }, -            u'skip': u'The id changes frequently', +            'skip': 'The id changes frequently',          },      ] @@ -160,26 +163,26 @@ class FranceTVIE(FranceTVBaseInfoExtractor):                   '\.fr/\?id-video=([^"/&]+)'),                  (r'<a class="video" id="ftv_player_(.+?)"'),              ] -            video_id = self._html_search_regex(id_res, webpage, u'video ID') +            video_id = self._html_search_regex(id_res, webpage, 'video ID')          else:              video_id = mobj.group('id')          return self._extract_video(video_id)  class GenerationQuoiIE(InfoExtractor): -    IE_NAME = u'france2.fr:generation-quoi' +    IE_NAME = 'france2.fr:generation-quoi'      _VALID_URL = r'https?://generation-quoi\.france2\.fr/portrait/(?P<name>.*)(\?|$)'      _TEST = { -        u'url': u'http://generation-quoi.france2.fr/portrait/garde-a-vous', -        u'file': u'k7FJX8VBcvvLmX4wA5Q.mp4', -        u'info_dict': { -            u'title': u'Génération Quoi - Garde à Vous', -            u'uploader': u'Génération Quoi', +        'url': 'http://generation-quoi.france2.fr/portrait/garde-a-vous', +        'file': 'k7FJX8VBcvvLmX4wA5Q.mp4', +        'info_dict': { +            'title': 'Génération Quoi - Garde à Vous', +            'uploader': 'Génération Quoi',          }, -        u'params': { +        'params': {              # It uses Dailymotion -            u'skip_download': True, +            'skip_download': True,          },      } @@ -194,20 +197,20 @@ class GenerationQuoiIE(InfoExtractor):  class CultureboxIE(FranceTVBaseInfoExtractor): -    IE_NAME = u'culturebox.francetvinfo.fr' +    IE_NAME = 'culturebox.francetvinfo.fr'      _VALID_URL = r'https?://culturebox\.francetvinfo\.fr/(?P<name>.*?)(\?|$)'      _TEST = { -        u'url': u'http://culturebox.francetvinfo.fr/einstein-on-the-beach-au-theatre-du-chatelet-146813', -        u'info_dict': { -            u'id': u'EV_6785', -            u'ext': u'mp4', -            u'title': u'Einstein on the beach au Théâtre du Châtelet', -            u'description': u'md5:9ce2888b1efefc617b5e58b3f6200eeb', +        'url': 'http://culturebox.francetvinfo.fr/einstein-on-the-beach-au-theatre-du-chatelet-146813', +        'info_dict': { +            'id': 'EV_6785', +            'ext': 'mp4', +            'title': 'Einstein on the beach au Théâtre du Châtelet', +            'description': 'md5:9ce2888b1efefc617b5e58b3f6200eeb',          }, -        u'params': { +        'params': {              # m3u8 download -            u'skip_download': True, +            'skip_download': True,          },      } @@ -215,5 +218,5 @@ class CultureboxIE(FranceTVBaseInfoExtractor):          mobj = re.match(self._VALID_URL, url)          name = mobj.group('name')          webpage = self._download_webpage(url, name) -        video_id = self._search_regex(r'"http://videos\.francetv\.fr/video/(.*?)"', webpage, u'video id') +        video_id = self._search_regex(r'"http://videos\.francetv\.fr/video/(.*?)"', webpage, 'video id')          return self._extract_video(video_id) diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 2ccdb7073..7c40e6753 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals +  import re  from .common import InfoExtractor @@ -6,13 +8,16 @@ from .common import InfoExtractor  class FunnyOrDieIE(InfoExtractor):      _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'      _TEST = { -        u'url': u'http://www.funnyordie.com/videos/0732f586d7/heart-shaped-box-literal-video-version', -        u'file': u'0732f586d7.mp4', -        u'md5': u'f647e9e90064b53b6e046e75d0241fbd', -        u'info_dict': { -            u"description": u"Lyrics changed to match the video. Spoken cameo by Obscurus Lupa (from ThatGuyWithTheGlasses.com). Based on a concept by Dustin McLean (DustFilms.com). Performed, edited, and written by David A. Scott.",  -            u"title": u"Heart-Shaped Box: Literal Video Version" -        } +        'url': 'http://www.funnyordie.com/videos/0732f586d7/heart-shaped-box-literal-video-version', +        'file': '0732f586d7.mp4', +        'md5': 'f647e9e90064b53b6e046e75d0241fbd', +        'info_dict': { +            'description': ('Lyrics changed to match the video. Spoken cameo ' +                'by Obscurus Lupa (from ThatGuyWithTheGlasses.com). Based on a ' +                'concept by Dustin McLean (DustFilms.com). Performed, edited, ' +                'and written by David A. Scott.'), +            'title': 'Heart-Shaped Box: Literal Video Version', +        },      }      def _real_extract(self, url): @@ -23,13 +28,12 @@ class FunnyOrDieIE(InfoExtractor):          video_url = self._search_regex(              [r'type="video/mp4" src="(.*?)"', r'src="([^>]*?)" type=\'video/mp4\''], -            webpage, u'video URL', flags=re.DOTALL) +            webpage, 'video URL', flags=re.DOTALL) -        info = { +        return {              'id': video_id,              'url': video_url,              'ext': 'mp4',              'title': self._og_search_title(webpage),              'description': self._og_search_description(webpage),          } -        return [info] diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 829e5894f..082da9c77 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -38,18 +38,6 @@ class GenericIE(InfoExtractor):                  'title': 'R\u00e9gis plante sa Jeep',              }          }, -        # embedded vimeo video -        { -            'add_ie': ['Vimeo'], -            'url': 'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references', -            'file': '22444065.mp4', -            'md5': '2903896e23df39722c33f015af0666e2', -            'info_dict': { -                'title': 'ACCU 2011: Move Semantics,Perfect Forwarding, and Rvalue references- Scott Meyers- 13/04/2011', -                'uploader_id': 'skillsmatter', -                'uploader': 'Skills Matter', -            } -        },          # bandcamp page with custom domain          {              'add_ie': ['Bandcamp'], @@ -78,6 +66,18 @@ class GenericIE(InfoExtractor):                  'skip_download': True,              },          }, +        { +            # https://github.com/rg3/youtube-dl/issues/2253 +            'url': 'http://bcove.me/i6nfkrc3', +            'file': '3101154703001.mp4', +            'md5': '0ba9446db037002366bab3b3eb30c88c', +            'info_dict': { +                'title': 'Still no power', +                'uploader': 'thestar.com', +                'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', +            }, +            'add_ie': ['Brightcove'], +        },          # Direct link to a video          {              'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', @@ -242,7 +242,7 @@ class GenericIE(InfoExtractor):          # Look for embedded (iframe) Vimeo player          mobj = re.search( -            r'<iframe[^>]+?src="((?:https?:)?//player.vimeo.com/video/.+?)"', webpage) +            r'<iframe[^>]+?src="((?:https?:)?//player\.vimeo\.com/video/.+?)"', webpage)          if mobj:              player_url = unescapeHTML(mobj.group(1))              surl = smuggle_url(player_url, {'Referer': url}) @@ -250,7 +250,7 @@ class GenericIE(InfoExtractor):          # Look for embedded (swf embed) Vimeo player          mobj = re.search( -            r'<embed[^>]+?src="(https?://(?:www\.)?vimeo.com/moogaloop.swf.+?)"', webpage) +            r'<embed[^>]+?src="(https?://(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)          if mobj:              return self.url_result(mobj.group(1), 'Vimeo') @@ -320,7 +320,7 @@ class GenericIE(InfoExtractor):              return self.url_result(mobj.group(1), 'Aparat')          # Look for MPORA videos -        mobj = re.search(r'<iframe .*?src="(http://mpora\.com/videos/[^"]+)"', webpage) +        mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)          if mobj is not None:              return self.url_result(mobj.group(1), 'Mpora') @@ -338,7 +338,7 @@ class GenericIE(InfoExtractor):          # Look for embedded Huffington Post player          mobj = re.search( -            r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live.huffingtonpost\.com/.+?)\1', webpage) +            r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)          if mobj is not None:              return self.url_result(mobj.group('url'), 'HuffPost') @@ -346,7 +346,7 @@ class GenericIE(InfoExtractor):          mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)          if mobj is None:              # Look for gorilla-vid style embedding -            mobj = re.search(r'(?s)jw_plugins.*?file:\s*["\'](.*?)["\']', webpage) +            mobj = re.search(r'(?s)(?:jw_plugins|JWPlayerOptions).*?file\s*:\s*["\'](.*?)["\']', webpage)          if mobj is None:              # Broaden the search a little bit              mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 1763af020..7cee505c0 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -69,12 +69,9 @@ class ImdbListIE(InfoExtractor):          list_id = mobj.group('id')          webpage = self._download_webpage(url, list_id) -        list_code = self._search_regex( -            r'(?s)<div\s+class="list\sdetail">(.*?)class="see-more"', -            webpage, 'list code')          entries = [              self.url_result('http://www.imdb.com' + m, 'Imdb') -            for m in re.findall(r'href="(/video/imdb/vi[^"]+)"', webpage)] +            for m in re.findall(r'href="(/video/imdb/vi[^"]+)"\s+data-type="playlist"', webpage)]          list_title = self._html_search_regex(              r'<h1 class="header">(.*?)</h1>', webpage, 'list title') diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index c79c589c7..7c208b85d 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -1,27 +1,27 @@ +from __future__ import unicode_literals +  import base64  import re  from .common import InfoExtractor  from ..utils import (      compat_urllib_parse, - -    ExtractorError,  )  class InfoQIE(InfoExtractor):      _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'      _TEST = { -        u"name": u"InfoQ", -        u"url": u"http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things", -        u"file": u"12-jan-pythonthings.mp4", -        u"info_dict": { -            u"description": u"Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.", -            u"title": u"A Few of My Favorite [Python] Things" +        "name": "InfoQ", +        "url": "http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things", +        "file": "12-jan-pythonthings.mp4", +        "info_dict": { +            "description": "Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.", +            "title": "A Few of My Favorite [Python] Things", +        }, +        "params": { +            "skip_download": True,          }, -        u"params": { -            u"skip_download": True -        }      }      def _real_extract(self, url): @@ -31,32 +31,25 @@ class InfoQIE(InfoExtractor):          self.report_extraction(url)          # Extract video URL -        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage) -        if mobj is None: -            raise ExtractorError(u'Unable to extract video url') -        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8')) +        encoded_id = self._search_regex(r"jsclassref ?= ?'([^']*)'", webpage, 'encoded id') +        real_id = compat_urllib_parse.unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8'))          video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id          # Extract title          video_title = self._search_regex(r'contentTitle = "(.*?)";', -            webpage, u'title') +            webpage, 'title')          # Extract description          video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>', -            webpage, u'description', fatal=False) +            webpage, 'description', fatal=False)          video_filename = video_url.split('/')[-1]          video_id, extension = video_filename.split('.') -        info = { +        return {              'id': video_id,              'url': video_url, -            'uploader': None, -            'upload_date': None,              'title': video_title,              'ext': extension, # Extension is always(?) mp4, but seems to be flv -            'thumbnail': None,              'description': video_description,          } - -        return [info]
\ No newline at end of file diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 98d1d272a..18dd9cb1e 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -1,4 +1,5 @@  # encoding: utf-8 +from __future__ import unicode_literals  import re  import json @@ -11,38 +12,38 @@ from ..utils import (  class IviIE(InfoExtractor): -    IE_DESC = u'ivi.ru' -    IE_NAME = u'ivi' +    IE_DESC = 'ivi.ru' +    IE_NAME = 'ivi'      _VALID_URL = r'^https?://(?:www\.)?ivi\.ru/watch(?:/(?P<compilationid>[^/]+))?/(?P<videoid>\d+)'      _TESTS = [          # Single movie          { -            u'url': u'http://www.ivi.ru/watch/53141', -            u'file': u'53141.mp4', -            u'md5': u'6ff5be2254e796ed346251d117196cf4', -            u'info_dict': { -                u'title': u'Иван Васильевич меняет профессию', -                u'description': u'md5:14d8eda24e9d93d29b5857012c6d6346', -                u'duration': 5498, -                u'thumbnail': u'http://thumbs.ivi.ru/f20.vcp.digitalaccess.ru/contents/d/1/c3c885163a082c29bceeb7b5a267a6.jpg', +            'url': 'http://www.ivi.ru/watch/53141', +            'file': '53141.mp4', +            'md5': '6ff5be2254e796ed346251d117196cf4', +            'info_dict': { +                'title': 'Иван Васильевич меняет профессию', +                'description': 'md5:b924063ea1677c8fe343d8a72ac2195f', +                'duration': 5498, +                'thumbnail': 'http://thumbs.ivi.ru/f20.vcp.digitalaccess.ru/contents/d/1/c3c885163a082c29bceeb7b5a267a6.jpg',              }, -            u'skip': u'Only works from Russia', +            'skip': 'Only works from Russia',          },          # Serial's serie          { -            u'url': u'http://www.ivi.ru/watch/dezhurnyi_angel/74791', -            u'file': u'74791.mp4', -            u'md5': u'3e6cc9a848c1d2ebcc6476444967baa9', -            u'info_dict': { -                u'title': u'Дежурный ангел - 1 серия', -                u'duration': 2490, -                u'thumbnail': u'http://thumbs.ivi.ru/f7.vcp.digitalaccess.ru/contents/8/e/bc2f6c2b6e5d291152fdd32c059141.jpg', +            'url': 'http://www.ivi.ru/watch/dezhurnyi_angel/74791', +            'file': '74791.mp4', +            'md5': '3e6cc9a848c1d2ebcc6476444967baa9', +            'info_dict': { +                'title': 'Дежурный ангел - 1 серия', +                'duration': 2490, +                'thumbnail': 'http://thumbs.ivi.ru/f7.vcp.digitalaccess.ru/contents/8/e/bc2f6c2b6e5d291152fdd32c059141.jpg',              }, -            u'skip': u'Only works from Russia', +            'skip': 'Only works from Russia',           }      ] -     +      # Sorted by quality      _known_formats = ['MP4-low-mobile', 'MP4-mobile', 'FLV-lo', 'MP4-lo', 'FLV-hi', 'MP4-hi', 'MP4-SHQ'] @@ -54,7 +55,7 @@ class IviIE(InfoExtractor):          return m.group('description') if m is not None else None      def _extract_comment_count(self, html): -        m = re.search(u'(?s)<a href="#" id="view-comments" class="action-button dim gradient">\s*Комментарии:\s*(?P<commentcount>\d+)\s*</a>', html) +        m = re.search('(?s)<a href="#" id="view-comments" class="action-button dim gradient">\s*Комментарии:\s*(?P<commentcount>\d+)\s*</a>', html)          return int(m.group('commentcount')) if m is not None else 0      def _real_extract(self, url): @@ -63,49 +64,49 @@ class IviIE(InfoExtractor):          api_url = 'http://api.digitalaccess.ru/api/json/' -        data = {u'method': u'da.content.get', -                u'params': [video_id, {u'site': u's183', -                                       u'referrer': u'http://www.ivi.ru/watch/%s' % video_id, -                                       u'contentid': video_id -                                    } -                            ] +        data = {'method': 'da.content.get', +                'params': [video_id, {'site': 's183', +                                      'referrer': 'http://www.ivi.ru/watch/%s' % video_id, +                                      'contentid': video_id +                                      } +                           ]                  }          request = compat_urllib_request.Request(api_url, json.dumps(data)) -        video_json_page = self._download_webpage(request, video_id, u'Downloading video JSON') +        video_json_page = self._download_webpage(request, video_id, 'Downloading video JSON')          video_json = json.loads(video_json_page) -        if u'error' in video_json: -            error = video_json[u'error'] -            if error[u'origin'] == u'NoRedisValidData': -                raise ExtractorError(u'Video %s does not exist' % video_id, expected=True) -            raise ExtractorError(u'Unable to download video %s: %s' % (video_id, error[u'message']), expected=True) +        if 'error' in video_json: +            error = video_json['error'] +            if error['origin'] == 'NoRedisValidData': +                raise ExtractorError('Video %s does not exist' % video_id, expected=True) +            raise ExtractorError('Unable to download video %s: %s' % (video_id, error['message']), expected=True) -        result = video_json[u'result'] +        result = video_json['result']          formats = [{ -            'url': x[u'url'], -            'format_id': x[u'content_format'], -            'preference': self._known_formats.index(x[u'content_format']), -        } for x in result[u'files'] if x[u'content_format'] in self._known_formats] +            'url': x['url'], +            'format_id': x['content_format'], +            'preference': self._known_formats.index(x['content_format']), +        } for x in result['files'] if x['content_format'] in self._known_formats]          self._sort_formats(formats)          if not formats: -            raise ExtractorError(u'No media links available for %s' % video_id) +            raise ExtractorError('No media links available for %s' % video_id) -        duration = result[u'duration'] -        compilation = result[u'compilation'] -        title = result[u'title'] +        duration = result['duration'] +        compilation = result['compilation'] +        title = result['title']          title = '%s - %s' % (compilation, title) if compilation is not None else title   -        previews = result[u'preview'] +        previews = result['preview']          previews.sort(key=lambda fmt: self._known_thumbnails.index(fmt['content_format'])) -        thumbnail = previews[-1][u'url'] if len(previews) > 0 else None +        thumbnail = previews[-1]['url'] if len(previews) > 0 else None -        video_page = self._download_webpage(url, video_id, u'Downloading video page') +        video_page = self._download_webpage(url, video_id, 'Downloading video page')          description = self._extract_description(video_page)          comment_count = self._extract_comment_count(video_page) @@ -121,8 +122,8 @@ class IviIE(InfoExtractor):  class IviCompilationIE(InfoExtractor): -    IE_DESC = u'ivi.ru compilations' -    IE_NAME = u'ivi:compilation' +    IE_DESC = 'ivi.ru compilations' +    IE_NAME = 'ivi:compilation'      _VALID_URL = r'^https?://(?:www\.)?ivi\.ru/watch/(?!\d+)(?P<compilationid>[a-z\d_-]+)(?:/season(?P<seasonid>\d+))?$'      def _extract_entries(self, html, compilation_id): @@ -135,22 +136,23 @@ class IviCompilationIE(InfoExtractor):          season_id = mobj.group('seasonid')          if season_id is not None: # Season link -            season_page = self._download_webpage(url, compilation_id, u'Downloading season %s web page' % season_id) +            season_page = self._download_webpage(url, compilation_id, 'Downloading season %s web page' % season_id)              playlist_id = '%s/season%s' % (compilation_id, season_id) -            playlist_title = self._html_search_meta(u'title', season_page, u'title') +            playlist_title = self._html_search_meta('title', season_page, 'title')              entries = self._extract_entries(season_page, compilation_id)          else: # Compilation link             -            compilation_page = self._download_webpage(url, compilation_id, u'Downloading compilation web page') +            compilation_page = self._download_webpage(url, compilation_id, 'Downloading compilation web page')              playlist_id = compilation_id -            playlist_title = self._html_search_meta(u'title', compilation_page, u'title') +            playlist_title = self._html_search_meta('title', compilation_page, 'title')              seasons = re.findall(r'<a href="/watch/%s/season(\d+)">[^<]+</a>' % compilation_id, compilation_page)              if len(seasons) == 0: # No seasons in this compilation                  entries = self._extract_entries(compilation_page, compilation_id)              else:                  entries = []                  for season_id in seasons: -                    season_page = self._download_webpage('http://www.ivi.ru/watch/%s/season%s' % (compilation_id, season_id), -                                                         compilation_id, u'Downloading season %s web page' % season_id) +                    season_page = self._download_webpage( +                        'http://www.ivi.ru/watch/%s/season%s' % (compilation_id, season_id), +                        compilation_id, 'Downloading season %s web page' % season_id)                      entries.extend(self._extract_entries(season_page, compilation_id))          return self.playlist_result(entries, playlist_id, playlist_title)
\ No newline at end of file diff --git a/youtube_dl/extractor/keek.py b/youtube_dl/extractor/keek.py index a7b88d2d9..5d679e88d 100644 --- a/youtube_dl/extractor/keek.py +++ b/youtube_dl/extractor/keek.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals +  import re  from .common import InfoExtractor @@ -5,36 +7,34 @@ from .common import InfoExtractor  class KeekIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)' -    IE_NAME = u'keek' +    IE_NAME = 'keek'      _TEST = { -        u'url': u'https://www.keek.com/ytdl/keeks/NODfbab', -        u'file': u'NODfbab.mp4', -        u'md5': u'9b0636f8c0f7614afa4ea5e4c6e57e83', -        u'info_dict': { -            u"uploader": u"ytdl",  -            u"title": u"test chars: \"'/\\\u00e4<>This is a test video for youtube-dl.For more information, contact phihag@phihag.de ." -        } +        'url': 'https://www.keek.com/ytdl/keeks/NODfbab', +        'file': 'NODfbab.mp4', +        'md5': '9b0636f8c0f7614afa4ea5e4c6e57e83', +        'info_dict': { +            'uploader': 'ytdl', +            'title': 'test chars: "\'/\\\u00e4<>This is a test video for youtube-dl.For more information, contact phihag@phihag.de .', +        },      }      def _real_extract(self, url):          m = re.match(self._VALID_URL, url)          video_id = m.group('videoID') -        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id -        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id +        video_url = 'http://cdn.keek.com/keek/video/%s' % video_id +        thumbnail = 'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id          webpage = self._download_webpage(url, video_id) -        video_title = self._og_search_title(webpage) - -        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', -            webpage, u'uploader', fatal=False) - -        info = { -                'id': video_id, -                'url': video_url, -                'ext': 'mp4', -                'title': video_title, -                'thumbnail': thumbnail, -                'uploader': uploader +        uploader = self._html_search_regex( +            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', +            webpage, 'uploader', fatal=False) + +        return { +            'id': video_id, +            'url': video_url, +            'ext': 'mp4', +            'title': self._og_search_title(webpage), +            'thumbnail': thumbnail, +            'uploader': uploader          } -        return [info] diff --git a/youtube_dl/extractor/la7.py b/youtube_dl/extractor/la7.py index 6d61f9a90..db2028e9f 100644 --- a/youtube_dl/extractor/la7.py +++ b/youtube_dl/extractor/la7.py @@ -26,7 +26,8 @@ class LA7IE(InfoExtractor):              'title': 'IL DIVO',              'description': 'Un film di Paolo Sorrentino con Toni Servillo, Anna Bonaiuto, Giulio Bosetti  e Flavio Bucci',              'duration': 6254, -        } +        }, +        'skip': 'Blocked in the US',      }      def _real_extract(self, url): diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 5ae57a77c..4e76c1f4a 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -1,3 +1,6 @@ +from __future__ import unicode_literals + +import json  import re  from .common import InfoExtractor @@ -7,46 +10,57 @@ from ..utils import (  class LiveLeakIE(InfoExtractor): -      _VALID_URL = r'^(?:http://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)' -    IE_NAME = u'liveleak' -    _TEST = { -        u'url': u'http://www.liveleak.com/view?i=757_1364311680', -        u'file': u'757_1364311680.mp4', -        u'md5': u'0813c2430bea7a46bf13acf3406992f4', -        u'info_dict': { -            u"description": u"extremely bad day for this guy..!",  -            u"uploader": u"ljfriel2",  -            u"title": u"Most unlucky car accident" +    _TESTS = [{ +        'url': 'http://www.liveleak.com/view?i=757_1364311680', +        'file': '757_1364311680.mp4', +        'md5': '0813c2430bea7a46bf13acf3406992f4', +        'info_dict': { +            'description': 'extremely bad day for this guy..!', +            'uploader': 'ljfriel2', +            'title': 'Most unlucky car accident'          } -    } +    }, +    { +        'url': 'http://www.liveleak.com/view?i=f93_1390833151', +        'file': 'f93_1390833151.mp4', +        'md5': 'd3f1367d14cc3c15bf24fbfbe04b9abf', +        'info_dict': { +            'description': 'German Television Channel NDR does an exclusive interview with Edward Snowden.\r\nUploaded on LiveLeak cause German Television thinks the rest of the world isn\'t intereseted in Edward Snowden.', +            'uploader': 'ARD_Stinkt', +            'title': 'German Television does first Edward Snowden Interview (ENGLISH)', +        } +    }]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        if mobj is None: -            raise ExtractorError(u'Invalid URL: %s' % url)          video_id = mobj.group('video_id') -          webpage = self._download_webpage(url, video_id) +        sources_raw = self._search_regex( +            r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None) +        if sources_raw is None: +            sources_raw = '[{ %s}]' % ( +                self._search_regex(r'(file: ".*?"),', webpage, 'video URL')) -        video_url = self._search_regex(r'file: "(.*?)",', -            webpage, u'video URL') +        sources_json = re.sub(r'\s([a-z]+):\s', r'"\1": ', sources_raw) +        sources = json.loads(sources_json) -        video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip() +        formats = [{ +            'format_note': s.get('label'), +            'url': s['file'], +        } for s in sources] +        self._sort_formats(formats) +        video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip()          video_description = self._og_search_description(webpage) +        video_uploader = self._html_search_regex( +            r'By:.*?(\w+)</a>', webpage, 'uploader', fatal=False) -        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>', -            webpage, u'uploader', fatal=False) - -        info = { -            'id':  video_id, -            'url': video_url, -            'ext': 'mp4', +        return { +            'id': video_id,              'title': video_title,              'description': video_description, -            'uploader': video_uploader +            'uploader': video_uploader, +            'formats': formats,          } - -        return [info] diff --git a/youtube_dl/extractor/malemotion.py b/youtube_dl/extractor/malemotion.py index 62e99091d..8c1966ab2 100644 --- a/youtube_dl/extractor/malemotion.py +++ b/youtube_dl/extractor/malemotion.py @@ -16,7 +16,8 @@ class MalemotionIE(InfoExtractor):          'info_dict': {              "title": "Bien dur",              "age_limit": 18, -        } +        }, +        'skip': 'This video has been deleted.'      }      def _real_extract(self, url): diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py index 4becddee6..4fa0575f8 100644 --- a/youtube_dl/extractor/myspass.py +++ b/youtube_dl/extractor/myspass.py @@ -1,3 +1,4 @@ +from __future__ import unicode_literals  import os.path  from .common import InfoExtractor @@ -11,13 +12,13 @@ from ..utils import (  class MySpassIE(InfoExtractor):      _VALID_URL = r'http://www\.myspass\.de/.*'      _TEST = { -        u'url': u'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/', -        u'file': u'11741.mp4', -        u'md5': u'0b49f4844a068f8b33f4b7c88405862b', -        u'info_dict': { -            u"description": u"Wer kann in die Fu\u00dfstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?",  -            u"title": u"Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2" -        } +        'url': 'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/', +        'file': '11741.mp4', +        'md5': '0b49f4844a068f8b33f4b7c88405862b', +        'info_dict': { +            "description": "Wer kann in die Fu\u00dfstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?", +            "title": "Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2", +        },      }      def _real_extract(self, url): @@ -37,12 +38,11 @@ class MySpassIE(InfoExtractor):          # extract values from metadata          url_flv_el = metadata.find('url_flv')          if url_flv_el is None: -            raise ExtractorError(u'Unable to extract download url') +            raise ExtractorError('Unable to extract download url')          video_url = url_flv_el.text -        extension = os.path.splitext(video_url)[1][1:]          title_el = metadata.find('title')          if title_el is None: -            raise ExtractorError(u'Unable to extract title') +            raise ExtractorError('Unable to extract title')          title = title_el.text          format_id_el = metadata.find('format_id')          if format_id_el is None: @@ -59,13 +59,12 @@ class MySpassIE(InfoExtractor):              thumbnail = imagePreview_el.text          else:              thumbnail = None -        info = { + +        return {              'id': video_id,              'url': video_url,              'title': title, -            'ext': extension,              'format': format,              'thumbnail': thumbnail, -            'description': description +            'description': description,          } -        return [info] diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 0f178905b..7e421610e 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -1,48 +1,39 @@ +from __future__ import unicode_literals +  import re  from .common import InfoExtractor -from ..utils import ( -    ExtractorError, -)  class NBAIE(InfoExtractor):      _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'      _TEST = { -        u'url': u'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', -        u'file': u'0021200253-okc-bkn-recap.nba.mp4', -        u'md5': u'c0edcfc37607344e2ff8f13c378c88a4', -        u'info_dict': { -            u"description": u"Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.",  -            u"title": u"Thunder vs. Nets" -        } +        'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', +        'file': u'0021200253-okc-bkn-recap.nba.mp4', +        'md5': u'c0edcfc37607344e2ff8f13c378c88a4', +        'info_dict': { +            'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.', +            'title': 'Thunder vs. Nets', +        },      }      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        if mobj is None: -            raise ExtractorError(u'Invalid URL: %s' % url) -          video_id = mobj.group(1)          webpage = self._download_webpage(url, video_id) -        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' +        video_url = 'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'          shortened_video_id = video_id.rpartition('/')[2]          title = self._og_search_title(webpage, default=shortened_video_id).replace('NBA.com: ', '') -        # It isn't there in the HTML it returns to us -        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False) -          description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False) -        info = { +        return {              'id': shortened_video_id,              'url': video_url,              'ext': 'mp4',              'title': title, -            # 'uploader_date': uploader_date,              'description': description,          } -        return [info] diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py index ea986c00e..2b7236be5 100644 --- a/youtube_dl/extractor/ninegag.py +++ b/youtube_dl/extractor/ninegag.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals +  import json  import re @@ -9,13 +11,13 @@ class NineGagIE(InfoExtractor):      _VALID_URL = r'^https?://(?:www\.)?9gag\.tv/v/(?P<id>[0-9]+)'      _TEST = { -        u"url": u"http://9gag.tv/v/1912", -        u"file": u"1912.mp4", -        u"info_dict": { -            u"description": u"This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)", -            u"title": u"\"People Are Awesome 2013\" Is Absolutely Awesome" +        "url": "http://9gag.tv/v/1912", +        "file": "1912.mp4", +        "info_dict": { +            "description": "This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)", +            "title": "\"People Are Awesome 2013\" Is Absolutely Awesome"          }, -        u'add_ie': [u'Youtube'] +        'add_ie': ['Youtube']      }      def _real_extract(self, url): @@ -25,7 +27,7 @@ class NineGagIE(InfoExtractor):          webpage = self._download_webpage(url, video_id)          data_json = self._html_search_regex(r'''(?x)              <div\s*id="tv-video"\s*data-video-source="youtube"\s* -                data-video-meta="([^"]+)"''', webpage, u'video metadata') +                data-video-meta="([^"]+)"''', webpage, 'video metadata')          data = json.loads(data_json) diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index d08e47734..44312ba4e 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -5,7 +5,7 @@ from .common import InfoExtractor  from ..utils import unescapeHTML  class OoyalaIE(InfoExtractor): -    _VALID_URL = r'https?://.+?\.ooyala\.com/.*?embedCode=(?P<id>.+?)(&|$)' +    _VALID_URL = r'https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=(?P<id>.+?)(&|$)'      _TEST = {          # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index e9ff8d1af..58f9c690e 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals +  import re  from .common import InfoExtractor @@ -7,12 +9,12 @@ from ..utils import compat_urllib_parse  class PornHdIE(InfoExtractor):      _VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)'      _TEST = { -        u'url': u'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', -        u'file': u'1962.flv', -        u'md5': u'35272469887dca97abd30abecc6cdf75', -        u'info_dict': { -            u"title": u"sierra-day-gets-his-cum-all-over-herself-hd-porn-video", -            u"age_limit": 18, +        'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', +        'file': '1962.flv', +        'md5': '35272469887dca97abd30abecc6cdf75', +        'info_dict': { +            "title": "sierra-day-gets-his-cum-all-over-herself-hd-porn-video", +            "age_limit": 18,          }      } @@ -24,9 +26,13 @@ class PornHdIE(InfoExtractor):          webpage = self._download_webpage(url, video_id) -        video_url = self._html_search_regex( -            r'&hd=(http.+?)&', webpage, u'video URL') -        video_url = compat_urllib_parse.unquote(video_url) +        next_url = self._html_search_regex( +            r'&hd=(http.+?)&', webpage, 'video URL') +        next_url = compat_urllib_parse.unquote(next_url) + +        video_url = self._download_webpage( +            next_url, video_id, note='Retrieving video URL', +            errnote='Could not retrieve video URL')          age_limit = 18          return { diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py index 4b6147a73..b9cb7abd1 100644 --- a/youtube_dl/extractor/rbmaradio.py +++ b/youtube_dl/extractor/rbmaradio.py @@ -1,3 +1,6 @@ +# encoding: utf-8 +from __future__ import unicode_literals +  import json  import re @@ -12,16 +15,16 @@ from ..utils import (  class RBMARadioIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'      _TEST = { -        u'url': u'http://www.rbmaradio.com/shows/ford-lopatin-live-at-primavera-sound-2011', -        u'file': u'ford-lopatin-live-at-primavera-sound-2011.mp3', -        u'md5': u'6bc6f9bcb18994b4c983bc3bf4384d95', -        u'info_dict': { -            u"uploader_id": u"ford-lopatin",  -            u"location": u"Spain",  -            u"description": u"Joel Ford and Daniel \u2019Oneohtrix Point Never\u2019 Lopatin fly their midified pop extravaganza to Spain. Live at Primavera Sound 2011.",  -            u"uploader": u"Ford & Lopatin",  -            u"title": u"Live at Primavera Sound 2011" -        } +        'url': 'http://www.rbmaradio.com/shows/ford-lopatin-live-at-primavera-sound-2011', +        'file': 'ford-lopatin-live-at-primavera-sound-2011.mp3', +        'md5': '6bc6f9bcb18994b4c983bc3bf4384d95', +        'info_dict': { +            "uploader_id": "ford-lopatin", +            "location": "Spain", +            "description": "Joel Ford and Daniel ’Oneohtrix Point Never’ Lopatin fly their midified pop extravaganza to Spain. Live at Primavera Sound 2011.", +            "uploader": "Ford & Lopatin", +            "title": "Live at Primavera Sound 2011", +        },      }      def _real_extract(self, url): @@ -31,26 +34,24 @@ class RBMARadioIE(InfoExtractor):          webpage = self._download_webpage(url, video_id)          json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$', -            webpage, u'json data', flags=re.MULTILINE) +            webpage, 'json data', flags=re.MULTILINE)          try:              data = json.loads(json_data)          except ValueError as e: -            raise ExtractorError(u'Invalid JSON: ' + str(e)) +            raise ExtractorError('Invalid JSON: ' + str(e))          video_url = data['akamai_url'] + '&cbr=256'          url_parts = compat_urllib_parse_urlparse(video_url) -        video_ext = url_parts.path.rpartition('.')[2] -        info = { -                'id': video_id, -                'url': video_url, -                'ext': video_ext, -                'title': data['title'], -                'description': data.get('teaser_text'), -                'location': data.get('country_of_origin'), -                'uploader': data.get('host', {}).get('name'), -                'uploader_id': data.get('host', {}).get('slug'), -                'thumbnail': data.get('image', {}).get('large_url_2x'), -                'duration': data.get('duration'), + +        return { +            'id': video_id, +            'url': video_url, +            'title': data['title'], +            'description': data.get('teaser_text'), +            'location': data.get('country_of_origin'), +            'uploader': data.get('host', {}).get('name'), +            'uploader_id': data.get('host', {}).get('slug'), +            'thumbnail': data.get('image', {}).get('large_url_2x'), +            'duration': data.get('duration'),          } -        return [info] diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index e3e9bc07f..4922dd764 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -1,58 +1,124 @@  # encoding: utf-8 +from __future__ import unicode_literals +  import re  import json +import itertools  from .common import InfoExtractor  from ..utils import ( -    compat_urlparse,      compat_str, +    unified_strdate,      ExtractorError,  )  class RutubeIE(InfoExtractor): -    _VALID_URL = r'https?://rutube\.ru/video/(?P<long_id>\w+)' +    IE_NAME = 'rutube' +    IE_DESC = 'Rutube videos' +    _VALID_URL = r'https?://rutube\.ru/video/(?P<id>[\da-z]{32})'      _TEST = { -        u'url': u'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', -        u'file': u'3eac3b4561676c17df9132a9a1e62e3e.mp4', -        u'info_dict': { -            u'title': u'Раненный кенгуру забежал в аптеку', -            u'uploader': u'NTDRussian', -            u'uploader_id': u'29790', +        'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', +        'file': '3eac3b4561676c17df9132a9a1e62e3e.mp4', +        'info_dict': { +            'title': 'Раненный кенгуру забежал в аптеку', +            'description': 'http://www.ntdtv.ru ', +            'duration': 80, +            'uploader': 'NTDRussian', +            'uploader_id': '29790', +            'upload_date': '20131016',          }, -        u'params': { +        'params': {              # It requires ffmpeg (m3u8 download) -            u'skip_download': True, +            'skip_download': True,          },      } -    def _get_api_response(self, short_id, subpath): -        api_url = 'http://rutube.ru/api/play/%s/%s/?format=json' % (subpath, short_id) -        response_json = self._download_webpage(api_url, short_id, -            u'Downloading %s json' % subpath) -        return json.loads(response_json) -      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        long_id = mobj.group('long_id') -        webpage = self._download_webpage(url, long_id) -        og_video = self._og_search_video_url(webpage) -        short_id = compat_urlparse.urlparse(og_video).path[1:] -        options = self._get_api_response(short_id, 'options') -        trackinfo = self._get_api_response(short_id, 'trackinfo') +        video_id = mobj.group('id') +         +        api_response = self._download_webpage('http://rutube.ru/api/video/%s/?format=json' % video_id, +                                              video_id, 'Downloading video JSON') +        video = json.loads(api_response) +         +        api_response = self._download_webpage('http://rutube.ru/api/play/trackinfo/%s/?format=json' % video_id, +                                              video_id, 'Downloading trackinfo JSON') +        trackinfo = json.loads(api_response) +                  # Some videos don't have the author field          author = trackinfo.get('author') or {}          m3u8_url = trackinfo['video_balancer'].get('m3u8')          if m3u8_url is None: -            raise ExtractorError(u'Couldn\'t find m3u8 manifest url') +            raise ExtractorError('Couldn\'t find m3u8 manifest url')          return { -            'id': trackinfo['id'], -            'title': trackinfo['title'], +            'id': video['id'], +            'title': video['title'], +            'description': video['description'], +            'duration': video['duration'], +            'view_count': video['hits'],              'url': m3u8_url,              'ext': 'mp4', -            'thumbnail': options['thumbnail_url'], +            'thumbnail': video['thumbnail_url'],              'uploader': author.get('name'),              'uploader_id': compat_str(author['id']) if author else None, +            'upload_date': unified_strdate(video['created_ts']), +            'age_limit': 18 if video['is_adult'] else 0,          } + + +class RutubeChannelIE(InfoExtractor): +    IE_NAME = 'rutube:channel' +    IE_DESC = 'Rutube channels' +    _VALID_URL = r'http://rutube\.ru/tags/video/(?P<id>\d+)' + +    _PAGE_TEMPLATE = 'http://rutube.ru/api/tags/video/%s/?page=%s&format=json' + +    def _extract_videos(self, channel_id, channel_title=None): +        entries = [] +        for pagenum in itertools.count(1): +            api_response = self._download_webpage( +                self._PAGE_TEMPLATE % (channel_id, pagenum), +                channel_id, 'Downloading page %s' % pagenum) +            page = json.loads(api_response) +            results = page['results'] +            if not results: +                break +            entries.extend(self.url_result(result['video_url'], 'Rutube') for result in results) +            if not page['has_next']: +                break +        return self.playlist_result(entries, channel_id, channel_title) + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        channel_id = mobj.group('id') +        return self._extract_videos(channel_id) + + +class RutubeMovieIE(RutubeChannelIE): +    IE_NAME = 'rutube:movie' +    IE_DESC = 'Rutube movies' +    _VALID_URL = r'http://rutube\.ru/metainfo/tv/(?P<id>\d+)' + +    _MOVIE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/?format=json' +    _PAGE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/video?page=%s&format=json' + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        movie_id = mobj.group('id') +        api_response = self._download_webpage( +            self._MOVIE_TEMPLATE % movie_id, movie_id, +            'Downloading movie JSON') +        movie = json.loads(api_response) +        movie_name = movie['name'] +        return self._extract_videos(movie_id, movie_name) + + +class RutubePersonIE(RutubeChannelIE): +    IE_NAME = 'rutube:person' +    IE_DESC = 'Rutube person videos' +    _VALID_URL = r'http://rutube\.ru/video/person/(?P<id>\d+)' + +    _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json' diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index 99f5b19d2..f249f013c 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -1,4 +1,5 @@  # encoding: utf-8 +from __future__ import unicode_literals  import os.path  import re @@ -16,76 +17,76 @@ from ..utils import (  class SmotriIE(InfoExtractor): -    IE_DESC = u'Smotri.com' -    IE_NAME = u'smotri' +    IE_DESC = 'Smotri.com' +    IE_NAME = 'smotri'      _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/video/view/\?id=(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4}))'      _TESTS = [          # real video id 2610366          { -            u'url': u'http://smotri.com/video/view/?id=v261036632ab', -            u'file': u'v261036632ab.mp4', -            u'md5': u'2a7b08249e6f5636557579c368040eb9', -            u'info_dict': { -                u'title': u'катастрофа с камер видеонаблюдения', -                u'uploader': u'rbc2008', -                u'uploader_id': u'rbc08', -                u'upload_date': u'20131118', -                u'description': u'катастрофа с камер видеонаблюдения, видео катастрофа с камер видеонаблюдения', -                u'thumbnail': u'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg', +            'url': 'http://smotri.com/video/view/?id=v261036632ab', +            'file': 'v261036632ab.mp4', +            'md5': '2a7b08249e6f5636557579c368040eb9', +            'info_dict': { +                'title': 'катастрофа с камер видеонаблюдения', +                'uploader': 'rbc2008', +                'uploader_id': 'rbc08', +                'upload_date': '20131118', +                'description': 'катастрофа с камер видеонаблюдения, видео катастрофа с камер видеонаблюдения', +                'thumbnail': 'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg',              },          },          # real video id 57591          { -            u'url': u'http://smotri.com/video/view/?id=v57591cb20', -            u'file': u'v57591cb20.flv', -            u'md5': u'830266dfc21f077eac5afd1883091bcd', -            u'info_dict': { -                u'title': u'test', -                u'uploader': u'Support Photofile@photofile', -                u'uploader_id': u'support-photofile', -                u'upload_date': u'20070704', -                u'description': u'test, видео test', -                u'thumbnail': u'http://frame4.loadup.ru/03/ed/57591.2.3.jpg', +            'url': 'http://smotri.com/video/view/?id=v57591cb20', +            'file': 'v57591cb20.flv', +            'md5': '830266dfc21f077eac5afd1883091bcd', +            'info_dict': { +                'title': 'test', +                'uploader': 'Support Photofile@photofile', +                'uploader_id': 'support-photofile', +                'upload_date': '20070704', +                'description': 'test, видео test', +                'thumbnail': 'http://frame4.loadup.ru/03/ed/57591.2.3.jpg',              },          },          # video-password          { -            u'url': u'http://smotri.com/video/view/?id=v1390466a13c', -            u'file': u'v1390466a13c.mp4', -            u'md5': u'f6331cef33cad65a0815ee482a54440b', -            u'info_dict': { -                u'title': u'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1', -                u'uploader': u'timoxa40', -                u'uploader_id': u'timoxa40', -                u'upload_date': u'20100404', -                u'thumbnail': u'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg', -                u'description': u'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1, видео TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1', +            'url': 'http://smotri.com/video/view/?id=v1390466a13c', +            'file': 'v1390466a13c.mp4', +            'md5': 'f6331cef33cad65a0815ee482a54440b', +            'info_dict': { +                'title': 'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1', +                'uploader': 'timoxa40', +                'uploader_id': 'timoxa40', +                'upload_date': '20100404', +                'thumbnail': 'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg', +                'description': 'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1, видео TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1',              }, -            u'params': { -                u'videopassword': u'qwerty', +            'params': { +                'videopassword': 'qwerty',              },          },          # age limit + video-password          { -            u'url': u'http://smotri.com/video/view/?id=v15408898bcf', -            u'file': u'v15408898bcf.flv', -            u'md5': u'91e909c9f0521adf5ee86fbe073aad70', -            u'info_dict': { -                u'title': u'этот ролик не покажут по ТВ', -                u'uploader': u'zzxxx', -                u'uploader_id': u'ueggb', -                u'upload_date': u'20101001', -                u'thumbnail': u'http://frame3.loadup.ru/75/75/1540889.1.3.jpg', -                u'age_limit': 18, -                u'description': u'этот ролик не покажут по ТВ, видео этот ролик не покажут по ТВ', +            'url': 'http://smotri.com/video/view/?id=v15408898bcf', +            'file': 'v15408898bcf.flv', +            'md5': '91e909c9f0521adf5ee86fbe073aad70', +            'info_dict': { +                'title': 'этот ролик не покажут по ТВ', +                'uploader': 'zzxxx', +                'uploader_id': 'ueggb', +                'upload_date': '20101001', +                'thumbnail': 'http://frame3.loadup.ru/75/75/1540889.1.3.jpg', +                'age_limit': 18, +                'description': 'этот ролик не покажут по ТВ, видео этот ролик не покажут по ТВ',              }, -            u'params': { -                u'videopassword': u'333' +            'params': { +                'videopassword': '333'              }          }      ] -     +      _SUCCESS = 0      _PASSWORD_NOT_VERIFIED = 1      _PASSWORD_DETECTED = 2 @@ -106,71 +107,71 @@ class SmotriIE(InfoExtractor):          # Download video JSON data          video_json_url = 'http://smotri.com/vt.php?id=%s' % real_video_id -        video_json_page = self._download_webpage(video_json_url, video_id, u'Downloading video JSON') +        video_json_page = self._download_webpage(video_json_url, video_id, 'Downloading video JSON')          video_json = json.loads(video_json_page) -         +          status = video_json['status']          if status == self._VIDEO_NOT_FOUND: -            raise ExtractorError(u'Video %s does not exist' % video_id, expected=True) -        elif status == self._PASSWORD_DETECTED:  # The video is protected by a password, retry with +            raise ExtractorError('Video %s does not exist' % video_id, expected=True) +        elif status == self._PASSWORD_DETECTED: # The video is protected by a password, retry with                                                  # video-password set              video_password = self._downloader.params.get('videopassword', None)              if not video_password: -                raise ExtractorError(u'This video is protected by a password, use the --video-password option', expected=True) +                raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)              video_json_url += '&md5pass=%s' % hashlib.md5(video_password.encode('utf-8')).hexdigest() -            video_json_page = self._download_webpage(video_json_url, video_id, u'Downloading video JSON (video-password set)') +            video_json_page = self._download_webpage(video_json_url, video_id, 'Downloading video JSON (video-password set)')              video_json = json.loads(video_json_page)              status = video_json['status']              if status == self._PASSWORD_NOT_VERIFIED: -                raise ExtractorError(u'Video password is invalid', expected=True) -         +                raise ExtractorError('Video password is invalid', expected=True) +          if status != self._SUCCESS: -            raise ExtractorError(u'Unexpected status value %s' % status) -         +            raise ExtractorError('Unexpected status value %s' % status) +          # Extract the URL of the video          video_url = video_json['file_data'] -         +          # Video JSON does not provide enough meta data          # We will extract some from the video web page instead          video_page_url = 'http://' + mobj.group('url') -        video_page = self._download_webpage(video_page_url, video_id, u'Downloading video page') +        video_page = self._download_webpage(video_page_url, video_id, 'Downloading video page')          # Warning if video is unavailable          warning = self._html_search_regex(              r'<div class="videoUnModer">(.*?)</div>', video_page, -            u'warning message', default=None) +            'warning message', default=None)          if warning is not None:              self._downloader.report_warning( -                u'Video %s may not be available; smotri said: %s ' % +                'Video %s may not be available; smotri said: %s ' %                  (video_id, warning))          # Adult content -        if re.search(u'EroConfirmText">', video_page) is not None: +        if re.search('EroConfirmText">', video_page) is not None:              self.report_age_confirmation()              confirm_string = self._html_search_regex(                  r'<a href="/video/view/\?id=%s&confirm=([^"]+)" title="[^"]+">' % video_id, -                video_page, u'confirm string') +                video_page, 'confirm string')              confirm_url = video_page_url + '&confirm=%s' % confirm_string -            video_page = self._download_webpage(confirm_url, video_id, u'Downloading video page (age confirmed)') +            video_page = self._download_webpage(confirm_url, video_id, 'Downloading video page (age confirmed)')              adult_content = True          else:              adult_content = False -         +          # Extract the rest of meta data -        video_title = self._search_meta(u'name', video_page, u'title') +        video_title = self._search_meta('name', video_page, 'title')          if not video_title:              video_title = os.path.splitext(url_basename(video_url))[0] -        video_description = self._search_meta(u'description', video_page) -        END_TEXT = u' на сайте Smotri.com' +        video_description = self._search_meta('description', video_page) +        END_TEXT = ' на сайте Smotri.com'          if video_description and video_description.endswith(END_TEXT):              video_description = video_description[:-len(END_TEXT)] -        START_TEXT = u'Смотреть онлайн ролик ' +        START_TEXT = 'Смотреть онлайн ролик '          if video_description and video_description.startswith(START_TEXT):              video_description = video_description[len(START_TEXT):] -        video_thumbnail = self._search_meta(u'thumbnail', video_page) +        video_thumbnail = self._search_meta('thumbnail', video_page) -        upload_date_str = self._search_meta(u'uploadDate', video_page, u'upload date') +        upload_date_str = self._search_meta('uploadDate', video_page, 'upload date')          if upload_date_str:              upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str)              video_upload_date = ( @@ -183,8 +184,8 @@ class SmotriIE(InfoExtractor):              )          else:              video_upload_date = None -         -        duration_str = self._search_meta(u'duration', video_page) + +        duration_str = self._search_meta('duration', video_page)          if duration_str:              duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str)              video_duration = ( @@ -197,19 +198,19 @@ class SmotriIE(InfoExtractor):              )          else:              video_duration = None -         +          video_uploader = self._html_search_regex( -            u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>', -            video_page, u'uploader', fatal=False, flags=re.MULTILINE|re.DOTALL) -         +            '<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>', +            video_page, 'uploader', fatal=False, flags=re.MULTILINE|re.DOTALL) +          video_uploader_id = self._html_search_regex( -            u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info\\(.*?\'([^\']+)\'\\);">', -            video_page, u'uploader id', fatal=False, flags=re.MULTILINE|re.DOTALL) -         +            '<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info\\(.*?\'([^\']+)\'\\);">', +            video_page, 'uploader id', fatal=False, flags=re.MULTILINE|re.DOTALL) +          video_view_count = self._html_search_regex( -            u'Общее количество просмотров.*?<span class="Number">(\\d+)</span>', -            video_page, u'view count', fatal=False, flags=re.MULTILINE|re.DOTALL) -                 +            'Общее количество просмотров.*?<span class="Number">(\\d+)</span>', +            video_page, 'view count', fatal=False, flags=re.MULTILINE|re.DOTALL) +          return {              'id': video_id,              'url': video_url, @@ -227,8 +228,8 @@ class SmotriIE(InfoExtractor):  class SmotriCommunityIE(InfoExtractor): -    IE_DESC = u'Smotri.com community videos' -    IE_NAME = u'smotri:community' +    IE_DESC = 'Smotri.com community videos' +    IE_NAME = 'smotri:community'      _VALID_URL = r'^https?://(?:www\.)?smotri\.com/community/video/(?P<communityid>[0-9A-Za-z_\'-]+)'      def _real_extract(self, url): @@ -236,21 +237,21 @@ class SmotriCommunityIE(InfoExtractor):          community_id = mobj.group('communityid')          url = 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id -        rss = self._download_xml(url, community_id, u'Downloading community RSS') +        rss = self._download_xml(url, community_id, 'Downloading community RSS')          entries = [self.url_result(video_url.text, 'Smotri')                     for video_url in rss.findall('./channel/item/link')]          description_text = rss.find('./channel/description').text          community_title = self._html_search_regex( -            u'^Видео сообщества "([^"]+)"$', description_text, u'community title') +            '^Видео сообщества "([^"]+)"$', description_text, 'community title')          return self.playlist_result(entries, community_id, community_title)  class SmotriUserIE(InfoExtractor): -    IE_DESC = u'Smotri.com user videos' -    IE_NAME = u'smotri:user' +    IE_DESC = 'Smotri.com user videos' +    IE_NAME = 'smotri:user'      _VALID_URL = r'^https?://(?:www\.)?smotri\.com/user/(?P<userid>[0-9A-Za-z_\'-]+)'      def _real_extract(self, url): @@ -258,22 +259,22 @@ class SmotriUserIE(InfoExtractor):          user_id = mobj.group('userid')          url = 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id -        rss = self._download_xml(url, user_id, u'Downloading user RSS') +        rss = self._download_xml(url, user_id, 'Downloading user RSS')          entries = [self.url_result(video_url.text, 'Smotri')                     for video_url in rss.findall('./channel/item/link')]          description_text = rss.find('./channel/description').text          user_nickname = self._html_search_regex( -            u'^Видео режиссера (.*)$', description_text, -            u'user nickname') +            '^Видео режиссера (.*)$', description_text, +            'user nickname')          return self.playlist_result(entries, user_id, user_nickname)  class SmotriBroadcastIE(InfoExtractor): -    IE_DESC = u'Smotri.com broadcasts' -    IE_NAME = u'smotri:broadcast' +    IE_DESC = 'Smotri.com broadcasts' +    IE_NAME = 'smotri:broadcast'      _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<broadcastid>[^/]+))/?.*'      def _real_extract(self, url): @@ -281,46 +282,40 @@ class SmotriBroadcastIE(InfoExtractor):          broadcast_id = mobj.group('broadcastid')          broadcast_url = 'http://' + mobj.group('url') -        broadcast_page = self._download_webpage(broadcast_url, broadcast_id, u'Downloading broadcast page') +        broadcast_page = self._download_webpage(broadcast_url, broadcast_id, 'Downloading broadcast page') -        if re.search(u'>Режиссер с логином <br/>"%s"<br/> <span>не существует<' % broadcast_id, broadcast_page) is not None: -            raise ExtractorError(u'Broadcast %s does not exist' % broadcast_id, expected=True) +        if re.search('>Режиссер с логином <br/>"%s"<br/> <span>не существует<' % broadcast_id, broadcast_page) is not None: +            raise ExtractorError('Broadcast %s does not exist' % broadcast_id, expected=True)          # Adult content -        if re.search(u'EroConfirmText">', broadcast_page) is not None: +        if re.search('EroConfirmText">', broadcast_page) is not None:              (username, password) = self._get_login_info()              if username is None: -                raise ExtractorError(u'Erotic broadcasts allowed only for registered users, ' -                    u'use --username and --password options to provide account credentials.', expected=True) - -            # Log in -            login_form_strs = { -                u'login-hint53': '1', -                u'confirm_erotic': '1', -                u'login': username, -                u'password': password, +                raise ExtractorError('Erotic broadcasts allowed only for registered users, ' +                    'use --username and --password options to provide account credentials.', expected=True) + +            login_form = { +                'login-hint53': '1', +                'confirm_erotic': '1', +                'login': username, +                'password': password,              } -            # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode -            # chokes on unicode -            login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items()) -            login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8') -            login_url = broadcast_url + '/?no_redirect=1' -            request = compat_urllib_request.Request(login_url, login_data) + +            request = compat_urllib_request.Request(broadcast_url + '/?no_redirect=1', compat_urllib_parse.urlencode(login_form))              request.add_header('Content-Type', 'application/x-www-form-urlencoded') -            broadcast_page = self._download_webpage( -                request, broadcast_id, note=u'Logging in and confirming age') +            broadcast_page = self._download_webpage(request, broadcast_id, 'Logging in and confirming age') -            if re.search(u'>Неверный логин или пароль<', broadcast_page) is not None: -                raise ExtractorError(u'Unable to log in: bad username or password', expected=True) +            if re.search('>Неверный логин или пароль<', broadcast_page) is not None: +                raise ExtractorError('Unable to log in: bad username or password', expected=True)              adult_content = True          else:              adult_content = False          ticket = self._html_search_regex( -            u'window\.broadcast_control\.addFlashVar\\(\'file\', \'([^\']+)\'\\);', -            broadcast_page, u'broadcast ticket') +            'window\.broadcast_control\.addFlashVar\\(\'file\', \'([^\']+)\'\\);', +            broadcast_page, 'broadcast ticket')          url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket @@ -328,22 +323,22 @@ class SmotriBroadcastIE(InfoExtractor):          if broadcast_password:              url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest() -        broadcast_json_page = self._download_webpage(url, broadcast_id, u'Downloading broadcast JSON') +        broadcast_json_page = self._download_webpage(url, broadcast_id, 'Downloading broadcast JSON')          try:              broadcast_json = json.loads(broadcast_json_page)              protected_broadcast = broadcast_json['_pass_protected'] == 1              if protected_broadcast and not broadcast_password: -                raise ExtractorError(u'This broadcast is protected by a password, use the --video-password option', expected=True) +                raise ExtractorError('This broadcast is protected by a password, use the --video-password option', expected=True)              broadcast_offline = broadcast_json['is_play'] == 0              if broadcast_offline: -                raise ExtractorError(u'Broadcast %s is offline' % broadcast_id, expected=True) +                raise ExtractorError('Broadcast %s is offline' % broadcast_id, expected=True)              rtmp_url = broadcast_json['_server']              if not rtmp_url.startswith('rtmp://'): -                raise ExtractorError(u'Unexpected broadcast rtmp URL') +                raise ExtractorError('Unexpected broadcast rtmp URL')              broadcast_playpath = broadcast_json['_streamName']              broadcast_thumbnail = broadcast_json['_imgURL'] @@ -354,8 +349,8 @@ class SmotriBroadcastIE(InfoExtractor):              rtmp_conn = 'S:%s' % uuid.uuid4().hex          except KeyError:              if protected_broadcast: -                raise ExtractorError(u'Bad broadcast password', expected=True) -            raise ExtractorError(u'Unexpected broadcast JSON') +                raise ExtractorError('Bad broadcast password', expected=True) +            raise ExtractorError('Unexpected broadcast JSON')          return {              'id': broadcast_id, diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index f7bc77c48..544369068 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -9,7 +9,7 @@ from ..utils import (  class TumblrIE(InfoExtractor): -    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)' +    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)($|/)'      _TEST = {          'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes',          'file': '54196191430.mp4', diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py index 4e404fbf5..c980153ec 100644 --- a/youtube_dl/extractor/tutv.py +++ b/youtube_dl/extractor/tutv.py @@ -1,3 +1,4 @@ +from __future__ import unicode_literals  import base64  import re @@ -6,15 +7,16 @@ from ..utils import (      compat_parse_qs,  ) +  class TutvIE(InfoExtractor): -    _VALID_URL=r'https?://(?:www\.)?tu\.tv/videos/(?P<id>[^/?]+)' +    _VALID_URL = r'https?://(?:www\.)?tu\.tv/videos/(?P<id>[^/?]+)'      _TEST = { -        u'url': u'http://tu.tv/videos/noah-en-pabellon-cuahutemoc', -        u'file': u'2742556.flv', -        u'md5': u'5eb766671f69b82e528dc1e7769c5cb2', -        u'info_dict': { -            u"title": u"Noah en pabellon cuahutemoc" -        } +        'url': 'http://tu.tv/videos/noah-en-pabellon-cuahutemoc', +        'file': '2742556.flv', +        'md5': '5eb766671f69b82e528dc1e7769c5cb2', +        'info_dict': { +            'title': 'Noah en pabellon cuahutemoc', +        },      }      def _real_extract(self, url): @@ -22,18 +24,15 @@ class TutvIE(InfoExtractor):          video_id = mobj.group('id')          webpage = self._download_webpage(url, video_id) -        internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, u'internal video ID') +        internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, 'internal video ID') -        data_url = u'http://tu.tv/flvurl.php?codVideo=' + str(internal_id) -        data_content = self._download_webpage(data_url, video_id, note=u'Downloading video info') +        data_url = 'http://tu.tv/flvurl.php?codVideo=' + str(internal_id) +        data_content = self._download_webpage(data_url, video_id, note='Downloading video info')          data = compat_parse_qs(data_content)          video_url = base64.b64decode(data['kpt'][0]).decode('utf-8') -        ext = video_url.partition(u'?')[0].rpartition(u'.')[2] -        info = { +        return {              'id': internal_id,              'url': video_url, -            'ext': ext,              'title': self._og_search_title(webpage),          } -        return [info] diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index e971b5b4b..fcb5ff758 100644 --- a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals +  import re  from .common import InfoExtractor @@ -9,12 +11,12 @@ from ..utils import (  class YouJizzIE(InfoExtractor):      _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+)\.html$'      _TEST = { -        u'url': u'http://www.youjizz.com/videos/zeichentrick-1-2189178.html', -        u'file': u'2189178.flv', -        u'md5': u'07e15fa469ba384c7693fd246905547c', -        u'info_dict': { -            u"title": u"Zeichentrick 1", -            u"age_limit": 18, +        'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html', +        'file': '2189178.flv', +        'md5': '07e15fa469ba384c7693fd246905547c', +        'info_dict': { +            "title": "Zeichentrick 1", +            "age_limit": 18,          }      } @@ -30,12 +32,12 @@ class YouJizzIE(InfoExtractor):          # Get the video title          video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>', -            webpage, u'title').strip() +            webpage, 'title').strip()          # Get the embed page          result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)          if result is None: -            raise ExtractorError(u'ERROR: unable to extract embed page') +            raise ExtractorError('ERROR: unable to extract embed page')          embed_page_url = result.group(0).strip()          video_id = result.group('videoid') @@ -47,23 +49,23 @@ class YouJizzIE(InfoExtractor):          if m_playlist is not None:              playlist_url = m_playlist.group('playlist')              playlist_page = self._download_webpage(playlist_url, video_id, -                                                   u'Downloading playlist page') +                                                   'Downloading playlist page')              m_levels = list(re.finditer(r'<level bitrate="(\d+?)" file="(.*?)"', playlist_page))              if len(m_levels) == 0: -                raise ExtractorError(u'Unable to extract video url') +                raise ExtractorError('Unable to extract video url')              videos = [(int(m.group(1)), m.group(2)) for m in m_levels]              (_, video_url) = sorted(videos)[0]              video_url = video_url.replace('%252F', '%2F')          else:              video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', -                                           webpage, u'video URL') - -        info = {'id': video_id, -                'url': video_url, -                'title': video_title, -                'ext': 'flv', -                'format': 'flv', -                'player_url': embed_page_url, -                'age_limit': age_limit} +                                           webpage, 'video URL') -        return [info] +        return { +            'id': video_id, +            'url': video_url, +            'title': video_title, +            'ext': 'flv', +            'format': 'flv', +            'player_url': embed_page_url, +            'age_limit': age_limit, +        } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 87a5a452e..54592d174 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1662,7 +1662,7 @@ class YoutubeUserIE(InfoExtractor):                      '_type': 'url',                      'url': video_id,                      'ie_key': 'Youtube', -                    'id': 'video_id', +                    'id': video_id,                      'title': title,                  }          url_results = PagedList(download_page, self._GDATA_PAGE_SIZE) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index dd3c37007..8b8a3977a 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.01.27.1' +__version__ = '2014.01.30.1' | 
