diff options
| -rw-r--r-- | test/test_all_urls.py | 6 | ||||
| -rw-r--r-- | test/test_playlists.py | 1 | ||||
| -rw-r--r-- | test/test_utils.py | 4 | ||||
| -rw-r--r-- | youtube_dl/__init__.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/breakcom.py | 7 | ||||
| -rw-r--r-- | youtube_dl/extractor/dailymotion.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/keezmovies.py | 26 | ||||
| -rw-r--r-- | youtube_dl/extractor/morningstar.py | 8 | ||||
| -rw-r--r-- | youtube_dl/extractor/motorsport.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/pornhd.py | 77 | ||||
| -rw-r--r-- | youtube_dl/extractor/ro220.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/rts.py | 13 | ||||
| -rw-r--r-- | youtube_dl/extractor/teamcoco.py | 27 | ||||
| -rw-r--r-- | youtube_dl/extractor/yahoo.py | 38 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 5 | ||||
| -rw-r--r-- | youtube_dl/version.py | 2 | 
16 files changed, 139 insertions, 82 deletions
| diff --git a/test/test_all_urls.py b/test/test_all_urls.py index bea8c41fb..577f6ac32 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -157,5 +157,11 @@ class TestAllURLsMatching(unittest.TestCase):              'http://thedailyshow.cc.com/guests/michael-lewis/3efna8/exclusive---michael-lewis-extended-interview-pt--3',              ['ComedyCentralShows']) +    def test_yahoo_https(self): +        # https://github.com/rg3/youtube-dl/issues/2701 +        self.assertMatch( +            'https://screen.yahoo.com/smartwatches-latest-wearable-gadgets-163745379-cbs.html', +            ['Yahoo']) +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_playlists.py b/test/test_playlists.py index 5fb679aa1..75c6a6bbb 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -324,7 +324,6 @@ class TestPlaylists(unittest.TestCase):          self.assertEqual(result['id'], '342759')          self.assertEqual(              result['title'], 'General Motors Ignition Switch Recall') -        self.assertEqual(len(result['entries']), 9)          whole_duration = sum(e['duration'] for e in result['entries'])          self.assertEqual(whole_duration, 14855) diff --git a/test/test_utils.py b/test/test_utils.py index 2348c0415..51eb0b6b9 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -38,6 +38,7 @@ from youtube_dl.utils import (      xpath_with_ns,      parse_iso8601,      strip_jsonp, +    uppercase_escape,  )  if sys.version_info < (3, 0): @@ -279,6 +280,9 @@ class TestUtil(unittest.TestCase):          d = json.loads(stripped)          self.assertEqual(d, [{"id": "532cb", "x": 3}]) +    def test_uppercase_escpae(self): +        self.assertEqual(uppercase_escape(u'aä'), u'aä') +        self.assertEqual(uppercase_escape(u'\\U0001d550'), u'𝕐')  if __name__ == '__main__':      unittest.main() diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 7c135db32..aba8b4537 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -242,7 +242,7 @@ def parseOpts(overrideArguments=None):          help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection')      general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.')      general.add_option( -        '--prefer-insecure', action='store_true', dest='prefer_insecure', +        '--prefer-insecure', '--prefer-unsecure', action='store_true', dest='prefer_insecure',          help='Use an unencrypted connection to retrieve information about the video. (Currently supported only for YouTube)')      general.add_option(          '--cache-dir', dest='cachedir', default=get_cachedir(), metavar='DIR', diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 85635d1cc..1bfc9f35b 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -27,9 +27,10 @@ class BreakIE(InfoExtractor):              webpage, 'info json', flags=re.DOTALL)          info = json.loads(info_json)          video_url = info['videoUri'] -        m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', video_url) -        if m_youtube is not None: -            return self.url_result(m_youtube.group(1), 'Youtube') +        youtube_id = info.get('youtubeId') +        if youtube_id: +            return self.url_result(youtube_id, 'Youtube') +          final_url = video_url + '?' + info['AuthToken']          return {              'id': video_id, diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 7ed7f2723..5504d93eb 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -8,7 +8,6 @@ from .subtitles import SubtitlesInfoExtractor  from ..utils import (      compat_urllib_request,      compat_str, -    get_element_by_attribute,      get_element_by_id,      orderedSet,      str_to_int, diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py index 29658a7d6..75b63cffb 100644 --- a/youtube_dl/extractor/keezmovies.py +++ b/youtube_dl/extractor/keezmovies.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals +  import os  import re @@ -11,22 +13,22 @@ from ..aes import (      aes_decrypt_text  ) +  class KeezMoviesIE(InfoExtractor): -    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>keezmovies\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)' +    _VALID_URL = r'^https?://(?:www\.)?keezmovies\.com/video/.+?(?P<videoid>[0-9]+)(?:[/?&]|$)'      _TEST = { -        u'url': u'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711', -        u'file': u'1214711.mp4', -        u'md5': u'6e297b7e789329923fcf83abb67c9289', -        u'info_dict': { -            u"title": u"Petite Asian Lady Mai Playing In Bathtub", -            u"age_limit": 18, +        'url': 'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711', +        'file': '1214711.mp4', +        'md5': '6e297b7e789329923fcf83abb67c9289', +        'info_dict': { +            'title': 'Petite Asian Lady Mai Playing In Bathtub', +            'age_limit': 18,          }      }      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('videoid') -        url = 'http://www.' + mobj.group('url')          req = compat_urllib_request.Request(url)          req.add_header('Cookie', 'age_verified=1') @@ -38,10 +40,10 @@ class KeezMoviesIE(InfoExtractor):              embedded_url = mobj.group(1)              return self.url_result(embedded_url) -        video_title = self._html_search_regex(r'<h1 [^>]*>([^<]+)', webpage, u'title') -        video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&', webpage, u'video_url')) -        if webpage.find('encrypted=true')!=-1: -            password = self._html_search_regex(r'video_title=(.+?)&', webpage, u'password') +        video_title = self._html_search_regex(r'<h1 [^>]*>([^<]+)', webpage, 'title') +        video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&', webpage, 'video_url')) +        if 'encrypted=true' in webpage: +            password = self._html_search_regex(r'video_title=(.+?)&', webpage, 'password')              video_url = aes_decrypt_text(video_url, password, 32).decode('utf-8')          path = compat_urllib_parse_urlparse(video_url).path          extension = os.path.splitext(path)[1][1:] diff --git a/youtube_dl/extractor/morningstar.py b/youtube_dl/extractor/morningstar.py index 3d3225699..4f7a5d2e4 100644 --- a/youtube_dl/extractor/morningstar.py +++ b/youtube_dl/extractor/morningstar.py @@ -1,17 +1,9 @@  # coding: utf-8  from __future__ import unicode_literals -import hashlib -import json  import re -import time  from .common import InfoExtractor -from ..utils import ( -    compat_parse_qs, -    compat_str, -    int_or_none, -)  class MorningstarIE(InfoExtractor): diff --git a/youtube_dl/extractor/motorsport.py b/youtube_dl/extractor/motorsport.py index dc727be10..7c0ec6a12 100644 --- a/youtube_dl/extractor/motorsport.py +++ b/youtube_dl/extractor/motorsport.py @@ -44,7 +44,7 @@ class MotorsportIE(InfoExtractor):          e = compat_str(int(time.time()) + 24 * 60 * 60)          base_video_url = params['location'] + '?e=' + e          s = 'h3hg713fh32' -        h = hashlib.md5(s + base_video_url).hexdigest() +        h = hashlib.md5((s + base_video_url).encode('utf-8')).hexdigest()          video_url = base_video_url + '&h=' + h          uploader = self._html_search_regex( diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index 58f9c690e..718fe9aba 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -1,44 +1,81 @@  from __future__ import unicode_literals  import re +import json  from .common import InfoExtractor -from ..utils import compat_urllib_parse +from ..utils import int_or_none  class PornHdIE(InfoExtractor): -    _VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)' +    _VALID_URL = r'http://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)'      _TEST = {          'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', -        'file': '1962.flv', -        'md5': '35272469887dca97abd30abecc6cdf75', +        'md5': '956b8ca569f7f4d8ec563e2c41598441',          'info_dict': { -            "title": "sierra-day-gets-his-cum-all-over-herself-hd-porn-video", -            "age_limit": 18, +            'id': '1962', +            'ext': 'mp4', +            'title': 'Sierra loves doing laundry', +            'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294', +            'age_limit': 18,          }      }      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) - -        video_id = mobj.group('video_id') -        video_title = mobj.group('video_title') +        video_id = mobj.group('id')          webpage = self._download_webpage(url, video_id) -        next_url = self._html_search_regex( -            r'&hd=(http.+?)&', webpage, 'video URL') -        next_url = compat_urllib_parse.unquote(next_url) +        title = self._og_search_title(webpage) +        TITLE_SUFFIX = ' porn HD Video | PornHD.com ' +        if title.endswith(TITLE_SUFFIX): +            title = title[:-len(TITLE_SUFFIX)] + +        description = self._html_search_regex( +            r'<div class="description">([^<]+)</div>', webpage, 'description', fatal=False) +        view_count = int_or_none(self._html_search_regex( +            r'(\d+) views 	</span>', webpage, 'view count', fatal=False)) + +        formats = [ +            { +                'url': format_url, +                'ext': format.lower(), +                'format_id': '%s-%s' % (format.lower(), quality.lower()), +                'quality': 1 if quality.lower() == 'high' else 0, +            } for format, quality, format_url in re.findall( +                r'var __video([\da-zA-Z]+?)(Low|High)StreamUrl = \'(http://.+?)\?noProxy=1\'', webpage) +        ] + +        mobj = re.search(r'flashVars = (?P<flashvars>{.+?});', webpage) +        if mobj: +            flashvars = json.loads(mobj.group('flashvars')) +            formats.extend([ +                { +                    'url': flashvars['hashlink'].replace('?noProxy=1', ''), +                    'ext': 'flv', +                    'format_id': 'flv-low', +                    'quality': 0, +                }, +                { +                    'url': flashvars['hd'].replace('?noProxy=1', ''), +                    'ext': 'flv', +                    'format_id': 'flv-high', +                    'quality': 1, +                } +            ]) +            thumbnail = flashvars['urlWallpaper'] +        else: +            thumbnail = self._og_search_thumbnail(webpage) -        video_url = self._download_webpage( -            next_url, video_id, note='Retrieving video URL', -            errnote='Could not retrieve video URL') -        age_limit = 18 +        self._sort_formats(formats)          return {              'id': video_id, -            'url': video_url, -            'ext': 'flv', -            'title': video_title, -            'age_limit': age_limit, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'view_count': view_count, +            'formats': formats, +            'age_limit': 18,          } diff --git a/youtube_dl/extractor/ro220.py b/youtube_dl/extractor/ro220.py index 4678f62df..a6ad59465 100644 --- a/youtube_dl/extractor/ro220.py +++ b/youtube_dl/extractor/ro220.py @@ -18,7 +18,7 @@ class Ro220IE(InfoExtractor):          'md5': '03af18b73a07b4088753930db7a34add',          'info_dict': {              "title": "Luati-le Banii sez 4 ep 1", -            "description": "Iata-ne reveniti dupa o binemeritata vacanta. Va astept si pe Facebook cu pareri si comentarii.", +            "description": "re:^Iata-ne reveniti dupa o binemeritata vacanta\. +Va astept si pe Facebook cu pareri si comentarii.$",          }      } diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py index 40224d761..0f85b2320 100644 --- a/youtube_dl/extractor/rts.py +++ b/youtube_dl/extractor/rts.py @@ -37,7 +37,7 @@ class RTSIE(InfoExtractor):              'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html',              'md5': 'c197f0b2421995c63a64cc73d800f42e',              'info_dict': { -                'id': '5738317', +                'id': '5624067',                  'ext': 'mp4',                  'duration': 55,                  'title': 'Bande de lancement de Passe-moi les jumelles', @@ -98,17 +98,20 @@ class RTSIE(InfoExtractor):          m = re.match(self._VALID_URL, url)          video_id = m.group('id') -        def download_json(video_id): +        def download_json(internal_id):              return self._download_json( -                'http://www.rts.ch/a/%s.html?f=json/article' % video_id, video_id) +                'http://www.rts.ch/a/%s.html?f=json/article' % internal_id, +                video_id)          all_info = download_json(video_id)          # video_id extracted out of URL is not always a real id          if 'video' not in all_info and 'audio' not in all_info:              page = self._download_webpage(url, video_id) -            video_id = self._html_search_regex(r'<(?:video|audio) data-id="(\d+)"', page, 'video id') -            all_info = download_json(video_id) +            internal_id = self._html_search_regex( +                r'<(?:video|audio) data-id="([0-9]+)"', page, +                'internal video id') +            all_info = download_json(internal_id)          info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio'] diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 9dcffead0..c86f1e423 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -9,8 +9,18 @@ from ..utils import (  class TeamcocoIE(InfoExtractor): -    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)' -    _TEST = { +    _VALID_URL = r'http://teamcoco\.com/video/(?P<video_id>[0-9]+)?/?(?P<url_title>.*)' +    _TESTS = [ +    { +        'url': 'http://teamcoco.com/video/80187/conan-becomes-a-mary-kay-beauty-consultant', +        'file': '80187.mp4', +        'md5': '3f7746aa0dc86de18df7539903d399ea', +        'info_dict': { +            'title': 'Conan Becomes A Mary Kay Beauty Consultant', +            'description': 'Mary Kay is perhaps the most trusted name in female beauty, so of course Conan is a natural choice to sell their products.' +        } +    }, +    {          'url': 'http://teamcoco.com/video/louis-ck-interview-george-w-bush',          'file': '19705.mp4',          'md5': 'cde9ba0fa3506f5f017ce11ead928f9a', @@ -19,6 +29,7 @@ class TeamcocoIE(InfoExtractor):              "title": "Louis C.K. Interview Pt. 1 11/3/11"          }      } +    ]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) @@ -26,11 +37,13 @@ class TeamcocoIE(InfoExtractor):              raise ExtractorError('Invalid URL: %s' % url)          url_title = mobj.group('url_title')          webpage = self._download_webpage(url, url_title) - -        video_id = self._html_search_regex( -            r'<article class="video" data-id="(\d+?)"', -            webpage, 'video id') - +         +        video_id = mobj.group("video_id") +        if video_id == '': +            video_id = self._html_search_regex( +                r'<article class="video" data-id="(\d+?)"', +                webpage, 'video id') +                  self.report_extraction(video_id)          data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index d92d14f71..e2cf1ae56 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -15,22 +15,24 @@ from ..utils import (  class YahooIE(InfoExtractor):      IE_DESC = 'Yahoo screen' -    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html' +    _VALID_URL = r'https?://screen\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html'      _TESTS = [          {              'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', -            'file': '214727115.mp4',              'md5': '4962b075c08be8690a922ee026d05e69',              'info_dict': { +                'id': '214727115', +                'ext': 'mp4',                  'title': 'Julian Smith & Travis Legg Watch Julian Smith',                  'description': 'Julian and Travis watch Julian Smith',              },          },          {              'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html', -            'file': '103000935.mp4',              'md5': 'd6e6fc6e1313c608f316ddad7b82b306',              'info_dict': { +                'id': '103000935', +                'ext': 'mp4',                  'title': 'Codefellas - The Cougar Lies with Spanish Moss',                  'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',              }, @@ -60,10 +62,9 @@ class YahooIE(InfoExtractor):              'env': 'prod',              'format': 'json',          }) -        query_result_json = self._download_webpage( +        query_result = self._download_json(              'http://video.query.yahoo.com/v1/public/yql?' + data,              video_id, 'Downloading video info') -        query_result = json.loads(query_result_json)          info = query_result['query']['results']['mediaObj'][0]          meta = info['meta'] @@ -86,7 +87,6 @@ class YahooIE(InfoExtractor):              else:                  format_url = compat_urlparse.urljoin(host, path)                  format_info['url'] = format_url -                              formats.append(format_info)          self._sort_formats(formats) @@ -134,27 +134,25 @@ class YahooSearchIE(SearchInfoExtractor):      def _get_n_results(self, query, n):          """Get a specified number of results for a query""" - -        res = { -            '_type': 'playlist', -            'id': query, -            'entries': [] -        } -        for pagenum in itertools.count(0):  +        entries = [] +        for pagenum in itertools.count(0):              result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30) -            webpage = self._download_webpage(result_url, query, -                                             note='Downloading results page '+str(pagenum+1)) -            info = json.loads(webpage) +            info = self._download_json(result_url, query, +                note='Downloading results page '+str(pagenum+1))              m = info['m']              results = info['results']              for (i, r) in enumerate(results): -                if (pagenum * 30) +i >= n: +                if (pagenum * 30) + i >= n:                      break                  mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)                  e = self.url_result('http://' + mobj.group('url'), 'Yahoo') -                res['entries'].append(e) -            if (pagenum * 30 +i >= n) or (m['last'] >= (m['total'] -1)): +                entries.append(e) +            if (pagenum * 30 + i >= n) or (m['last'] >= (m['total'] - 1)):                  break -        return res +        return { +            '_type': 'playlist', +            'id': query, +            'entries': entries, +        } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 5f1f664c8..92fee966f 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2,6 +2,7 @@  # -*- coding: utf-8 -*-  import calendar +import codecs  import contextlib  import ctypes  import datetime @@ -1263,9 +1264,11 @@ class PagedList(object):  def uppercase_escape(s): +    unicode_escape = codecs.getdecoder('unicode_escape')      return re.sub(          r'\\U[0-9a-fA-F]{8}', -        lambda m: m.group(0).decode('unicode-escape'), s) +        lambda m: unicode_escape(m.group(0))[0], +        s)  try:      struct.pack(u'!I', 0) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index ec539e64a..41cd1a6b1 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.04.04.2' +__version__ = '2014.04.04.7' | 
