diff options
-rw-r--r-- | test/test_all_urls.py | 6 | ||||
-rw-r--r-- | test/test_playlists.py | 1 | ||||
-rw-r--r-- | test/test_utils.py | 4 | ||||
-rw-r--r-- | youtube_dl/__init__.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/breakcom.py | 7 | ||||
-rw-r--r-- | youtube_dl/extractor/dailymotion.py | 1 | ||||
-rw-r--r-- | youtube_dl/extractor/keezmovies.py | 26 | ||||
-rw-r--r-- | youtube_dl/extractor/morningstar.py | 8 | ||||
-rw-r--r-- | youtube_dl/extractor/motorsport.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/pornhd.py | 77 | ||||
-rw-r--r-- | youtube_dl/extractor/ro220.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/rts.py | 13 | ||||
-rw-r--r-- | youtube_dl/extractor/teamcoco.py | 27 | ||||
-rw-r--r-- | youtube_dl/extractor/yahoo.py | 38 | ||||
-rw-r--r-- | youtube_dl/utils.py | 5 | ||||
-rw-r--r-- | youtube_dl/version.py | 2 |
16 files changed, 139 insertions, 82 deletions
diff --git a/test/test_all_urls.py b/test/test_all_urls.py index bea8c41fb..577f6ac32 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -157,5 +157,11 @@ class TestAllURLsMatching(unittest.TestCase): 'http://thedailyshow.cc.com/guests/michael-lewis/3efna8/exclusive---michael-lewis-extended-interview-pt--3', ['ComedyCentralShows']) + def test_yahoo_https(self): + # https://github.com/rg3/youtube-dl/issues/2701 + self.assertMatch( + 'https://screen.yahoo.com/smartwatches-latest-wearable-gadgets-163745379-cbs.html', + ['Yahoo']) + if __name__ == '__main__': unittest.main() diff --git a/test/test_playlists.py b/test/test_playlists.py index 5fb679aa1..75c6a6bbb 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -324,7 +324,6 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['id'], '342759') self.assertEqual( result['title'], 'General Motors Ignition Switch Recall') - self.assertEqual(len(result['entries']), 9) whole_duration = sum(e['duration'] for e in result['entries']) self.assertEqual(whole_duration, 14855) diff --git a/test/test_utils.py b/test/test_utils.py index 2348c0415..51eb0b6b9 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -38,6 +38,7 @@ from youtube_dl.utils import ( xpath_with_ns, parse_iso8601, strip_jsonp, + uppercase_escape, ) if sys.version_info < (3, 0): @@ -279,6 +280,9 @@ class TestUtil(unittest.TestCase): d = json.loads(stripped) self.assertEqual(d, [{"id": "532cb", "x": 3}]) + def test_uppercase_escpae(self): + self.assertEqual(uppercase_escape(u'aä'), u'aä') + self.assertEqual(uppercase_escape(u'\\U0001d550'), u'𝕐') if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 7c135db32..aba8b4537 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -242,7 +242,7 @@ def parseOpts(overrideArguments=None): help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection') general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.') general.add_option( - '--prefer-insecure', action='store_true', dest='prefer_insecure', + '--prefer-insecure', '--prefer-unsecure', action='store_true', dest='prefer_insecure', help='Use an unencrypted connection to retrieve information about the video. (Currently supported only for YouTube)') general.add_option( '--cache-dir', dest='cachedir', default=get_cachedir(), metavar='DIR', diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 85635d1cc..1bfc9f35b 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -27,9 +27,10 @@ class BreakIE(InfoExtractor): webpage, 'info json', flags=re.DOTALL) info = json.loads(info_json) video_url = info['videoUri'] - m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', video_url) - if m_youtube is not None: - return self.url_result(m_youtube.group(1), 'Youtube') + youtube_id = info.get('youtubeId') + if youtube_id: + return self.url_result(youtube_id, 'Youtube') + final_url = video_url + '?' + info['AuthToken'] return { 'id': video_id, diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 7ed7f2723..5504d93eb 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -8,7 +8,6 @@ from .subtitles import SubtitlesInfoExtractor from ..utils import ( compat_urllib_request, compat_str, - get_element_by_attribute, get_element_by_id, orderedSet, str_to_int, diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py index 29658a7d6..75b63cffb 100644 --- a/youtube_dl/extractor/keezmovies.py +++ b/youtube_dl/extractor/keezmovies.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import os import re @@ -11,22 +13,22 @@ from ..aes import ( aes_decrypt_text ) + class KeezMoviesIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>keezmovies\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)' + _VALID_URL = r'^https?://(?:www\.)?keezmovies\.com/video/.+?(?P<videoid>[0-9]+)(?:[/?&]|$)' _TEST = { - u'url': u'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711', - u'file': u'1214711.mp4', - u'md5': u'6e297b7e789329923fcf83abb67c9289', - u'info_dict': { - u"title": u"Petite Asian Lady Mai Playing In Bathtub", - u"age_limit": 18, + 'url': 'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711', + 'file': '1214711.mp4', + 'md5': '6e297b7e789329923fcf83abb67c9289', + 'info_dict': { + 'title': 'Petite Asian Lady Mai Playing In Bathtub', + 'age_limit': 18, } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') - url = 'http://www.' + mobj.group('url') req = compat_urllib_request.Request(url) req.add_header('Cookie', 'age_verified=1') @@ -38,10 +40,10 @@ class KeezMoviesIE(InfoExtractor): embedded_url = mobj.group(1) return self.url_result(embedded_url) - video_title = self._html_search_regex(r'<h1 [^>]*>([^<]+)', webpage, u'title') - video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&', webpage, u'video_url')) - if webpage.find('encrypted=true')!=-1: - password = self._html_search_regex(r'video_title=(.+?)&', webpage, u'password') + video_title = self._html_search_regex(r'<h1 [^>]*>([^<]+)', webpage, 'title') + video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&', webpage, 'video_url')) + if 'encrypted=true' in webpage: + password = self._html_search_regex(r'video_title=(.+?)&', webpage, 'password') video_url = aes_decrypt_text(video_url, password, 32).decode('utf-8') path = compat_urllib_parse_urlparse(video_url).path extension = os.path.splitext(path)[1][1:] diff --git a/youtube_dl/extractor/morningstar.py b/youtube_dl/extractor/morningstar.py index 3d3225699..4f7a5d2e4 100644 --- a/youtube_dl/extractor/morningstar.py +++ b/youtube_dl/extractor/morningstar.py @@ -1,17 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals -import hashlib -import json import re -import time from .common import InfoExtractor -from ..utils import ( - compat_parse_qs, - compat_str, - int_or_none, -) class MorningstarIE(InfoExtractor): diff --git a/youtube_dl/extractor/motorsport.py b/youtube_dl/extractor/motorsport.py index dc727be10..7c0ec6a12 100644 --- a/youtube_dl/extractor/motorsport.py +++ b/youtube_dl/extractor/motorsport.py @@ -44,7 +44,7 @@ class MotorsportIE(InfoExtractor): e = compat_str(int(time.time()) + 24 * 60 * 60) base_video_url = params['location'] + '?e=' + e s = 'h3hg713fh32' - h = hashlib.md5(s + base_video_url).hexdigest() + h = hashlib.md5((s + base_video_url).encode('utf-8')).hexdigest() video_url = base_video_url + '&h=' + h uploader = self._html_search_regex( diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index 58f9c690e..718fe9aba 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -1,44 +1,81 @@ from __future__ import unicode_literals import re +import json from .common import InfoExtractor -from ..utils import compat_urllib_parse +from ..utils import int_or_none class PornHdIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)' + _VALID_URL = r'http://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)' _TEST = { 'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', - 'file': '1962.flv', - 'md5': '35272469887dca97abd30abecc6cdf75', + 'md5': '956b8ca569f7f4d8ec563e2c41598441', 'info_dict': { - "title": "sierra-day-gets-his-cum-all-over-herself-hd-porn-video", - "age_limit": 18, + 'id': '1962', + 'ext': 'mp4', + 'title': 'Sierra loves doing laundry', + 'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294', + 'age_limit': 18, } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - - video_id = mobj.group('video_id') - video_title = mobj.group('video_title') + video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - next_url = self._html_search_regex( - r'&hd=(http.+?)&', webpage, 'video URL') - next_url = compat_urllib_parse.unquote(next_url) + title = self._og_search_title(webpage) + TITLE_SUFFIX = ' porn HD Video | PornHD.com ' + if title.endswith(TITLE_SUFFIX): + title = title[:-len(TITLE_SUFFIX)] + + description = self._html_search_regex( + r'<div class="description">([^<]+)</div>', webpage, 'description', fatal=False) + view_count = int_or_none(self._html_search_regex( + r'(\d+) views </span>', webpage, 'view count', fatal=False)) + + formats = [ + { + 'url': format_url, + 'ext': format.lower(), + 'format_id': '%s-%s' % (format.lower(), quality.lower()), + 'quality': 1 if quality.lower() == 'high' else 0, + } for format, quality, format_url in re.findall( + r'var __video([\da-zA-Z]+?)(Low|High)StreamUrl = \'(http://.+?)\?noProxy=1\'', webpage) + ] + + mobj = re.search(r'flashVars = (?P<flashvars>{.+?});', webpage) + if mobj: + flashvars = json.loads(mobj.group('flashvars')) + formats.extend([ + { + 'url': flashvars['hashlink'].replace('?noProxy=1', ''), + 'ext': 'flv', + 'format_id': 'flv-low', + 'quality': 0, + }, + { + 'url': flashvars['hd'].replace('?noProxy=1', ''), + 'ext': 'flv', + 'format_id': 'flv-high', + 'quality': 1, + } + ]) + thumbnail = flashvars['urlWallpaper'] + else: + thumbnail = self._og_search_thumbnail(webpage) - video_url = self._download_webpage( - next_url, video_id, note='Retrieving video URL', - errnote='Could not retrieve video URL') - age_limit = 18 + self._sort_formats(formats) return { 'id': video_id, - 'url': video_url, - 'ext': 'flv', - 'title': video_title, - 'age_limit': age_limit, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'view_count': view_count, + 'formats': formats, + 'age_limit': 18, } diff --git a/youtube_dl/extractor/ro220.py b/youtube_dl/extractor/ro220.py index 4678f62df..a6ad59465 100644 --- a/youtube_dl/extractor/ro220.py +++ b/youtube_dl/extractor/ro220.py @@ -18,7 +18,7 @@ class Ro220IE(InfoExtractor): 'md5': '03af18b73a07b4088753930db7a34add', 'info_dict': { "title": "Luati-le Banii sez 4 ep 1", - "description": "Iata-ne reveniti dupa o binemeritata vacanta. Va astept si pe Facebook cu pareri si comentarii.", + "description": "re:^Iata-ne reveniti dupa o binemeritata vacanta\. +Va astept si pe Facebook cu pareri si comentarii.$", } } diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py index 40224d761..0f85b2320 100644 --- a/youtube_dl/extractor/rts.py +++ b/youtube_dl/extractor/rts.py @@ -37,7 +37,7 @@ class RTSIE(InfoExtractor): 'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html', 'md5': 'c197f0b2421995c63a64cc73d800f42e', 'info_dict': { - 'id': '5738317', + 'id': '5624067', 'ext': 'mp4', 'duration': 55, 'title': 'Bande de lancement de Passe-moi les jumelles', @@ -98,17 +98,20 @@ class RTSIE(InfoExtractor): m = re.match(self._VALID_URL, url) video_id = m.group('id') - def download_json(video_id): + def download_json(internal_id): return self._download_json( - 'http://www.rts.ch/a/%s.html?f=json/article' % video_id, video_id) + 'http://www.rts.ch/a/%s.html?f=json/article' % internal_id, + video_id) all_info = download_json(video_id) # video_id extracted out of URL is not always a real id if 'video' not in all_info and 'audio' not in all_info: page = self._download_webpage(url, video_id) - video_id = self._html_search_regex(r'<(?:video|audio) data-id="(\d+)"', page, 'video id') - all_info = download_json(video_id) + internal_id = self._html_search_regex( + r'<(?:video|audio) data-id="([0-9]+)"', page, + 'internal video id') + all_info = download_json(internal_id) info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio'] diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 9dcffead0..c86f1e423 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -9,8 +9,18 @@ from ..utils import ( class TeamcocoIE(InfoExtractor): - _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)' - _TEST = { + _VALID_URL = r'http://teamcoco\.com/video/(?P<video_id>[0-9]+)?/?(?P<url_title>.*)' + _TESTS = [ + { + 'url': 'http://teamcoco.com/video/80187/conan-becomes-a-mary-kay-beauty-consultant', + 'file': '80187.mp4', + 'md5': '3f7746aa0dc86de18df7539903d399ea', + 'info_dict': { + 'title': 'Conan Becomes A Mary Kay Beauty Consultant', + 'description': 'Mary Kay is perhaps the most trusted name in female beauty, so of course Conan is a natural choice to sell their products.' + } + }, + { 'url': 'http://teamcoco.com/video/louis-ck-interview-george-w-bush', 'file': '19705.mp4', 'md5': 'cde9ba0fa3506f5f017ce11ead928f9a', @@ -19,6 +29,7 @@ class TeamcocoIE(InfoExtractor): "title": "Louis C.K. Interview Pt. 1 11/3/11" } } + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -26,11 +37,13 @@ class TeamcocoIE(InfoExtractor): raise ExtractorError('Invalid URL: %s' % url) url_title = mobj.group('url_title') webpage = self._download_webpage(url, url_title) - - video_id = self._html_search_regex( - r'<article class="video" data-id="(\d+?)"', - webpage, 'video id') - + + video_id = mobj.group("video_id") + if video_id == '': + video_id = self._html_search_regex( + r'<article class="video" data-id="(\d+?)"', + webpage, 'video id') + self.report_extraction(video_id) data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index d92d14f71..e2cf1ae56 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -15,22 +15,24 @@ from ..utils import ( class YahooIE(InfoExtractor): IE_DESC = 'Yahoo screen' - _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html' + _VALID_URL = r'https?://screen\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html' _TESTS = [ { 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', - 'file': '214727115.mp4', 'md5': '4962b075c08be8690a922ee026d05e69', 'info_dict': { + 'id': '214727115', + 'ext': 'mp4', 'title': 'Julian Smith & Travis Legg Watch Julian Smith', 'description': 'Julian and Travis watch Julian Smith', }, }, { 'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html', - 'file': '103000935.mp4', 'md5': 'd6e6fc6e1313c608f316ddad7b82b306', 'info_dict': { + 'id': '103000935', + 'ext': 'mp4', 'title': 'Codefellas - The Cougar Lies with Spanish Moss', 'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?', }, @@ -60,10 +62,9 @@ class YahooIE(InfoExtractor): 'env': 'prod', 'format': 'json', }) - query_result_json = self._download_webpage( + query_result = self._download_json( 'http://video.query.yahoo.com/v1/public/yql?' + data, video_id, 'Downloading video info') - query_result = json.loads(query_result_json) info = query_result['query']['results']['mediaObj'][0] meta = info['meta'] @@ -86,7 +87,6 @@ class YahooIE(InfoExtractor): else: format_url = compat_urlparse.urljoin(host, path) format_info['url'] = format_url - formats.append(format_info) self._sort_formats(formats) @@ -134,27 +134,25 @@ class YahooSearchIE(SearchInfoExtractor): def _get_n_results(self, query, n): """Get a specified number of results for a query""" - - res = { - '_type': 'playlist', - 'id': query, - 'entries': [] - } - for pagenum in itertools.count(0): + entries = [] + for pagenum in itertools.count(0): result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30) - webpage = self._download_webpage(result_url, query, - note='Downloading results page '+str(pagenum+1)) - info = json.loads(webpage) + info = self._download_json(result_url, query, + note='Downloading results page '+str(pagenum+1)) m = info['m'] results = info['results'] for (i, r) in enumerate(results): - if (pagenum * 30) +i >= n: + if (pagenum * 30) + i >= n: break mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r) e = self.url_result('http://' + mobj.group('url'), 'Yahoo') - res['entries'].append(e) - if (pagenum * 30 +i >= n) or (m['last'] >= (m['total'] -1)): + entries.append(e) + if (pagenum * 30 + i >= n) or (m['last'] >= (m['total'] - 1)): break - return res + return { + '_type': 'playlist', + 'id': query, + 'entries': entries, + } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 5f1f664c8..92fee966f 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import calendar +import codecs import contextlib import ctypes import datetime @@ -1263,9 +1264,11 @@ class PagedList(object): def uppercase_escape(s): + unicode_escape = codecs.getdecoder('unicode_escape') return re.sub( r'\\U[0-9a-fA-F]{8}', - lambda m: m.group(0).decode('unicode-escape'), s) + lambda m: unicode_escape(m.group(0))[0], + s) try: struct.pack(u'!I', 0) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index ec539e64a..41cd1a6b1 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.04.04.2' +__version__ = '2014.04.04.7' |