From f870544302f75bee0d96f6a8623c8ff270beca89 Mon Sep 17 00:00:00 2001 From: fnord Date: Mon, 13 Jul 2015 07:41:38 -0500 Subject: Add support for democracynow.org Supports downloading clips or entire shows. Subtitle support --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/democracynow.py | 100 +++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 youtube_dl/extractor/democracynow.py (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index cbaa07391..5cc03b875 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -112,6 +112,7 @@ from .daum import DaumIE from .dbtv import DBTVIE from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE +from .democracynow import DemocracynowIE from .dfb import DFBIE from .dhm import DHMIE from .dotsub import DotsubIE diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py new file mode 100644 index 000000000..1c9b36052 --- /dev/null +++ b/youtube_dl/extractor/democracynow.py @@ -0,0 +1,100 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import time +import hmac +import hashlib +import itertools +import re +from ..utils import ( + ExtractorError, + int_or_none, + parse_age_limit, + parse_iso8601, +) +from ..compat import compat_urllib_request +from .common import InfoExtractor + + +class DemocracynowIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?democracynow.org/?(?P[^\?]*)' + IE_NAME = 'democracynow' + _TESTS = [{ + 'url': 'http://www.democracynow.org/shows/2015/7/3', + 'info_dict': { + 'id': '2015-0703-001', + 'ext': 'mp4', + 'title': 'July 03, 2015 - Democracy Now!', + 'description': 'A daily independent global news hour with Amy Goodman & Juan Gonz\xe1lez "What to the Slave is 4th of July?": James Earl Jones Reads Frederick Douglass\u2019 Historic Speech : "This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag : "We Shall Overcome": Remembering Folk Icon, Activist Pete Seeger in His Own Words & Songs', + 'uploader': 'Democracy Now', + 'upload_date': None, + }, + },{ + 'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree', + 'info_dict': { + 'id': '2015-0703-001', + 'ext': 'mp4', + 'title': '"This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag', + 'description': 'md5:4d2bc4f0d29f5553c2210a4bc7761a21', + 'uploader': 'Democracy Now', + 'upload_date': None, + }, + + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + base_host = re.search(r'^(.+?://[^/]+)', url).group(1) + if display_id == '': + display_id = 'home' + webpage = self._download_webpage(url, display_id) + re_desc = re.search(r'[^/]+)/(?:dn)?(?P[^/]+?)\.(?P[^\.\?]+)(?P\?|$)',url) + if video_id == None: + video_id = purl.group('fn') + if js.get('start') != None: + url += '&' if purl.group('hasparams') == '?' else '?' + url = url + 'start='+str(js.get('start')) + formats.append({ + 'format_id': purl.group('dir'), + 'ext': purl.group('ext'), + 'url': url, + }) + self._sort_formats(formats) + ret = { + 'id': video_id, + 'title': js.get('title'), + 'description': description, + 'uploader': 'Democracy Now', +# 'thumbnails': thumbnails, + 'subtitles': subtitles, + 'formats': formats, + } + return ret +# \ No newline at end of file -- cgit v1.2.3 From eb08081330f5ef52d66140589137ae1bb05eee5f Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 17 Jul 2015 02:57:08 -0500 Subject: democracynow: correct syntax --- youtube_dl/extractor/democracynow.py | 43 ++++++++++++------------------------ 1 file changed, 14 insertions(+), 29 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index 1c9b36052..973bb437b 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -1,19 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -import json -import time -import hmac -import hashlib -import itertools import re -from ..utils import ( - ExtractorError, - int_or_none, - parse_age_limit, - parse_iso8601, -) -from ..compat import compat_urllib_request from .common import InfoExtractor @@ -30,7 +18,7 @@ class DemocracynowIE(InfoExtractor): 'uploader': 'Democracy Now', 'upload_date': None, }, - },{ + }, { 'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree', 'info_dict': { 'id': '2015-0703-001', @@ -40,7 +28,6 @@ class DemocracynowIE(InfoExtractor): 'uploader': 'Democracy Now', 'upload_date': None, }, - }] def _real_extract(self, url): @@ -49,7 +36,7 @@ class DemocracynowIE(InfoExtractor): if display_id == '': display_id = 'home' webpage = self._download_webpage(url, display_id) - re_desc = re.search(r'[^/]+)/(?:dn)?(?P[^/]+?)\.(?P[^\.\?]+)(?P\?|$)',url) - if video_id == None: + purl = re.search(r'/(?P[^/]+)/(?:dn)?(?P[^/]+?)\.(?P[^\.\?]+)(?P\?|$)', url) + if video_id is None: video_id = purl.group('fn') - if js.get('start') != None: + if js.get('start') is not None: url += '&' if purl.group('hasparams') == '?' else '?' - url = url + 'start='+str(js.get('start')) + url = url + 'start=' + str(js.get('start')) formats.append({ 'format_id': purl.group('dir'), 'ext': purl.group('ext'), @@ -92,9 +79,7 @@ class DemocracynowIE(InfoExtractor): 'title': js.get('title'), 'description': description, 'uploader': 'Democracy Now', -# 'thumbnails': thumbnails, 'subtitles': subtitles, 'formats': formats, } return ret -# \ No newline at end of file -- cgit v1.2.3 From f57f84f606b246db4f102fc5bc55e64e4f7a3d60 Mon Sep 17 00:00:00 2001 From: fnord Date: Tue, 21 Jul 2015 16:38:40 -0500 Subject: Twitter: get and describe video from status urls --- youtube_dl/extractor/twitter.py | 44 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 1aaa06305..a65252cc6 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -70,3 +70,47 @@ class TwitterCardIE(InfoExtractor): 'duration': duration, 'formats': formats, } + + +class TwitterIE(TwitterCardIE): + _VALID_URL = r'https?://(?:www|m|mobile)?\.?twitter\.com/(?P[^/]+/status/\d+)' + + _TESTS = [{ + 'url': 'https://m.twitter.com/thereaIbanksy/status/614301758345490432', + 'md5': '8bbccb487bd7a31349b775915fcd412f', + 'info_dict': { + 'id': '614301758345490432', + 'ext': 'mp4', + 'title': 'thereaIbanksy - This time lapse is so pretty \U0001f60d\U0001f60d', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 29.5, + 'description': 'banksy on Twitter: "This time lapse is so pretty \U0001f60d\U0001f60d http://t.co/QB8DDbqiR1"', + 'uploader': 'banksy', + 'uploader_id': 'thereaIbanksy', + }, + }] + + def _real_extract(self, url): + id = self._match_id(url) + username, twid = re.match(r'([^/]+)/status/(\d+)', id).groups() + name = username + url = re.sub(r'https?://(m|mobile)\.', 'https://', url) + webpage = self._download_webpage(url, 'tweet: ' + url) + description = unescapeHTML(self._search_regex('\s*(.+?)\s*', webpage, 'title')) + title = description.replace('\n', ' ') + splitdesc = re.match(r'^(.+?)\s*on Twitter:\s* "(.+?)"$', title) + if splitdesc: + name, title = splitdesc.groups() + title = re.sub(r'\s*https?://[^ ]+', '', title) # strip 'https -_t.co_BJYgOjSeGA' junk from filenames + card_id = self._search_regex(r'["\']/i/cards/tfw/v1/(\d+)', webpage, '/i/card/...') + card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id + return { + '_type': 'url_transparent', + 'ie_key': 'TwitterCard', + 'uploader_id': username, + 'uploader': name, + 'url': card_url, + 'webpage_url': url, + 'description': description, + 'title': username + ' - ' + title, + } -- cgit v1.2.3 From c3dea3f878133f3cbdad9e548609d3077572af66 Mon Sep 17 00:00:00 2001 From: fnord Date: Tue, 21 Jul 2015 16:45:36 -0500 Subject: Twittercard: support vmapurl method --- youtube_dl/extractor/twitter.py | 47 ++++++++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 10 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index a65252cc6..1dd43ff3c 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -12,17 +12,30 @@ from ..utils import ( class TwitterCardIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P\d+)' - _TEST = { - 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', - 'md5': 'a74f50b310c83170319ba16de6955192', - 'info_dict': { - 'id': '560070183650213889', - 'ext': 'mp4', - 'title': 'TwitterCard', - 'thumbnail': 're:^https?://.*\.jpg$', - 'duration': 30.033, + _TESTS = [ + { + 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', + 'md5': 'a74f50b310c83170319ba16de6955192', + 'info_dict': { + 'id': '560070183650213889', + 'ext': 'mp4', + 'title': 'TwitterCard', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 30.033, + } }, - } + { + 'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768', + 'md5': '7ee2a553b63d1bccba97fbed97d9e1c8', + 'info_dict': { + 'id': '623160978427936768', + 'ext': 'mp4', + 'title': 'TwitterCard', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 80.155, + }, + } + ] def _real_extract(self, url): video_id = self._match_id(url) @@ -44,6 +57,20 @@ class TwitterCardIE(InfoExtractor): unescapeHTML(self._search_regex( r'data-player-config="([^"]+)"', webpage, 'data player config')), video_id) + if 'playlist' not in config: + if 'vmapUrl' in config: + webpage = self._download_webpage(config['vmapUrl'], video_id + ' (xml)') + video_url = self._search_regex( + r'\s*', webpage, 'data player config (xml)') + f = { + 'url': video_url, + } + ext = re.search(r'\.([a-z0-9]{2,4})(\?.+)?$', video_url) + if ext: + f['ext'] = ext.group(1) + formats.append(f) + break # same video regardless of UA + continue video_url = config['playlist'][0]['source'] -- cgit v1.2.3 From 9e7e0dffd5e3e3c959e8d99a5e236b9099886fe9 Mon Sep 17 00:00:00 2001 From: fnord Date: Tue, 21 Jul 2015 16:56:35 -0500 Subject: Actually add the extractor --- youtube_dl/extractor/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 50da08830..5c03bf8e8 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -651,7 +651,7 @@ from .twitch import ( TwitchBookmarksIE, TwitchStreamIE, ) -from .twitter import TwitterCardIE +from .twitter import TwitterCardIE, TwitterIE from .ubu import UbuIE from .udemy import ( UdemyIE, -- cgit v1.2.3 From ed1269000f24a6ddc683a295ff402ef3ded5c4fb Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 11 Sep 2015 04:46:21 +0100 Subject: [brightcove] add support for brightcove in page embed(fixes #6824) --- youtube_dl/extractor/__init__.py | 5 ++- youtube_dl/extractor/brightcove.py | 92 ++++++++++++++++++++++++++++++++++++++ youtube_dl/extractor/generic.py | 21 ++++++++- 3 files changed, 116 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 57f55b479..fcd9edec3 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -59,7 +59,10 @@ from .bloomberg import BloombergIE from .bpb import BpbIE from .br import BRIE from .breakcom import BreakIE -from .brightcove import BrightcoveIE +from .brightcove import ( + BrightcoveIE, + BrightcoveInPageEmbedIE, +) from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE from .c56 import C56IE diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 4721c2293..a07c0888f 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -22,6 +22,10 @@ from ..utils import ( fix_xml_ampersands, unescapeHTML, unsmuggle_url, + js_to_json, + int_or_none, + parse_iso8601, + extract_attributes, ) @@ -346,3 +350,91 @@ class BrightcoveIE(InfoExtractor): if 'url' not in info and not info.get('formats'): raise ExtractorError('Unable to extract video url for %s' % info['id']) return info + + +class BrightcoveInPageEmbedIE(InfoExtractor): + _VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/([a-z0-9-]+)_([a-z]+)/index.html?.*videoId=(?P\d+)' + TEST = { + 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', + 'info_dict': { + 'id': '4463358922001', + 'ext': 'flv', + 'title': 'Meet the man behind Popcorn Time', + 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', + 'duration': 165768, + } + } + + @staticmethod + def _extract_url(webpage): + video_attributes = re.search(r'(?s)]*)>.*?', webpage) + if video_attributes: + video_attributes = extract_attributes(video_attributes.group(), r'(?s)\s*data-(account|video-id|playlist-id|policy-key|player|embed)\s*=\s*["\']([^"\']+)["\']') + account_id = video_attributes.get('account') + player_id = video_attributes.get('player') + embed = video_attributes.get('embed') + video_id = video_attributes.get('video-id') + if account_id and player_id and embed and video_id: + return 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % (account_id, player_id, embed, video_id) + return None + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + account_id, player_id, embed, video_id = mobj.groups() + + webpage = self._download_webpage('http://players.brightcove.net/%s/%s_%s/index.min.js' % (account_id, player_id, embed), video_id) + + catalog = self._parse_json( + js_to_json( + self._search_regex( + r'catalog\(({[^}]+})\);', + webpage, + 'catalog' + ) + ), + video_id + ) + policy_key = catalog['policyKey'] + + req = compat_urllib_request.Request( + 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id), + headers={'Accept': 'application/json;pk=%s' % policy_key}) + json_data = self._download_json(req, video_id) + + title = json_data['name'] + description = json_data.get('description') + thumbnail = json_data.get('name') + timestamp = parse_iso8601(json_data.get('published_at')) + duration = int_or_none(json_data.get('duration')) + + formats = [] + for source in json_data.get('sources'): + source_type = source.get('type') + if source_type == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats(source.get('src'), video_id)) + else: + src = source.get('src') + if src: + formats.append({ + 'url': src, + 'abr': source.get('avg_bitrate'), + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + 'filesize': source.get('size'), + 'container': source.get('container'), + 'vcodec': source.get('container'), + }) + else: + formats.extend(self._extract_f4m_formats(source.get('streaming_src'), video_id)) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ec748ed9f..7a3a7f66b 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -29,7 +29,10 @@ from ..utils import ( url_basename, xpath_text, ) -from .brightcove import BrightcoveIE +from .brightcove import ( + BrightcoveIE, + BrightcoveInPageEmbedIE, +) from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE @@ -1012,6 +1015,17 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'cinemasnob', }, + }, + # BrightcoveInPageEmbed embed + { + 'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/', + 'info_dict': { + 'id': '4238694884001', + 'ext': 'flv', + 'title': 'Tabletop: Dread, Last Thoughts', + 'description': 'Tabletop: Dread, Last Thoughts', + 'duration': 51690, + }, } ] @@ -1288,6 +1302,11 @@ class GenericIE(InfoExtractor): 'entries': entries, } + # Look for Brightcove In Page Embed: + brightcove_in_page_embed_url = BrightcoveInPageEmbedIE._extract_url(webpage) + if brightcove_in_page_embed_url: + return self.url_result(brightcove_in_page_embed_url, 'BrightcoveInPageEmbed') + # Look for embedded rtl.nl player matches = re.findall( r']+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', -- cgit v1.2.3 From 53407e3f383ed80c67db9e06b8c3480257aa3184 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 23 Sep 2015 14:02:13 +0100 Subject: [brightcove] fix streaming_src extraction --- youtube_dl/extractor/brightcove.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index a07c0888f..e4a7befee 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -413,7 +413,7 @@ class BrightcoveInPageEmbedIE(InfoExtractor): if source_type == 'application/x-mpegURL': formats.extend(self._extract_m3u8_formats(source.get('src'), video_id)) else: - src = source.get('src') + src = source.get('src') or source.get('streaming_src') if src: formats.append({ 'url': src, @@ -424,8 +424,6 @@ class BrightcoveInPageEmbedIE(InfoExtractor): 'container': source.get('container'), 'vcodec': source.get('container'), }) - else: - formats.extend(self._extract_f4m_formats(source.get('streaming_src'), video_id)) self._sort_formats(formats) -- cgit v1.2.3 From c01e1a96aa964ef6d5f0bf7675dbe34096b1d2c8 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 30 Sep 2015 11:20:43 +0100 Subject: [brightcove] fix test and fields extraction --- youtube_dl/extractor/brightcove.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index e4a7befee..b41cee91b 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -354,14 +354,18 @@ class BrightcoveIE(InfoExtractor): class BrightcoveInPageEmbedIE(InfoExtractor): _VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/([a-z0-9-]+)_([a-z]+)/index.html?.*videoId=(?P\d+)' - TEST = { + _TEST = { 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', + 'md5': 'c8100925723840d4b0d243f7025703be', 'info_dict': { 'id': '4463358922001', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Meet the man behind Popcorn Time', - 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', + 'description': 'md5:eac376a4fe366edc70279bfb681aea16', + 'timestamp': 1441391203, + 'upload_date': '20150904', 'duration': 165768, + 'uploader_id': '929656772001', } } @@ -403,7 +407,7 @@ class BrightcoveInPageEmbedIE(InfoExtractor): title = json_data['name'] description = json_data.get('description') - thumbnail = json_data.get('name') + thumbnail = json_data.get('thumbnail') timestamp = parse_iso8601(json_data.get('published_at')) duration = int_or_none(json_data.get('duration')) @@ -417,12 +421,13 @@ class BrightcoveInPageEmbedIE(InfoExtractor): if src: formats.append({ 'url': src, - 'abr': source.get('avg_bitrate'), + 'tbr': source.get('avg_bitrate'), 'width': int_or_none(source.get('width')), 'height': int_or_none(source.get('height')), 'filesize': source.get('size'), 'container': source.get('container'), - 'vcodec': source.get('container'), + 'vcodec': source.get('codec'), + 'ext': source.get('container').lower(), }) self._sort_formats(formats) @@ -435,4 +440,5 @@ class BrightcoveInPageEmbedIE(InfoExtractor): 'timestamp': timestamp, 'duration': duration, 'formats': formats, + 'uploader_id': account_id, } -- cgit v1.2.3 From 30787f7259c4e6a08f691cc691f14fa0c8fe4b87 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 3 Oct 2015 19:28:48 +0100 Subject: [cspan] correct the clip info extraction --- youtube_dl/extractor/cspan.py | 58 ++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 31 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index fbefd37d0..994e080d5 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -18,22 +18,21 @@ class CSpanIE(InfoExtractor): IE_DESC = 'C-SPAN' _TESTS = [{ 'url': 'http://www.c-span.org/video/?313572-1/HolderonV', - 'md5': '8e44ce11f0f725527daccc453f553eb0', + 'md5': '067803f994e049b455a58b16e5aab442', 'info_dict': { 'id': '315139', 'ext': 'mp4', 'title': 'Attorney General Eric Holder on Voting Rights Act Decision', - 'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in Shelby County v. Holder in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.', + 'description': 'Attorney General Eric Holder speaks to reporters following the Supreme Court decision in [Shelby County v. Holder], in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced.', }, 'skip': 'Regularly fails on travis, for unknown reasons', }, { 'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models', - # For whatever reason, the served video alternates between - # two different ones + 'md5': '4eafd1e91a75d2b1e6a3cbd0995816a2', 'info_dict': { - 'id': '340723', + 'id': 'c4486943', 'ext': 'mp4', - 'title': 'International Health Care Models', + 'title': 'CSPAN - International Health Care Models', 'description': 'md5:7a985a2d595dba00af3d9c9f0783c967', } }, { @@ -44,7 +43,7 @@ class CSpanIE(InfoExtractor): 'ext': 'mp4', 'title': 'General Motors Ignition Switch Recall', 'duration': 14848, - 'description': 'md5:70c7c3b8fa63fa60d42772440596034c' + 'description': 'md5:118081aedd24bf1d3b68b3803344e7f3' }, }, { # Video from senate.gov @@ -57,36 +56,33 @@ class CSpanIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - page_id = mobj.group('id') - webpage = self._download_webpage(url, page_id) - video_id = self._search_regex(r'progid=\'?([0-9]+)\'?>', webpage, 'video id') - - description = self._html_search_regex( - [ - # The full description - r'
(.*?)(.*?)

' - ], - webpage, 'description', flags=re.DOTALL, default=None) + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + matches = re.search(r'data-(prog|clip)id=\'([0-9]+)\'', webpage) + if matches: + video_type, video_id = matches.groups() + if video_type == 'prog': + video_type = 'program' + else: + senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) + if senate_isvp_url: + title = self._og_search_title(webpage) + surl = smuggle_url(senate_isvp_url, {'force_title': title}) + return self.url_result(surl, 'SenateISVP', video_id, title) - info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id - data = self._download_json(info_url, video_id) + data = self._download_json( + 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id), + video_id) doc = self._download_xml( - 'http://www.c-span.org/common/services/flashXml.php?programid=' + video_id, + 'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id), video_id) + description = self._html_search_meta('description', webpage) + title = find_xpath_attr(doc, './/string', 'name', 'title').text thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text - senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) - if senate_isvp_url: - surl = smuggle_url(senate_isvp_url, {'force_title': title}) - return self.url_result(surl, 'SenateISVP', video_id, title) - files = data['video']['files'] try: capfile = data['video']['capfile']['#text'] @@ -112,12 +108,12 @@ class CSpanIE(InfoExtractor): if len(entries) == 1: entry = dict(entries[0]) - entry['id'] = video_id + entry['id'] = 'c' + video_id if video_type == 'clip' else video_id return entry else: return { '_type': 'playlist', 'entries': entries, 'title': title, - 'id': video_id, + 'id': 'c' + video_id if video_type == 'clip' else video_id, } -- cgit v1.2.3 From 41a7b00f183844e93ae2ba46fb4021f257f3ce79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= Date: Sat, 17 Oct 2015 18:18:40 +0200 Subject: [vimeo] Extract config URL from (new?) React-based Vimeo's page --- youtube_dl/extractor/vimeo.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index fa1b22049..88e462a4d 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -286,7 +286,14 @@ class VimeoIE(VimeoBaseInfoExtractor): try: try: config_url = self._html_search_regex( - r' data-config-url="(.+?)"', webpage, 'config URL') + r' data-config-url="(.+?)"', webpage, + 'config URL', default=None) + if not config_url: + # New react-based page + vimeo_clip_page_config = self._search_regex( + r'vimeo\.clip_page_config\s*=\s*({.+?});', webpage, + 'vimeo clip page config') + config_url = self._parse_json(vimeo_clip_page_config, video_id)['player']['config_url'] config_json = self._download_webpage(config_url, video_id) config = json.loads(config_json) except RegexNotFoundError: -- cgit v1.2.3 From dd8417526b13c541e6db8f4200e717b8922a1620 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Oct 2015 22:48:14 +0600 Subject: [vimeo] Clarify new react+flux website fallback --- youtube_dl/extractor/vimeo.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 88e462a4d..0f84656c0 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -289,11 +289,14 @@ class VimeoIE(VimeoBaseInfoExtractor): r' data-config-url="(.+?)"', webpage, 'config URL', default=None) if not config_url: - # New react-based page + # Sometimes new react-based page is served instead of old one that require + # different config URL extraction approach (see + # https://github.com/rg3/youtube-dl/pull/7209) vimeo_clip_page_config = self._search_regex( r'vimeo\.clip_page_config\s*=\s*({.+?});', webpage, 'vimeo clip page config') - config_url = self._parse_json(vimeo_clip_page_config, video_id)['player']['config_url'] + config_url = self._parse_json( + vimeo_clip_page_config, video_id)['player']['config_url'] config_json = self._download_webpage(config_url, video_id) config = json.loads(config_json) except RegexNotFoundError: -- cgit v1.2.3 From 59fe4824f80b7e266ea9918ae1b2e49a456b869f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= Date: Sat, 17 Oct 2015 18:52:25 +0200 Subject: [vidme] Better error message for suspended vidme videos --- youtube_dl/extractor/vidme.py | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 078d283b2..81dcaa231 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -114,6 +114,12 @@ class VidmeIE(InfoExtractor): video = response['video'] + if video.get('state') == 'user-disabled': + raise ExtractorError( + 'Vidme said: This video has been suspended either due to a copyright claim, ' + 'or for violating the terms of use.', + expected=True) + formats = [{ 'format_id': f.get('type'), 'url': f['uri'], -- cgit v1.2.3 From 9eb31b265f65ec6b04a508702af1a6feddafb8fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Oct 2015 23:01:24 +0600 Subject: [vidme] Add user-disabled test --- youtube_dl/extractor/vidme.py | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 81dcaa231..382517a4a 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -93,6 +93,10 @@ class VidmeIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # nsfw, user-disabled + 'url': 'https://vid.me/dzGJ', + 'only_matching': True, }] def _real_extract(self, url): -- cgit v1.2.3 From 583882fdce19f8c565402f42523b275f96c91575 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= Date: Sat, 17 Oct 2015 19:26:30 +0200 Subject: [dailymotion] Report errors from player v5 --- youtube_dl/extractor/dailymotion.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 80a05cfee..ea1edceb1 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -96,6 +96,11 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'uploader': 'HotWaves1012', 'age_limit': 18, } + }, + # geo-restricted, player v5 + { + 'url': 'http://www.dailymotion.com/video/xhza0o', + 'only_matching': True, } ] @@ -124,6 +129,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor): if player_v5: player = self._parse_json(player_v5, video_id) metadata = player['metadata'] + + self._check_error(metadata) + formats = [] for quality, media_list in metadata['qualities'].items(): for media in media_list: @@ -201,9 +209,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'video info', flags=re.MULTILINE), video_id) - if info.get('error') is not None: - msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title'] - raise ExtractorError(msg, expected=True) + self._check_error(info) formats = [] for (key, format_id) in self._FORMATS: @@ -246,6 +252,11 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'duration': info['duration'] } + def _check_error(self, info): + if info.get('error') is not None: + msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title'] + raise ExtractorError(msg, expected=True) + def _get_subtitles(self, video_id, webpage): try: sub_list = self._download_webpage( -- cgit v1.2.3 From 648e6a1ffe45ceae2995c3f9ec6a9413aad55640 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Oct 2015 00:11:34 +0600 Subject: [youtube] Generalize playlist entries extraction (Closes #6699, closes #6992) --- youtube_dl/extractor/youtube.py | 121 +++++++++++++++++----------------------- 1 file changed, 52 insertions(+), 69 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b252e36e1..08e821362 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -178,6 +178,52 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return +class YoutubePlaylistBaseInfoExtractor(InfoExtractor): + # Extract the video ids from the playlist pages + def _entries(self, page, playlist_id): + more_widget_html = content_html = page + for page_num in itertools.count(1): + for video_id, video_title in self.extract_videos_from_page(content_html): + yield self.url_result( + video_id, 'Youtube', video_id=video_id, + video_title=video_title) + + mobj = re.search(r'data-uix-load-more-href="/?(?P[^"]+)"', more_widget_html) + if not mobj: + break + + more = self._download_json( + 'https://youtube.com/%s' % mobj.group('more'), playlist_id, + 'Downloading page #%s' % page_num, + transform_source=uppercase_escape) + content_html = more['content_html'] + if not content_html.strip(): + # Some webpages show a "Load more" button but they don't + # have more videos + break + more_widget_html = more['load_more_widget_html'] + + def extract_videos_from_page(self, page): + ids_in_page = [] + titles_in_page = [] + for mobj in re.finditer(self._VIDEO_RE, page): + # The link with index 0 is not the first video of the playlist (not sure if still actual) + if 'index' in mobj.groupdict() and mobj.group('id') == '0': + continue + video_id = mobj.group('id') + video_title = unescapeHTML(mobj.group('title')) + if video_title: + video_title = video_title.strip() + try: + idx = ids_in_page.index(video_id) + if video_title and not titles_in_page[idx]: + titles_in_page[idx] = video_title + except ValueError: + ids_in_page.append(video_id) + titles_in_page.append(video_title) + return zip(ids_in_page, titles_in_page) + + class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com' _VALID_URL = r"""(?x)^ @@ -1419,7 +1465,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } -class YoutubePlaylistIE(YoutubeBaseInfoExtractor): +class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com playlists' _VALID_URL = r"""(?x)(?: (?:https?://)? @@ -1440,7 +1486,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,}) )""" _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' - _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})&[^"]*?index=(?P\d+)' + _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})&[^"]*?index=(?P\d+)(?:[^>]+>(?P[^<]+))?' IE_NAME = 'youtube:playlist' _TESTS = [{ 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', @@ -1557,37 +1603,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): else: self.report_warning('Youtube gives an alert message: ' + match) - # Extract the video ids from the playlist pages - def _entries(): - more_widget_html = content_html = page - for page_num in itertools.count(1): - matches = re.finditer(self._VIDEO_RE, content_html) - # We remove the duplicates and the link with index 0 - # (it's not the first video of the playlist) - new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0') - for vid_id in new_ids: - yield self.url_result(vid_id, 'Youtube', video_id=vid_id) - - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: - break - - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), playlist_id, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape) - content_html = more['content_html'] - if not content_html.strip(): - # Some webpages show a "Load more" button but they don't - # have more videos - break - more_widget_html = more['load_more_widget_html'] - playlist_title = self._html_search_regex( r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>', page, 'title') - return self.playlist_result(_entries(), playlist_id, playlist_title) + return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title) def _real_extract(self, url): # Extract playlist id @@ -1613,10 +1633,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): return self._extract_playlist(playlist_id) -class YoutubeChannelIE(InfoExtractor): +class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com channels' _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)' _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' + _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' IE_NAME = 'youtube:channel' _TESTS = [{ 'note': 'paginated channel', @@ -1627,22 +1648,6 @@ class YoutubeChannelIE(InfoExtractor): } }] - @staticmethod - def extract_videos_from_page(page): - ids_in_page = [] - titles_in_page = [] - for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page): - video_id = mobj.group('id') - video_title = unescapeHTML(mobj.group('title')) - try: - idx = ids_in_page.index(video_id) - if video_title and not titles_in_page[idx]: - titles_in_page[idx] = video_title - except ValueError: - ids_in_page.append(video_id) - titles_in_page.append(video_title) - return zip(ids_in_page, titles_in_page) - def _real_extract(self, url): channel_id = self._match_id(url) @@ -1685,29 +1690,7 @@ class YoutubeChannelIE(InfoExtractor): for video_id, video_title in self.extract_videos_from_page(channel_page)] return self.playlist_result(entries, channel_id) - def _entries(): - more_widget_html = content_html = channel_page - for pagenum in itertools.count(1): - - for video_id, video_title in self.extract_videos_from_page(content_html): - yield self.url_result( - video_id, 'Youtube', video_id=video_id, - video_title=video_title) - - mobj = re.search( - r'data-uix-load-more-href="/?(?P<more>[^"]+)"', - more_widget_html) - if not mobj: - break - - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), channel_id, - 'Downloading page #%s' % (pagenum + 1), - transform_source=uppercase_escape) - content_html = more['content_html'] - more_widget_html = more['load_more_widget_html'] - - return self.playlist_result(_entries(), channel_id) + return self.playlist_result(self._entries(channel_page, channel_id), channel_id) class YoutubeUserIE(YoutubeChannelIE): -- cgit v1.2.3 From 7593fbaa126f8bf14eecff7f103cb497e3d31de5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 01:00:37 +0600 Subject: [dailymotion] Error spelling --- youtube_dl/extractor/dailymotion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index ea1edceb1..9cd9ff17d 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -254,8 +254,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor): def _check_error(self, info): if info.get('error') is not None: - msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title'] - raise ExtractorError(msg, expected=True) + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, info['error']['title']), expected=True) def _get_subtitles(self, video_id, webpage): try: -- cgit v1.2.3 From 5a11b793fe70beb6b0c7a74a489db9e52c4a742b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 01:36:03 +0600 Subject: [lynda] Extract all prioritized streams --- youtube_dl/extractor/lynda.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 378117270..5c973e75c 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -140,13 +140,14 @@ class LyndaIE(LyndaBaseIE): prioritized_streams = video_json.get('PrioritizedStreams') if prioritized_streams: - formats.extend([ - { - 'url': video_url, - 'width': int_or_none(format_id), - 'format_id': format_id, - } for format_id, video_url in prioritized_streams['0'].items() - ]) + for prioritized_stream_id, prioritized_stream in prioritized_streams.items(): + formats.extend([ + { + 'url': video_url, + 'width': int_or_none(format_id), + 'format_id': '%s-%s' % (prioritized_stream_id, format_id), + } for format_id, video_url in prioritized_stream.items() + ]) self._check_formats(formats, video_id) self._sort_formats(formats) -- cgit v1.2.3 From 355c7ad361aa3c8a57ff83e3f702a496dce59e65 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 17 Oct 2015 21:30:38 +0100 Subject: [cspan] handle error massages and extract qualities --- youtube_dl/extractor/cspan.py | 67 ++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 26 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index 994e080d5..c74b35fd9 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -9,16 +9,21 @@ from ..utils import ( find_xpath_attr, smuggle_url, determine_ext, + ExtractorError, ) from .senateisvp import SenateISVPIE +def get_text_attr(d, attr): + return d.get(attr, {}).get('#text') + + class CSpanIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P<id>[0-9a-f]+)' IE_DESC = 'C-SPAN' _TESTS = [{ 'url': 'http://www.c-span.org/video/?313572-1/HolderonV', - 'md5': '067803f994e049b455a58b16e5aab442', + 'md5': '94b29a4f131ff03d23471dd6f60b6a1d', 'info_dict': { 'id': '315139', 'ext': 'mp4', @@ -28,7 +33,7 @@ class CSpanIE(InfoExtractor): 'skip': 'Regularly fails on travis, for unknown reasons', }, { 'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models', - 'md5': '4eafd1e91a75d2b1e6a3cbd0995816a2', + 'md5': '8e5fbfabe6ad0f89f3012a7943c1287b', 'info_dict': { 'id': 'c4486943', 'ext': 'mp4', @@ -37,7 +42,7 @@ class CSpanIE(InfoExtractor): } }, { 'url': 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall', - 'md5': '446562a736c6bf97118e389433ed88d4', + 'md5': '2ae5051559169baadba13fc35345ae74', 'info_dict': { 'id': '342759', 'ext': 'mp4', @@ -71,8 +76,10 @@ class CSpanIE(InfoExtractor): return self.url_result(surl, 'SenateISVP', video_id, title) data = self._download_json( - 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id), - video_id) + 'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id), + video_id)['video'] + if data['@status'] != 'Success': + raise ExtractorError('%s said: %s' % (self.IE_NAME, get_text_attr(data, 'error')), expected=True) doc = self._download_xml( 'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id), @@ -83,28 +90,36 @@ class CSpanIE(InfoExtractor): title = find_xpath_attr(doc, './/string', 'name', 'title').text thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text - files = data['video']['files'] - try: - capfile = data['video']['capfile']['#text'] - except KeyError: - capfile = None + files = data['files'] + capfile = get_text_attr(data, 'capfile') - entries = [{ - 'id': '%s_%d' % (video_id, partnum + 1), - 'title': ( - title if len(files) == 1 else - '%s part %d' % (title, partnum + 1)), - 'url': unescapeHTML(f['path']['#text']), - 'description': description, - 'thumbnail': thumbnail, - 'duration': int_or_none(f.get('length', {}).get('#text')), - 'subtitles': { - 'en': [{ - 'url': capfile, - 'ext': determine_ext(capfile, 'dfxp') - }], - } if capfile else None, - } for partnum, f in enumerate(files)] + entries = [] + for partnum, f in enumerate(files): + formats = [] + for quality in f['qualities']: + formats.append({ + 'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')), + 'url': unescapeHTML(get_text_attr(quality, 'file')), + 'height': int_or_none(get_text_attr(quality, 'height')), + 'tbr': int_or_none(get_text_attr(quality, 'bitrate')), + }) + self._sort_formats(formats) + entries.append({ + 'id': '%s_%d' % (video_id, partnum + 1), + 'title': ( + title if len(files) == 1 else + '%s part %d' % (title, partnum + 1)), + 'formats': formats, + 'description': description, + 'thumbnail': thumbnail, + 'duration': int_or_none(get_text_attr(f, 'length')), + 'subtitles': { + 'en': [{ + 'url': capfile, + 'ext': determine_ext(capfile, 'dfxp') + }], + } if capfile else None, + }) if len(entries) == 1: entry = dict(entries[0]) -- cgit v1.2.3 From 80f48920c8a909ba55d13932524e55ed970f1c6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 06:57:57 +0600 Subject: [crunchyroll] Bypass maturity wall (Closes #7202) --- youtube_dl/extractor/crunchyroll.py | 59 ++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 23 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 95952bc29..aa258bbc2 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -32,6 +32,26 @@ from ..aes import ( class CrunchyrollBaseIE(InfoExtractor): + _NETRC_MACHINE = 'crunchyroll' + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + self.report_login() + login_url = 'https://www.crunchyroll.com/?a=formhandler' + data = urlencode_postdata({ + 'formname': 'RpcApiUser_Login', + 'name': username, + 'password': password, + }) + login_request = compat_urllib_request.Request(login_url, data) + login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') + self._download_webpage(login_request, None, False, 'Wrong login info') + + def _real_initialize(self): + self._login() + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request) else compat_urllib_request.Request(url_or_request)) @@ -46,10 +66,22 @@ class CrunchyrollBaseIE(InfoExtractor): return super(CrunchyrollBaseIE, self)._download_webpage( request, video_id, note, errnote, fatal, tries, timeout, encoding) + @staticmethod + def _add_skip_wall(url): + parsed_url = compat_urlparse.urlparse(url) + qs = compat_urlparse.parse_qs(parsed_url.query) + # Always force skip_wall to bypass maturity wall, namely 18+ confirmation message: + # > This content may be inappropriate for some people. + # > Are you sure you want to continue? + # since it's not disabled by default in crunchyroll account's settings. + # See https://github.com/rg3/youtube-dl/issues/7202. + qs['skip_wall'] = ['1'] + return compat_urlparse.urlunparse( + parsed_url._replace(query=compat_urllib_parse.urlencode(qs, True))) + class CrunchyrollIE(CrunchyrollBaseIE): _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)' - _NETRC_MACHINE = 'crunchyroll' _TESTS = [{ 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', 'info_dict': { @@ -81,7 +113,6 @@ class CrunchyrollIE(CrunchyrollBaseIE): # rtmp 'skip_download': True, }, - }, { 'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697', 'only_matching': True, @@ -94,24 +125,6 @@ class CrunchyrollIE(CrunchyrollBaseIE): '1080': ('80', '108'), } - def _login(self): - (username, password) = self._get_login_info() - if username is None: - return - self.report_login() - login_url = 'https://www.crunchyroll.com/?a=formhandler' - data = urlencode_postdata({ - 'formname': 'RpcApiUser_Login', - 'name': username, - 'password': password, - }) - login_request = compat_urllib_request.Request(login_url, data) - login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - self._download_webpage(login_request, None, False, 'Wrong login info') - - def _real_initialize(self): - self._login() - def _decrypt_subtitles(self, data, iv, id): data = bytes_to_intlist(base64.b64decode(data.encode('utf-8'))) iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8'))) @@ -254,7 +267,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text else: webpage_url = 'http://www.' + mobj.group('url') - webpage = self._download_webpage(webpage_url, video_id, 'Downloading webpage') + webpage = self._download_webpage(self._add_skip_wall(webpage_url), video_id, 'Downloading webpage') note_m = self._html_search_regex( r'<div class="showmedia-trailer-notice">(.+?)</div>', webpage, 'trailer-notice', default='') @@ -352,7 +365,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): IE_NAME = "crunchyroll:playlist" - _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?$' + _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?(?:\?|$)' _TESTS = [{ 'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', @@ -366,7 +379,7 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): def _real_extract(self, url): show_id = self._match_id(url) - webpage = self._download_webpage(url, show_id) + webpage = self._download_webpage(self._add_skip_wall(url), show_id) title = self._html_search_regex( r'(?s)<h1[^>]*>\s*<span itemprop="name">(.*?)</span>', webpage, 'title') -- cgit v1.2.3 From 49941c4e4f6e33785a3be1e0d103bd81657d8a0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 07:06:47 +0600 Subject: [crunchyroll] Add maturity wall reference tests (#7202) --- youtube_dl/extractor/crunchyroll.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index aa258bbc2..cecd0c784 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -116,6 +116,10 @@ class CrunchyrollIE(CrunchyrollBaseIE): }, { 'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697', 'only_matching': True, + }, { + # geo-restricted (US), 18+ maturity wall, non-premium available + 'url': 'http://www.crunchyroll.com/cosplay-complex-ova/episode-1-the-birth-of-the-cosplay-club-565617', + 'only_matching': True, }] _FORMAT_IDS = { @@ -374,6 +378,19 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): 'title': 'A Bridge to the Starry Skies - Hoshizora e Kakaru Hashi' }, 'playlist_count': 13, + }, { + # geo-restricted (US), 18+ maturity wall, non-premium available + 'url': 'http://www.crunchyroll.com/cosplay-complex-ova', + 'info_dict': { + 'id': 'cosplay-complex-ova', + 'title': 'Cosplay Complex OVA' + }, + 'playlist_count': 3, + 'skip': 'Georestricted', + }, { + # geo-restricted (US), 18+ maturity wall, non-premium will be available since 2015.11.14 + 'url': 'http://www.crunchyroll.com/ladies-versus-butlers?skip_wall=1', + 'only_matching': True, }] def _real_extract(self, url): -- cgit v1.2.3 From 448ef1f31c8bcc1550cf907fd46e31026ec981b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 09:11:02 +0600 Subject: [extractor/common] Allow angle brackets in attributes in _og_regexes (#7215) --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a0c4af92f..4365077f1 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -645,7 +645,7 @@ class InfoExtractor(object): # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\'|\s*([^\s"\'=<>`]+?))' + content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)' % {'prop': re.escape(prop)}) template = r'<meta[^>]+?%s[^>]+?%s' -- cgit v1.2.3 From 94a773feb94a20be66526348a57ebe20495eba3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= <lukas@oxygene.sk> Date: Sat, 17 Oct 2015 22:25:08 +0200 Subject: [vine] Use JS data to get title/alt_title --- youtube_dl/extractor/vine.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index c733a48fa..d80b580a0 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -51,6 +51,21 @@ class VineIE(InfoExtractor): }, { 'url': 'https://vine.co/oembed/MYxVapFvz2z.json', 'only_matching': True, + }, { + 'url': 'https://vine.co/v/e192BnZnZ9V', + 'info_dict': { + 'id': 'e192BnZnZ9V', + 'ext': 'mp4', + 'title': u'\u0e22\u0e34\u0e49\u0e21~ \u0e40\u0e02\u0e34\u0e19~ \u0e2d\u0e32\u0e22~ \u0e19\u0e48\u0e32\u0e23\u0e49\u0e32\u0e01\u0e2d\u0e49\u0e30 >//< @n_whitewo @orlameena #lovesicktheseries #lovesickseason2', + 'alt_title': 'Vine by Pimry_zaa', + 'description': u'\u0e22\u0e34\u0e49\u0e21~ \u0e40\u0e02\u0e34\u0e19~ \u0e2d\u0e32\u0e22~ \u0e19\u0e48\u0e32\u0e23\u0e49\u0e32\u0e01\u0e2d\u0e49\u0e30 >//< @n_whitewo @orlameena #lovesicktheseries #lovesickseason2', + 'upload_date': '20150705', + 'uploader': 'Pimry_zaa', + 'uploader_id': '1135760698325307392', + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -74,8 +89,8 @@ class VineIE(InfoExtractor): return { 'id': video_id, - 'title': self._og_search_title(webpage), - 'alt_title': self._og_search_description(webpage, default=None), + 'title': data['description'], + 'alt_title': 'Vine by %s' % data['username'], 'description': data['description'], 'thumbnail': data['thumbnailUrl'], 'upload_date': unified_strdate(data['created']), -- cgit v1.2.3 From 10c38c7ca248d06c2c0f069c5a810e27e207c61e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= <lukas@oxygene.sk> Date: Sat, 17 Oct 2015 22:29:49 +0200 Subject: [vine] Fix download tests --- youtube_dl/extractor/vine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index d80b580a0..d1dbec893 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -29,10 +29,10 @@ class VineIE(InfoExtractor): 'id': 'MYxVapFvz2z', 'ext': 'mp4', 'title': 'Fuck Da Police #Mikebrown #justice #ferguson #prayforferguson #protesting #NMOS14', - 'alt_title': 'Vine by Luna', + 'alt_title': 'Vine by Mars Ruiz', 'description': 'Fuck Da Police #Mikebrown #justice #ferguson #prayforferguson #protesting #NMOS14', 'upload_date': '20140815', - 'uploader': 'Luna', + 'uploader': 'Mars Ruiz', 'uploader_id': '1102363502380728320', }, }, { -- cgit v1.2.3 From 91816e8f16408a3a2753fb254a9e963ad9429ced Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 09:32:08 +0600 Subject: [vine] Remove duplicate metadata, make more robust and modernize (Closes #7215) --- youtube_dl/extractor/vine.py | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index d1dbec893..6e72cc253 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -1,10 +1,14 @@ +# coding: utf-8 from __future__ import unicode_literals import re import itertools from .common import InfoExtractor -from ..utils import unified_strdate +from ..utils import ( + int_or_none, + unified_strdate, +) class VineIE(InfoExtractor): @@ -17,7 +21,6 @@ class VineIE(InfoExtractor): 'ext': 'mp4', 'title': 'Chicken.', 'alt_title': 'Vine by Jack Dorsey', - 'description': 'Chicken.', 'upload_date': '20130519', 'uploader': 'Jack Dorsey', 'uploader_id': '76', @@ -30,7 +33,6 @@ class VineIE(InfoExtractor): 'ext': 'mp4', 'title': 'Fuck Da Police #Mikebrown #justice #ferguson #prayforferguson #protesting #NMOS14', 'alt_title': 'Vine by Mars Ruiz', - 'description': 'Fuck Da Police #Mikebrown #justice #ferguson #prayforferguson #protesting #NMOS14', 'upload_date': '20140815', 'uploader': 'Mars Ruiz', 'uploader_id': '1102363502380728320', @@ -43,7 +45,6 @@ class VineIE(InfoExtractor): 'ext': 'mp4', 'title': '#mw3 #ac130 #killcam #angelofdeath', 'alt_title': 'Vine by Z3k3', - 'description': '#mw3 #ac130 #killcam #angelofdeath', 'upload_date': '20130430', 'uploader': 'Z3k3', 'uploader_id': '936470460173008896', @@ -56,9 +57,8 @@ class VineIE(InfoExtractor): 'info_dict': { 'id': 'e192BnZnZ9V', 'ext': 'mp4', - 'title': u'\u0e22\u0e34\u0e49\u0e21~ \u0e40\u0e02\u0e34\u0e19~ \u0e2d\u0e32\u0e22~ \u0e19\u0e48\u0e32\u0e23\u0e49\u0e32\u0e01\u0e2d\u0e49\u0e30 >//< @n_whitewo @orlameena #lovesicktheseries #lovesickseason2', + 'title': 'ยิ้ม~ เขิน~ อาย~ น่าร้ากอ้ะ >//< @n_whitewo @orlameena #lovesicktheseries #lovesickseason2', 'alt_title': 'Vine by Pimry_zaa', - 'description': u'\u0e22\u0e34\u0e49\u0e21~ \u0e40\u0e02\u0e34\u0e19~ \u0e2d\u0e32\u0e22~ \u0e19\u0e48\u0e32\u0e23\u0e49\u0e32\u0e01\u0e2d\u0e49\u0e30 >//< @n_whitewo @orlameena #lovesicktheseries #lovesickseason2', 'upload_date': '20150705', 'uploader': 'Pimry_zaa', 'uploader_id': '1135760698325307392', @@ -80,25 +80,26 @@ class VineIE(InfoExtractor): formats = [{ 'format_id': '%(format)s-%(rate)s' % f, - 'vcodec': f['format'], - 'quality': f['rate'], + 'vcodec': f.get('format'), + 'quality': f.get('rate'), 'url': f['videoUrl'], - } for f in data['videoUrls']] + } for f in data['videoUrls'] if f.get('videoUrl')] self._sort_formats(formats) + username = data.get('username') + return { 'id': video_id, - 'title': data['description'], - 'alt_title': 'Vine by %s' % data['username'], - 'description': data['description'], - 'thumbnail': data['thumbnailUrl'], - 'upload_date': unified_strdate(data['created']), - 'uploader': data['username'], - 'uploader_id': data['userIdStr'], - 'like_count': data['likes']['count'], - 'comment_count': data['comments']['count'], - 'repost_count': data['reposts']['count'], + 'title': data.get('description') or self._og_search_title(webpage), + 'alt_title': 'Vine by %s' % username if username else self._og_search_description(webpage, default=None), + 'thumbnail': data.get('thumbnailUrl'), + 'upload_date': unified_strdate(data.get('created')), + 'uploader': username, + 'uploader_id': data.get('userIdStr'), + 'like_count': int_or_none(data.get('likes', {}).get('count')), + 'comment_count': int_or_none(data.get('comments', {}).get('count')), + 'repost_count': int_or_none(data.get('reposts', {}).get('count')), 'formats': formats, } -- cgit v1.2.3 From 02835c6bf4403a907c058d43220a83b3b427e181 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 09:34:54 +0600 Subject: [extractor/common] Document repost_count --- youtube_dl/extractor/common.py | 1 + 1 file changed, 1 insertion(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 4365077f1..6169fbbeb 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -172,6 +172,7 @@ class InfoExtractor(object): view_count: How many users have watched the video on the platform. like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video + repost_count: Number of reposts of the video average_rating: Average rating give by users, the scale used depends on the webpage comment_count: Number of comments on the video comments: A list of comments, each with one or more of the following -- cgit v1.2.3 From 2e022397c45fbcfd2ef6da43d14b0770221aabd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 09:36:19 +0600 Subject: [vine] Add counters to tests --- youtube_dl/extractor/vine.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index 6e72cc253..be72f3147 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -24,6 +24,9 @@ class VineIE(InfoExtractor): 'upload_date': '20130519', 'uploader': 'Jack Dorsey', 'uploader_id': '76', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }, { 'url': 'https://vine.co/v/MYxVapFvz2z', @@ -36,6 +39,9 @@ class VineIE(InfoExtractor): 'upload_date': '20140815', 'uploader': 'Mars Ruiz', 'uploader_id': '1102363502380728320', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }, { 'url': 'https://vine.co/v/bxVjBbZlPUH', @@ -48,6 +54,9 @@ class VineIE(InfoExtractor): 'upload_date': '20130430', 'uploader': 'Z3k3', 'uploader_id': '936470460173008896', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }, { 'url': 'https://vine.co/oembed/MYxVapFvz2z.json', @@ -62,6 +71,9 @@ class VineIE(InfoExtractor): 'upload_date': '20150705', 'uploader': 'Pimry_zaa', 'uploader_id': '1135760698325307392', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, 'params': { 'skip_download': True, -- cgit v1.2.3 From 1e399778ee870ee583135e65458268cd7c0fb923 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 22 Jul 2015 20:03:05 +0800 Subject: [letv] Fix extraction Using data URIs for passing the decrypted M3U8 manifest, which is supported by ffmpeg only. --- youtube_dl/extractor/letv.py | 70 ++++++++++++++++++++++++++++++++------------ 1 file changed, 52 insertions(+), 18 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index a28abb0f0..9ebbc8089 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -9,13 +9,14 @@ from .common import InfoExtractor from ..compat import ( compat_urllib_parse, compat_urllib_request, - compat_urlparse, + compat_ord, ) from ..utils import ( determine_ext, ExtractorError, parse_iso8601, int_or_none, + encode_data_uri, ) @@ -25,15 +26,16 @@ class LetvIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.letv.com/ptv/vplay/22005890.html', - 'md5': 'cab23bd68d5a8db9be31c9a222c1e8df', + 'md5': 'edadcfe5406976f42f9f266057ee5e40', 'info_dict': { 'id': '22005890', 'ext': 'mp4', 'title': '第87届奥斯卡颁奖礼完美落幕 《鸟人》成最大赢家', - 'timestamp': 1424747397, - 'upload_date': '20150224', 'description': 'md5:a9cb175fd753e2962176b7beca21a47c', - } + }, + 'params': { + 'hls_prefer_native': True, + }, }, { 'url': 'http://www.letv.com/ptv/vplay/1415246.html', 'info_dict': { @@ -42,16 +44,22 @@ class LetvIE(InfoExtractor): 'title': '美人天下01', 'description': 'md5:f88573d9d7225ada1359eaf0dbf8bcda', }, + 'params': { + 'hls_prefer_native': True, + }, }, { 'note': 'This video is available only in Mainland China, thus a proxy is needed', 'url': 'http://www.letv.com/ptv/vplay/1118082.html', - 'md5': 'f80936fbe20fb2f58648e81386ff7927', + 'md5': '2424c74948a62e5f31988438979c5ad1', 'info_dict': { 'id': '1118082', 'ext': 'mp4', 'title': '与龙共舞 完整版', 'description': 'md5:7506a5eeb1722bb9d4068f85024e3986', }, + 'params': { + 'hls_prefer_native': True, + }, 'skip': 'Only available in China', }] @@ -74,6 +82,27 @@ class LetvIE(InfoExtractor): _loc3_ = self.ror(_loc3_, _loc2_ % 17) return _loc3_ + # see M3U8Encryption class in KLetvPlayer.swf + @staticmethod + def decrypt_m3u8(encrypted_data): + if encrypted_data[:5].decode('utf-8').lower() != 'vc_01': + return encrypted_data + encrypted_data = encrypted_data[5:] + + _loc4_ = bytearray() + while encrypted_data: + b = compat_ord(encrypted_data[0]) + _loc4_.extend([b // 16, b & 0x0f]) + encrypted_data = encrypted_data[1:] + idx = len(_loc4_) - 11 + _loc4_ = _loc4_[idx:] + _loc4_[:idx] + _loc7_ = bytearray() + while _loc4_: + _loc7_.append(_loc4_[0] * 16 + _loc4_[1]) + _loc4_ = _loc4_[2:] + + return bytes(_loc7_) + def _real_extract(self, url): media_id = self._match_id(url) page = self._download_webpage(url, media_id) @@ -115,23 +144,28 @@ class LetvIE(InfoExtractor): for format_id in formats: if format_id in dispatch: media_url = playurl['domain'][0] + dispatch[format_id][0] - - # Mimic what flvxz.com do - url_parts = list(compat_urlparse.urlparse(media_url)) - qs = dict(compat_urlparse.parse_qs(url_parts[4])) - qs.update({ - 'platid': '14', - 'splatid': '1401', - 'tss': 'no', - 'retry': 1 + media_url += '&' + compat_urllib_parse.urlencode({ + 'm3v': 1, + 'format': 1, + 'expect': 3, + 'rateid': format_id, }) - url_parts[4] = compat_urllib_parse.urlencode(qs) - media_url = compat_urlparse.urlunparse(url_parts) + + nodes_data = self._download_json( + media_url, media_id, + 'Download JSON metadata for format %s' % format_id) + + req = self._request_webpage( + nodes_data['nodelist'][0]['location'], media_id, + note='Downloading m3u8 information for format %s' % format_id) + + m3u8_data = self.decrypt_m3u8(req.read()) url_info_dict = { - 'url': media_url, + 'url': encode_data_uri(m3u8_data, 'application/x-mpegURL'), 'ext': determine_ext(dispatch[format_id][1]), 'format_id': format_id, + 'protocol': 'm3u8', } if format_id[-1:] == 'p': -- cgit v1.2.3 From 48aae2d2cf49843d0efa227fa393a0c783fc3c1e Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 18 Oct 2015 17:07:48 +0800 Subject: [twitter] Update tests --- youtube_dl/extractor/twitter.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 1dd43ff3c..b2fff73b9 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -15,7 +16,7 @@ class TwitterCardIE(InfoExtractor): _TESTS = [ { 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', - 'md5': 'a74f50b310c83170319ba16de6955192', + 'md5': '7d2f6b4d2eb841a7ccc893d479bfceb4', 'info_dict': { 'id': '560070183650213889', 'ext': 'mp4', @@ -103,17 +104,17 @@ class TwitterIE(TwitterCardIE): _VALID_URL = r'https?://(?:www|m|mobile)?\.?twitter\.com/(?P<id>[^/]+/status/\d+)' _TESTS = [{ - 'url': 'https://m.twitter.com/thereaIbanksy/status/614301758345490432', - 'md5': '8bbccb487bd7a31349b775915fcd412f', + 'url': 'https://twitter.com/freethenipple/status/643211948184596480', + 'md5': '31cd83a116fc41f99ae3d909d4caf6a0', 'info_dict': { - 'id': '614301758345490432', + 'id': '643211948184596480', 'ext': 'mp4', - 'title': 'thereaIbanksy - This time lapse is so pretty \U0001f60d\U0001f60d', + 'title': 'freethenipple - FTN supporters on Hollywood Blvd today!', 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 29.5, - 'description': 'banksy on Twitter: "This time lapse is so pretty \U0001f60d\U0001f60d http://t.co/QB8DDbqiR1"', - 'uploader': 'banksy', - 'uploader_id': 'thereaIbanksy', + 'duration': 12.922, + 'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"', + 'uploader': 'FREE THE NIPPLE', + 'uploader_id': 'freethenipple', }, }] -- cgit v1.2.3 From 01d22d47039dedace1c5414c83e9fecfca41b5a5 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 18 Oct 2015 17:11:55 +0800 Subject: [twitter] Use _download_xml --- youtube_dl/extractor/twitter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index b2fff73b9..37a9fd5fd 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -8,6 +8,7 @@ from ..compat import compat_urllib_request from ..utils import ( float_or_none, unescapeHTML, + xpath_text, ) @@ -60,9 +61,8 @@ class TwitterCardIE(InfoExtractor): video_id) if 'playlist' not in config: if 'vmapUrl' in config: - webpage = self._download_webpage(config['vmapUrl'], video_id + ' (xml)') - video_url = self._search_regex( - r'<MediaFile>\s*<!\[CDATA\[(https?://.+?)\]\]>', webpage, 'data player config (xml)') + vmap_data = self._download_xml(config['vmapUrl'], video_id) + video_url = xpath_text(vmap_data, './/MediaFile').strip() f = { 'url': video_url, } -- cgit v1.2.3 From 014e880372e896cdd63f9075864d2a3bba60e706 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 18 Oct 2015 17:13:58 +0800 Subject: [twitter] Add IE_NAMEs --- youtube_dl/extractor/twitter.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 37a9fd5fd..5f697782e 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -13,6 +13,7 @@ from ..utils import ( class TwitterCardIE(InfoExtractor): + IE_NAME = 'twitter:card' _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)' _TESTS = [ { @@ -101,6 +102,7 @@ class TwitterCardIE(InfoExtractor): class TwitterIE(TwitterCardIE): + IE_NAME = 'twitter' _VALID_URL = r'https?://(?:www|m|mobile)?\.?twitter\.com/(?P<id>[^/]+/status/\d+)' _TESTS = [{ -- cgit v1.2.3 From f322bfb0638aeeb527459ebcf00f8a3dde26280c Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 18 Oct 2015 17:15:47 +0800 Subject: [twitter:card] Remove unneeded 'ext' --- youtube_dl/extractor/twitter.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 5f697782e..48bef5d80 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -64,13 +64,9 @@ class TwitterCardIE(InfoExtractor): if 'vmapUrl' in config: vmap_data = self._download_xml(config['vmapUrl'], video_id) video_url = xpath_text(vmap_data, './/MediaFile').strip() - f = { + formats.append({ 'url': video_url, - } - ext = re.search(r'\.([a-z0-9]{2,4})(\?.+)?$', video_url) - if ext: - f['ext'] = ext.group(1) - formats.append(f) + }) break # same video regardless of UA continue -- cgit v1.2.3 From e04edad621efe56347e155b6dc59a0c3d589b3bd Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 18 Oct 2015 17:16:57 +0800 Subject: [twitter] Inherit from InfoExtractor directly --- youtube_dl/extractor/twitter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 48bef5d80..c9b783745 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -97,11 +97,11 @@ class TwitterCardIE(InfoExtractor): } -class TwitterIE(TwitterCardIE): +class TwitterIE(InfoExtractor): IE_NAME = 'twitter' _VALID_URL = r'https?://(?:www|m|mobile)?\.?twitter\.com/(?P<id>[^/]+/status/\d+)' - _TESTS = [{ + _TEST = { 'url': 'https://twitter.com/freethenipple/status/643211948184596480', 'md5': '31cd83a116fc41f99ae3d909d4caf6a0', 'info_dict': { @@ -114,7 +114,7 @@ class TwitterIE(TwitterCardIE): 'uploader': 'FREE THE NIPPLE', 'uploader_id': 'freethenipple', }, - }] + } def _real_extract(self, url): id = self._match_id(url) -- cgit v1.2.3 From f6dfd6603a9e9bb88ebcdcd52490974a34d1bd11 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 18 Oct 2015 17:18:01 +0800 Subject: [twitter] Use _html_search_regex --- youtube_dl/extractor/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index c9b783745..6ff15369c 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -122,7 +122,7 @@ class TwitterIE(InfoExtractor): name = username url = re.sub(r'https?://(m|mobile)\.', 'https://', url) webpage = self._download_webpage(url, 'tweet: ' + url) - description = unescapeHTML(self._search_regex('<title>\s*(.+?)\s*', webpage, 'title')) + description = self._html_search_regex('\s*(.+?)\s*', webpage, 'title') title = description.replace('\n', ' ') splitdesc = re.match(r'^(.+?)\s*on Twitter:\s* "(.+?)"$', title) if splitdesc: -- cgit v1.2.3 From 575036b40504bc921b18f05bde64e0e7dceacec6 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 18 Oct 2015 18:04:13 +0800 Subject: [twitter] Simplify and improve --- youtube_dl/extractor/twitter.py | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 6ff15369c..6b3b39aee 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -9,6 +9,7 @@ from ..utils import ( float_or_none, unescapeHTML, xpath_text, + remove_end, ) @@ -99,7 +100,8 @@ class TwitterCardIE(InfoExtractor): class TwitterIE(InfoExtractor): IE_NAME = 'twitter' - _VALID_URL = r'https?://(?:www|m|mobile)?\.?twitter\.com/(?P[^/]+/status/\d+)' + _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?P[^/]+)/status/(?P\d+)' + _TEMPLATE_URL = 'https://twitter.com/%s/status/%s' _TEST = { 'url': 'https://twitter.com/freethenipple/status/643211948184596480', @@ -107,7 +109,7 @@ class TwitterIE(InfoExtractor): 'info_dict': { 'id': '643211948184596480', 'ext': 'mp4', - 'title': 'freethenipple - FTN supporters on Hollywood Blvd today!', + 'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!', 'thumbnail': 're:^https?://.*\.jpg', 'duration': 12.922, 'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"', @@ -117,26 +119,31 @@ class TwitterIE(InfoExtractor): } def _real_extract(self, url): - id = self._match_id(url) - username, twid = re.match(r'([^/]+)/status/(\d+)', id).groups() - name = username - url = re.sub(r'https?://(m|mobile)\.', 'https://', url) - webpage = self._download_webpage(url, 'tweet: ' + url) - description = self._html_search_regex('\s*(.+?)\s*', webpage, 'title') - title = description.replace('\n', ' ') - splitdesc = re.match(r'^(.+?)\s*on Twitter:\s* "(.+?)"$', title) - if splitdesc: - name, title = splitdesc.groups() - title = re.sub(r'\s*https?://[^ ]+', '', title) # strip 'https -_t.co_BJYgOjSeGA' junk from filenames - card_id = self._search_regex(r'["\']/i/cards/tfw/v1/(\d+)', webpage, '/i/card/...') + mobj = re.match(self._VALID_URL, url) + user_id = mobj.group('user_id') + twid = mobj.group('id') + + webpage = self._download_webpage(self._TEMPLATE_URL % (user_id, twid), twid) + + username = remove_end(self._og_search_title(webpage), ' on Twitter') + + title = self._og_search_description(webpage).strip('').replace('\n', ' ') + + # strip 'https -_t.co_BJYgOjSeGA' junk from filenames + mobj = re.match(r'“(.*)\s+(http://[^ ]+)”', title) + title, short_url = mobj.groups() + + card_id = self._search_regex( + r'["\']/i/cards/tfw/v1/(\d+)', webpage, 'twitter card url') card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id + return { '_type': 'url_transparent', 'ie_key': 'TwitterCard', - 'uploader_id': username, - 'uploader': name, + 'uploader_id': user_id, + 'uploader': username, 'url': card_url, 'webpage_url': url, - 'description': description, + 'description': '%s on Twitter: "%s %s"' % (username, title, short_url), 'title': username + ' - ' + title, } -- cgit v1.2.3 From 77a54b6a658059a11de415d793588fdbfec14194 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 18 Oct 2015 18:08:24 +0800 Subject: [twitter:card] Use _html_search_regex --- youtube_dl/extractor/twitter.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 6b3b39aee..1cdca544c 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -7,7 +7,6 @@ from .common import InfoExtractor from ..compat import compat_urllib_request from ..utils import ( float_or_none, - unescapeHTML, xpath_text, remove_end, ) @@ -57,9 +56,8 @@ class TwitterCardIE(InfoExtractor): request.add_header('User-Agent', user_agent) webpage = self._download_webpage(request, video_id) - config = self._parse_json( - unescapeHTML(self._search_regex( - r'data-player-config="([^"]+)"', webpage, 'data player config')), + config = self._parse_json(self._html_search_regex( + r'data-player-config="([^"]+)"', webpage, 'data player config'), video_id) if 'playlist' not in config: if 'vmapUrl' in config: -- cgit v1.2.3 From c88aec845a680ef9404b637b3dbcf706dcf00b68 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 18 Oct 2015 18:23:56 +0800 Subject: [twitter] Fix short URL extraction --- youtube_dl/extractor/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 1cdca544c..1472f22a7 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -128,7 +128,7 @@ class TwitterIE(InfoExtractor): title = self._og_search_description(webpage).strip('').replace('\n', ' ') # strip 'https -_t.co_BJYgOjSeGA' junk from filenames - mobj = re.match(r'“(.*)\s+(http://[^ ]+)”', title) + mobj = re.match(r'“(.*)\s+(https?://[^ ]+)”', title) title, short_url = mobj.groups() card_id = self._search_regex( -- cgit v1.2.3 From 4a7b79038425f614af49116edab7897f0db13e5a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 18 Oct 2015 19:07:37 +0800 Subject: [twitter:card] Support YouTube embeds --- youtube_dl/extractor/twitter.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 1472f22a7..9d3e46b94 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -37,6 +37,19 @@ class TwitterCardIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg', 'duration': 80.155, }, + }, + { + 'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977', + 'md5': 'b6f35e8b08a0bec6c8af77a2f4b3a814', + 'info_dict': { + 'id': 'dq4Oj5quskI', + 'ext': 'mp4', + 'title': 'Ubuntu 11.10 Overview', + 'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10-things-to-do-after-installing-ubuntu-11-10/', + 'upload_date': '20111013', + 'uploader': 'OMG! Ubuntu!', + 'uploader_id': 'omgubuntu', + }, } ] @@ -56,6 +69,12 @@ class TwitterCardIE(InfoExtractor): request.add_header('User-Agent', user_agent) webpage = self._download_webpage(request, video_id) + youtube_url = self._html_search_regex( + r']+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"', + webpage, 'youtube iframe', default=None) + if youtube_url: + return self.url_result(youtube_url, 'Youtube') + config = self._parse_json(self._html_search_regex( r'data-player-config="([^"]+)"', webpage, 'data player config'), video_id) -- cgit v1.2.3 From 05a3879f1c142cc2bf0287cde4690d8ccadcdc8f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 18 Oct 2015 19:19:46 +0800 Subject: [letv] Update M3U8's MIME type The new MIME type appears in the following places: https://www.iana.org/assignments/media-types/media-types.xhtml#application https://hg.python.org/cpython/file/tip/Lib/mimetypes.py --- youtube_dl/extractor/letv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index 9ebbc8089..effd9eb92 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -162,7 +162,7 @@ class LetvIE(InfoExtractor): m3u8_data = self.decrypt_m3u8(req.read()) url_info_dict = { - 'url': encode_data_uri(m3u8_data, 'application/x-mpegURL'), + 'url': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'), 'ext': determine_ext(dispatch[format_id][1]), 'format_id': format_id, 'protocol': 'm3u8', -- cgit v1.2.3 From dd67702a3ea007369109ee8e4b67043064e1f759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 18 Oct 2015 14:13:06 +0200 Subject: [imdb] Fix extraction (fixes #7220) --- youtube_dl/extractor/imdb.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 4bb574cf3..02e1e428e 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -4,8 +4,8 @@ import re import json from .common import InfoExtractor -from ..compat import ( - compat_urlparse, +from ..utils import ( + qualities, ) @@ -30,24 +30,33 @@ class ImdbIE(InfoExtractor): descr = self._html_search_regex( r'(?s)(.*?)', webpage, 'description', fatal=False) - available_formats = re.findall( - r'case \'(?P.*?)\' :$\s+url = \'(?P.*?)\'', webpage, - flags=re.MULTILINE) + player_url = 'http://www.imdb.com/video/imdb/vi%s/imdb/single' % video_id + player_page = self._download_webpage( + player_url, video_id, 'Downloading player page') + # the player page contains the info for the default format, we have to + # fetch other pages for the rest of the formats + extra_formats = re.findall(r'href="(?P%s.*?)".*?>(?P.*?)<' % re.escape(player_url), player_page) + format_pages = [ + self._download_webpage( + f_url, video_id, 'Downloading info for %s format' % f_name) + for f_url, f_name in extra_formats] + format_pages.append(player_page) + + quality = qualities(['SD', '480p', '720p']) formats = [] - for f_id, f_path in available_formats: - f_path = f_path.strip() - format_page = self._download_webpage( - compat_urlparse.urljoin(url, f_path), - 'Downloading info for %s format' % f_id) + for format_page in format_pages: json_data = self._search_regex( r']+class="imdb-player-data"[^>]*?>(.*?)', format_page, 'json data', flags=re.DOTALL) info = json.loads(json_data) format_info = info['videoPlayerObject']['video'] + f_id = format_info['ffname'] formats.append({ 'format_id': f_id, 'url': format_info['videoInfoList'][0]['videoUrl'], + 'quality': quality(f_id), }) + self._sort_formats(formats) return { 'id': video_id, -- cgit v1.2.3 From b0f001a6cbd220c8b10c0ce359f17072d6347a8f Mon Sep 17 00:00:00 2001 From: remitamine Date: Mon, 21 Sep 2015 15:52:36 +0100 Subject: [canalc2] fix info extraction --- youtube_dl/extractor/canalc2.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index c4fefefe4..66a9ff093 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -8,34 +8,40 @@ from .common import InfoExtractor class Canalc2IE(InfoExtractor): IE_NAME = 'canalc2.tv' - _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?.*?idVideo=(?P\d+)' + _VALID_URL = r'https?://(www\.)?canalc2\.tv/video/(?P\d+)' _TEST = { - 'url': 'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui', + 'url': 'http://www.canalc2.tv/video/12163', 'md5': '060158428b650f896c542dfbb3d6487f', 'info_dict': { 'id': '12163', 'ext': 'mp4', 'title': 'Terrasses du Numérique' + }, + 'params': { + 'skip_download': True, # Requires rtmpdump } } def _real_extract(self, url): - video_id = re.match(self._VALID_URL, url).group('id') - # We need to set the voir field for getting the file name - url = 'http://www.canalc2.tv/video.asp?idVideo=%s&voir=oui' % video_id + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - file_name = self._search_regex( - r"so\.addVariable\('file','(.*?)'\);", - webpage, 'file name') - video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name + video_url = self._search_regex( + r'jwplayer\("Player"\).setup\({[^}]*file: "([^"]+)"', + webpage, 'video_url') + formats = [{'url': video_url}] + if video_url.startswith('rtmp://'): + rtmp = re.search(r'^(?Prtmp://[^/]+/(?P.+))/(?Pmp4:.+)$', video_url) + formats[0].update({ + 'app': rtmp.group('app'), + 'play_path': rtmp.group('play_path'), + }) title = self._html_search_regex( - r'class="evenement8">(.*?)
', webpage, 'title') + r'(?s)class="[^"]*col_description[^"]*">.*?

(.*?)

', webpage, 'title') return { 'id': video_id, - 'ext': 'mp4', - 'url': video_url, + 'formats': formats, 'title': title, } -- cgit v1.2.3 From 6682049dee5e73b98e99e1359b959240d0920d6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Oct 2015 19:19:43 +0600 Subject: [canalc2] Improve rtmp extraction --- youtube_dl/extractor/canalc2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index 66a9ff093..648af2e18 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -31,10 +31,12 @@ class Canalc2IE(InfoExtractor): webpage, 'video_url') formats = [{'url': video_url}] if video_url.startswith('rtmp://'): - rtmp = re.search(r'^(?Prtmp://[^/]+/(?P.+))/(?Pmp4:.+)$', video_url) + rtmp = re.search(r'^(?Prtmp://[^/]+/(?P.+/))(?Pmp4:.+)$', video_url) formats[0].update({ + 'url': rtmp.group('url'), 'app': rtmp.group('app'), 'play_path': rtmp.group('play_path'), + 'page_url': url, }) title = self._html_search_regex( -- cgit v1.2.3 From ef6c868f23f2fe0d493831e0d4cba71c735bd160 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Oct 2015 19:23:31 +0600 Subject: [canalc2] Improve some regexes --- youtube_dl/extractor/canalc2.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index 648af2e18..d9137e2ef 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -8,7 +8,7 @@ from .common import InfoExtractor class Canalc2IE(InfoExtractor): IE_NAME = 'canalc2.tv' - _VALID_URL = r'https?://(www\.)?canalc2\.tv/video/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?canalc2\.tv/video/(?P\d+)' _TEST = { 'url': 'http://www.canalc2.tv/video/12163', @@ -27,8 +27,8 @@ class Canalc2IE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_url = self._search_regex( - r'jwplayer\("Player"\).setup\({[^}]*file: "([^"]+)"', - webpage, 'video_url') + r'jwplayer\((["\'])Player\1\)\.setup\({[^}]*file\s*:\s*(["\'])(?P.+?)\2', + webpage, 'video_url', group='file') formats = [{'url': video_url}] if video_url.startswith('rtmp://'): rtmp = re.search(r'^(?Prtmp://[^/]+/(?P.+/))(?Pmp4:.+)$', video_url) -- cgit v1.2.3 From 14bddf35fbe8253e283042630e24b134996b2575 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Oct 2015 19:23:52 +0600 Subject: [canalc2] Add ext --- youtube_dl/extractor/canalc2.py | 1 + 1 file changed, 1 insertion(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index d9137e2ef..ba82bb2b7 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -34,6 +34,7 @@ class Canalc2IE(InfoExtractor): rtmp = re.search(r'^(?Prtmp://[^/]+/(?P.+/))(?Pmp4:.+)$', video_url) formats[0].update({ 'url': rtmp.group('url'), + 'ext': 'flv', 'app': rtmp.group('app'), 'play_path': rtmp.group('play_path'), 'page_url': url, -- cgit v1.2.3 From b1bf063503893192637f95e929d1a9147de59a7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Oct 2015 19:27:05 +0600 Subject: [canalc2] Extract duration --- youtube_dl/extractor/canalc2.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index ba82bb2b7..e326b8fbd 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import parse_duration class Canalc2IE(InfoExtractor): @@ -42,9 +43,13 @@ class Canalc2IE(InfoExtractor): title = self._html_search_regex( r'(?s)class="[^"]*col_description[^"]*">.*?

(.*?)

', webpage, 'title') + duration = parse_duration(self._search_regex( + r'id=["\']video_duree["\'][^>]*>([^<]+)', + webpage, 'duration', fatal=False)) return { 'id': video_id, - 'formats': formats, 'title': title, + 'duration': duration, + 'formats': formats, } -- cgit v1.2.3 From 608945d44a7e47fa5115295839c993af545936eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Oct 2015 19:27:22 +0600 Subject: [canalc2] Fix test --- youtube_dl/extractor/canalc2.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index e326b8fbd..f6a1ff381 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -16,8 +16,9 @@ class Canalc2IE(InfoExtractor): 'md5': '060158428b650f896c542dfbb3d6487f', 'info_dict': { 'id': '12163', - 'ext': 'mp4', - 'title': 'Terrasses du Numérique' + 'ext': 'flv', + 'title': 'Terrasses du Numérique', + 'duration': 122, }, 'params': { 'skip_download': True, # Requires rtmpdump -- cgit v1.2.3 From dedd35c6bc33eb88f19b16eeb37498cee076c47a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Oct 2015 19:59:18 +0600 Subject: [viewster] Fix failing m3u8 --- youtube_dl/extractor/viewster.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index 632e57fb4..7cf930d69 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -131,10 +131,11 @@ class ViewsterIE(InfoExtractor): formats.extend(self._extract_f4m_formats( video_url, video_id, f4m_id='hds')) elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( video_url, video_id, 'mp4', m3u8_id='hls', - fatal=False # m3u8 sometimes fail - )) + fatal=False) # m3u8 sometimes fail + if m3u8_formats: + formats.extend(m3u8_formats) else: format_id = media.get('Bitrate') f = { -- cgit v1.2.3 From e36963e0eb57294f156a98c38df891dec41ebaa4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Oct 2015 20:24:33 +0600 Subject: [eagleplatform] Identify hls formats --- youtube_dl/extractor/eagleplatform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index e529b9b96..7bbf617d4 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -87,7 +87,7 @@ class EaglePlatformIE(InfoExtractor): m3u8_url = self._get_video_url(secure_m3u8, video_id, 'Downloading m3u8 JSON') formats = self._extract_m3u8_formats( m3u8_url, video_id, - 'mp4', entry_protocol='m3u8_native') + 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') mp4_url = self._get_video_url( # Secure mp4 URL is constructed according to Player.prototype.mp4 from -- cgit v1.2.3 From 264b23e1a42378d52f8774a07c1d906cd1cff96c Mon Sep 17 00:00:00 2001 From: kennell Date: Sun, 18 Oct 2015 19:56:22 +0200 Subject: adds thumbnail support for ZDF Mediathek extractor --- youtube_dl/extractor/zdf.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 98f15177b..f376025e1 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -70,6 +70,23 @@ def extract_from_xml_url(ie, video_id, xml_url): '_available': is_available, } + def xml_to_thumbnails(fnode): + thumbnails = list() + for node in fnode: + width_x_height = node.attrib['key'] + thumbnail = { + 'url': node.text, + 'width': int(width_x_height.split('x')[0]), + 'height': int(width_x_height.split('x')[1]) + } + thumbnails.append(thumbnail) + return thumbnails + + + thumbnail_nodes = doc.findall('.//teaserimages/teaserimage') + thumbnails = xml_to_thumbnails(thumbnail_nodes) + thumbnail = thumbnails[-1]['url'] + format_nodes = doc.findall('.//formitaeten/formitaet') formats = list(filter( lambda f: f['_available'], @@ -81,6 +98,8 @@ def extract_from_xml_url(ie, video_id, xml_url): 'title': title, 'description': description, 'duration': duration, + 'thumbnail': thumbnail, + 'thumbnails': thumbnails, 'uploader': uploader, 'uploader_id': uploader_id, 'upload_date': upload_date, -- cgit v1.2.3 From d762f86e940ad656e8f7e7b93636292e4cf36de5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 19 Oct 2015 00:11:16 +0600 Subject: [ok] Extend _VALID_URL --- youtube_dl/extractor/odnoklassniki.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index ccc88cfb1..184c7a323 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -13,7 +13,7 @@ from ..utils import ( class OdnoklassnikiIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P[\d-]+)' + _VALID_URL = r'https?://(?:www\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?|web-api/video/moviePlayer)/(?P[\d-]+)' _TESTS = [{ # metadata in JSON 'url': 'http://ok.ru/video/20079905452', @@ -66,6 +66,9 @@ class OdnoklassnikiIE(InfoExtractor): }, { 'url': 'http://www.ok.ru/video/20648036891', 'only_matching': True, + }, { + 'url': 'http://www.ok.ru/videoembed/20648036891', + 'only_matching': True, }] def _real_extract(self, url): -- cgit v1.2.3 From 8cc83d301dd0e8029aff804e362860d36e3d7e7a Mon Sep 17 00:00:00 2001 From: kennell Date: Sun, 18 Oct 2015 20:47:42 +0200 Subject: use int_or_none, check if attrib exists, remove thumbnail --- youtube_dl/extractor/zdf.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index f376025e1..d41c4e712 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -73,19 +73,17 @@ def extract_from_xml_url(ie, video_id, xml_url): def xml_to_thumbnails(fnode): thumbnails = list() for node in fnode: - width_x_height = node.attrib['key'] - thumbnail = { - 'url': node.text, - 'width': int(width_x_height.split('x')[0]), - 'height': int(width_x_height.split('x')[1]) - } + thumbnail = {'url': node.text} + if 'key' in node.attrib: + width_x_height = node.attrib['key'] + thumbnail['width'] = int_or_none(width_x_height.split('x')[0]) + thumbnail['height'] = int_or_none(width_x_height.split('x')[1]) thumbnails.append(thumbnail) return thumbnails thumbnail_nodes = doc.findall('.//teaserimages/teaserimage') thumbnails = xml_to_thumbnails(thumbnail_nodes) - thumbnail = thumbnails[-1]['url'] format_nodes = doc.findall('.//formitaeten/formitaet') formats = list(filter( @@ -98,7 +96,6 @@ def extract_from_xml_url(ie, video_id, xml_url): 'title': title, 'description': description, 'duration': duration, - 'thumbnail': thumbnail, 'thumbnails': thumbnails, 'uploader': uploader, 'uploader_id': uploader_id, -- cgit v1.2.3 From b243340f0ce311443a15a2dfd4356a9504e18c04 Mon Sep 17 00:00:00 2001 From: kennell Date: Sun, 18 Oct 2015 21:07:52 +0200 Subject: check if key attrib matches resolution pattern --- youtube_dl/extractor/zdf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index d41c4e712..ed385450c 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -75,9 +75,9 @@ def extract_from_xml_url(ie, video_id, xml_url): for node in fnode: thumbnail = {'url': node.text} if 'key' in node.attrib: - width_x_height = node.attrib['key'] - thumbnail['width'] = int_or_none(width_x_height.split('x')[0]) - thumbnail['height'] = int_or_none(width_x_height.split('x')[1]) + if re.match("^[0-9]+x[0-9]+$", node.attrib['key']): + thumbnail['width'] = int_or_none(node.attrib['key'].split('x')[0]) + thumbnail['height'] = int_or_none(node.attrib['key'].split('x')[1]) thumbnails.append(thumbnail) return thumbnails -- cgit v1.2.3 From b7cedb16043c60d4032b206a83539acbd39f994f Mon Sep 17 00:00:00 2001 From: kennell Date: Sun, 18 Oct 2015 21:25:26 +0200 Subject: simplify thumbnail dict building --- youtube_dl/extractor/zdf.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index ed385450c..c2b196504 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -75,9 +75,10 @@ def extract_from_xml_url(ie, video_id, xml_url): for node in fnode: thumbnail = {'url': node.text} if 'key' in node.attrib: - if re.match("^[0-9]+x[0-9]+$", node.attrib['key']): - thumbnail['width'] = int_or_none(node.attrib['key'].split('x')[0]) - thumbnail['height'] = int_or_none(node.attrib['key'].split('x')[1]) + m = re.match('^([0-9]+)x([0-9]+)$', node.attrib['key']) + if m: + thumbnail['width'] = int(m.group(1)) + thumbnail['height'] = int(m.group(2)) thumbnails.append(thumbnail) return thumbnails -- cgit v1.2.3 From 7b091c370c0f187545df8b1b1cc990fcf95df108 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 19 Oct 2015 01:48:05 +0600 Subject: [zdf] Modernize and PEP 8 --- youtube_dl/extractor/zdf.py | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index c2b196504..a795f56b3 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -9,6 +9,7 @@ from ..utils import ( int_or_none, unified_strdate, OnDemandPagedList, + xpath_text, ) @@ -19,13 +20,11 @@ def extract_from_xml_url(ie, video_id, xml_url): errnote='Failed to download video info') title = doc.find('.//information/title').text - description = doc.find('.//information/detail').text - duration = int(doc.find('.//details/lengthSec').text) - uploader_node = doc.find('.//details/originChannelTitle') - uploader = None if uploader_node is None else uploader_node.text - uploader_id_node = doc.find('.//details/originChannelId') - uploader_id = None if uploader_id_node is None else uploader_id_node.text - upload_date = unified_strdate(doc.find('.//details/airtime').text) + description = xpath_text(doc, './/information/detail', 'description') + duration = int_or_none(xpath_text(doc, './/details/lengthSec', 'duration')) + uploader = xpath_text(doc, './/details/originChannelTitle', 'uploader') + uploader_id = xpath_text(doc, './/details/originChannelId', 'uploader id') + upload_date = unified_strdate(xpath_text(doc, './/details/airtime', 'upload date')) def xml_to_format(fnode): video_url = fnode.find('url').text @@ -40,15 +39,14 @@ def extract_from_xml_url(ie, video_id, xml_url): ext = format_m.group('container') proto = format_m.group('proto').lower() - quality = fnode.find('./quality').text - abr = int(fnode.find('./audioBitrate').text) // 1000 - vbr_node = fnode.find('./videoBitrate') - vbr = None if vbr_node is None else int(vbr_node.text) // 1000 + quality = xpath_text(fnode, './quality', 'quality') + abr = int_or_none(xpath_text(fnode, './audioBitrate', 'abr'), 1000) + vbr = int_or_none(xpath_text(fnode, './videoBitrate', 'vbr'), 1000) - width_node = fnode.find('./width') - width = None if width_node is None else int_or_none(width_node.text) - height_node = fnode.find('./height') - height = None if height_node is None else int_or_none(height_node.text) + width = int_or_none(xpath_text(fnode, './width', 'width')) + height = int_or_none(xpath_text(fnode, './height', 'height')) + + filesize = int_or_none(xpath_text(fnode, './filesize', 'filesize')) format_note = '' if not format_note: @@ -64,16 +62,21 @@ def extract_from_xml_url(ie, video_id, xml_url): 'vbr': vbr, 'width': width, 'height': height, - 'filesize': int_or_none(fnode.find('./filesize').text), + 'filesize': filesize, 'format_note': format_note, 'protocol': proto, '_available': is_available, } def xml_to_thumbnails(fnode): - thumbnails = list() + thumbnails = [] for node in fnode: - thumbnail = {'url': node.text} + thumbnail_url = node.text + if not thumbnail_url: + continue + thumbnail = { + 'url': thumbnail_url, + } if 'key' in node.attrib: m = re.match('^([0-9]+)x([0-9]+)$', node.attrib['key']) if m: @@ -82,9 +85,7 @@ def extract_from_xml_url(ie, video_id, xml_url): thumbnails.append(thumbnail) return thumbnails - - thumbnail_nodes = doc.findall('.//teaserimages/teaserimage') - thumbnails = xml_to_thumbnails(thumbnail_nodes) + thumbnails = xml_to_thumbnails(doc.findall('.//teaserimages/teaserimage')) format_nodes = doc.findall('.//formitaeten/formitaet') formats = list(filter( -- cgit v1.2.3 From 0be30bafa42dbfa99644a9eb7fefa5cebb70f121 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= Date: Mon, 19 Oct 2015 20:53:27 +0200 Subject: [vidme] Stream URL fallback, better error message for suspended videos --- youtube_dl/extractor/vidme.py | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 382517a4a..393970a12 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -97,6 +97,31 @@ class VidmeIE(InfoExtractor): # nsfw, user-disabled 'url': 'https://vid.me/dzGJ', 'only_matching': True, + }, { + # suspended + 'url': 'https://vid.me/Ox3G', + 'only_matching': True, + }, { + # no formats in the API response + 'url': 'https://vid.me/e5g', + 'info_dict': { + 'id': 'e5g', + 'ext': 'mp4', + 'title': 'e5g', + 'thumbnail': 're:^https?://.*\.jpg', + 'timestamp': 1401480195, + 'upload_date': '20140530', + 'uploader': None, + 'uploader_id': None, + 'age_limit': 0, + 'duration': 483, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -118,7 +143,7 @@ class VidmeIE(InfoExtractor): video = response['video'] - if video.get('state') == 'user-disabled': + if video.get('state') in ('user-disabled', 'suspended'): raise ExtractorError( 'Vidme said: This video has been suspended either due to a copyright claim, ' 'or for violating the terms of use.', @@ -131,6 +156,14 @@ class VidmeIE(InfoExtractor): 'height': int_or_none(f.get('height')), 'preference': 0 if f.get('type', '').endswith('clip') else 1, } for f in video.get('formats', []) if f.get('uri')] + + if not formats and video.get('complete_url'): + formats.append({ + 'url': video.get('complete_url'), + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), + }) + self._sort_formats(formats) title = video['title'] @@ -147,7 +180,7 @@ class VidmeIE(InfoExtractor): return { 'id': video_id, - 'title': title, + 'title': title or video_id, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, -- cgit v1.2.3 From 4bf56141950f3c24000381403417d20095f04460 Mon Sep 17 00:00:00 2001 From: remitamine Date: Tue, 20 Oct 2015 07:43:39 +0100 Subject: [cspan] move get_text_attr to CSpanIE --- youtube_dl/extractor/cspan.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index c74b35fd9..388460a32 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -14,10 +14,6 @@ from ..utils import ( from .senateisvp import SenateISVPIE -def get_text_attr(d, attr): - return d.get(attr, {}).get('#text') - - class CSpanIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P[0-9a-f]+)' IE_DESC = 'C-SPAN' @@ -60,6 +56,9 @@ class CSpanIE(InfoExtractor): } }] + def get_text_attr(self, d, attr): + return d.get(attr, {}).get('#text') + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -79,7 +78,7 @@ class CSpanIE(InfoExtractor): 'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id), video_id)['video'] if data['@status'] != 'Success': - raise ExtractorError('%s said: %s' % (self.IE_NAME, get_text_attr(data, 'error')), expected=True) + raise ExtractorError('%s said: %s' % (self.IE_NAME, self.get_text_attr(data, 'error')), expected=True) doc = self._download_xml( 'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id), @@ -91,17 +90,17 @@ class CSpanIE(InfoExtractor): thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text files = data['files'] - capfile = get_text_attr(data, 'capfile') + capfile = self.get_text_attr(data, 'capfile') entries = [] for partnum, f in enumerate(files): formats = [] for quality in f['qualities']: formats.append({ - 'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')), - 'url': unescapeHTML(get_text_attr(quality, 'file')), - 'height': int_or_none(get_text_attr(quality, 'height')), - 'tbr': int_or_none(get_text_attr(quality, 'bitrate')), + 'format_id': '%s-%sp' % (self.get_text_attr(quality, 'bitrate'), self.get_text_attr(quality, 'height')), + 'url': unescapeHTML(self.get_text_attr(quality, 'file')), + 'height': int_or_none(self.get_text_attr(quality, 'height')), + 'tbr': int_or_none(self.get_text_attr(quality, 'bitrate')), }) self._sort_formats(formats) entries.append({ @@ -112,7 +111,7 @@ class CSpanIE(InfoExtractor): 'formats': formats, 'description': description, 'thumbnail': thumbnail, - 'duration': int_or_none(get_text_attr(f, 'length')), + 'duration': int_or_none(self.get_text_attr(f, 'length')), 'subtitles': { 'en': [{ 'url': capfile, -- cgit v1.2.3 From b6aa99aff8278142fed94e37e500f1cfb62defd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= Date: Tue, 20 Oct 2015 10:30:31 +0200 Subject: [vimeo] Fix error parsing --- youtube_dl/extractor/vimeo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 0f84656c0..bdec79341 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -273,13 +273,13 @@ class VimeoIE(VimeoBaseInfoExtractor): self.report_extraction(video_id) vimeo_config = self._search_regex( - r'vimeo\.config\s*=\s*({.+?});', webpage, + r'vimeo\.config\s*=\s*(?:({.+?})|_extend\([^,]+,\s+({.+?})\));', webpage, 'vimeo config', default=None) if vimeo_config: seed_status = self._parse_json(vimeo_config, video_id).get('seed_status', {}) if seed_status.get('state') == 'failed': raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, seed_status['title']), + '%s said: %s' % (self.IE_NAME, seed_status['title']), expected=True) # Extract the config JSON -- cgit v1.2.3 From 4a8963770e37568c484841338cbb6761cf3cb5c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 20 Oct 2015 20:17:54 +0600 Subject: [vidme] Use original vid.me title template for untitled videos --- youtube_dl/extractor/vidme.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 393970a12..296e00423 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -107,7 +107,7 @@ class VidmeIE(InfoExtractor): 'info_dict': { 'id': 'e5g', 'ext': 'mp4', - 'title': 'e5g', + 'title': 'Video upload (e5g)', 'thumbnail': 're:^https?://.*\.jpg', 'timestamp': 1401480195, 'upload_date': '20140530', @@ -180,7 +180,7 @@ class VidmeIE(InfoExtractor): return { 'id': video_id, - 'title': title or video_id, + 'title': title or 'Video upload (%s)' % video_id, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, -- cgit v1.2.3 From d65889bbc0a6b4a1eafe6a8c0e0e26170dc75586 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 20 Oct 2015 20:18:23 +0600 Subject: [vidme] Update test --- youtube_dl/extractor/vidme.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 296e00423..eb5cde761 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -14,7 +14,7 @@ class VidmeIE(InfoExtractor): _VALID_URL = r'https?://vid\.me/(?:e/)?(?P[\da-zA-Z]+)' _TESTS = [{ 'url': 'https://vid.me/QNB', - 'md5': 'c62f1156138dc3323902188c5b5a8bd6', + 'md5': 'f42d05e7149aeaec5c037b17e5d3dc82', 'info_dict': { 'id': 'QNB', 'ext': 'mp4', -- cgit v1.2.3 From 8bea039b8329074af9a95fe51e7622c8074f6218 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= Date: Tue, 20 Oct 2015 16:38:44 +0200 Subject: [vimeo] New test, fixed one older test --- youtube_dl/extractor/vimeo.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index bdec79341..2437ae1eb 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -133,7 +133,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, - 'description': 'This is "youtube-dl password protected test video" by Jaime Marquínez Ferrándiz on Vimeo, the home for high quality videos and the people who love them.', + 'description': 'This is "youtube-dl password protected test video" by Jaime Marquínez Ferrándiz on Vimeo, the home for high quality videos and the people\u2026', }, 'params': { 'videopassword': 'youtube-dl', @@ -181,6 +181,11 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_id': 'user28849593', }, }, + { + 'url': 'https://vimeo.com/109815029', + 'note': 'Video not completely processed, "failed" seed status', + 'only_matching': True, + }, ] @staticmethod -- cgit v1.2.3 From 4211c83aa4dec0cf9874a6a485665360570e2a89 Mon Sep 17 00:00:00 2001 From: mjdubell Date: Mon, 19 Oct 2015 03:36:07 +0200 Subject: [stitcher] Add extractor Stitcher review updates Removed re import Stitcher review updates --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/stitcher.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 youtube_dl/extractor/stitcher.py (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bd6eb6ae0..eac5e7d5e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -586,6 +586,7 @@ from .spankwire import SpankwireIE from .spiegel import SpiegelIE, SpiegelArticleIE from .spiegeltv import SpiegeltvIE from .spike import SpikeIE +from .stitcher import StitcherIE from .sport5 import Sport5IE from .sportbox import ( SportBoxIE, diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py new file mode 100644 index 000000000..a547debbd --- /dev/null +++ b/youtube_dl/extractor/stitcher.py @@ -0,0 +1,37 @@ +# coding: utf-8 +from __future__ import unicode_literals +from .common import InfoExtractor +from ..utils import int_or_none + + +class StitcherIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?stitcher\.com/podcast/[\/a-z\-]+(?P\d+)' + _TEST = { + 'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true', + 'md5': '391dd4e021e6edeb7b8e68fbf2e9e940', + 'info_dict': { + 'id': '40789481', + 'ext': 'mp3', + 'title': 'Machine Learning Mastery and Cancer Clusters from Talking Machines', + } + } + + def _real_extract(self, url): + audio_id = self._match_id(url) + + webpage = self._download_webpage(url, audio_id) + + title = self._og_search_title(webpage) + url = self._search_regex(r'episodeURL: "(.+?)"', webpage, 'url') + episode_image = self._search_regex(r'episodeImage: "(.+?)"', webpage, 'episode_image', fatal=False) + duration = int_or_none(self._search_regex(r'duration: (\d+?),', webpage, 'duration', fatal=False)) + + return { + 'id': audio_id, + 'url': url, + 'title': title, + 'duration': duration, + 'thumbnail': episode_image, + 'ext': 'mp3', + 'vcodec': 'none', + } -- cgit v1.2.3 From 7308b8cb3df5a2df0a86e8050c83b951004a0aca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 20 Oct 2015 23:12:13 +0600 Subject: [stitcher] Improve (Closes #7162, closes #7228) --- youtube_dl/extractor/stitcher.py | 78 +++++++++++++++++++++++++++++++--------- 1 file changed, 61 insertions(+), 17 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py index a547debbd..971a1c466 100644 --- a/youtube_dl/extractor/stitcher.py +++ b/youtube_dl/extractor/stitcher.py @@ -1,37 +1,81 @@ -# coding: utf-8 from __future__ import unicode_literals + +import re + from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + determine_ext, + int_or_none, + js_to_json, + unescapeHTML, +) class StitcherIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?stitcher\.com/podcast/[\/a-z\-]+(?P\d+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?stitcher\.com/podcast/(?:[^/]+/)+e/(?:(?P[^/#?&]+?)-)?(?P\d+)(?:[/#?&]|$)' + _TESTS = [{ 'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true', 'md5': '391dd4e021e6edeb7b8e68fbf2e9e940', 'info_dict': { 'id': '40789481', 'ext': 'mp3', - 'title': 'Machine Learning Mastery and Cancer Clusters from Talking Machines', - } - } + 'title': 'Machine Learning Mastery and Cancer Clusters', + 'description': 'md5:55163197a44e915a14a1ac3a1de0f2d3', + 'duration': 1604, + 'thumbnail': 're:^https?://.*\.jpg', + }, + }, { + 'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true', + 'info_dict': { + 'id': '40846275', + 'display_id': 'the-rare-hourlong-comedy-plus', + 'ext': 'mp3', + 'title': "The CW's 'Crazy Ex-Girlfriend'", + 'description': 'md5:04f1e2f98eb3f5cbb094cea0f9e19b17', + 'duration': 2235, + 'thumbnail': 're:^https?://.*\.jpg', + }, + 'params': { + 'skip_download': True, + }, + }, { + # escaped title + 'url': 'http://www.stitcher.com/podcast/marketplace-on-stitcher/e/40910226?autoplay=true', + 'only_matching': True, + }, { + 'url': 'http://www.stitcher.com/podcast/panoply/getting-in/e/episode-2a-how-many-extracurriculars-should-i-have-40876278?autoplay=true', + 'only_matching': True, + }] def _real_extract(self, url): - audio_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + audio_id = mobj.group('id') + display_id = mobj.group('display_id') or audio_id - webpage = self._download_webpage(url, audio_id) + webpage = self._download_webpage(url, display_id) - title = self._og_search_title(webpage) - url = self._search_regex(r'episodeURL: "(.+?)"', webpage, 'url') - episode_image = self._search_regex(r'episodeImage: "(.+?)"', webpage, 'episode_image', fatal=False) - duration = int_or_none(self._search_regex(r'duration: (\d+?),', webpage, 'duration', fatal=False)) + episode = self._parse_json( + js_to_json(self._search_regex( + r'(?s)var\s+stitcher\s*=\s*({.+?});\n', webpage, 'episode config')), + display_id)['config']['episode'] + + title = unescapeHTML(episode['title']) + formats = [{ + 'url': episode[episode_key], + 'ext': determine_ext(episode[episode_key]) or 'mp3', + 'vcodec': 'none', + } for episode_key in ('origEpisodeURL', 'episodeURL') if episode.get(episode_key)] + description = self._search_regex( + r'Episode Info:\s*([^<]+)<', webpage, 'description', fatal=False) + duration = int_or_none(episode.get('duration')) + thumbnail = episode.get('episodeImage') return { 'id': audio_id, - 'url': url, + 'display_id': display_id, 'title': title, + 'description': description, 'duration': duration, - 'thumbnail': episode_image, - 'ext': 'mp3', - 'vcodec': 'none', + 'thumbnail': thumbnail, + 'formats': formats, } -- cgit v1.2.3 From cc449417c4b3835c31f89e47a5e08e0f0c42ac5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Oct 2015 20:35:22 +0600 Subject: [vine] Use _search_regex for JSON data (Closes #7254, closes #7255) --- youtube_dl/extractor/vine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index be72f3147..cb2a4b0b5 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -85,8 +85,8 @@ class VineIE(InfoExtractor): webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id) data = self._parse_json( - self._html_search_regex( - r'window\.POST_DATA = { %s: ({.+?}) };\s*' % video_id, + self._search_regex( + r'window\.POST_DATA\s*=\s*{\s*%s\s*:\s*({.+?})\s*};\s*' % video_id, webpage, 'vine data'), video_id) -- cgit v1.2.3 From 44d6dd08b299ccf17eb04901cf09a8d333769783 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Oct 2015 21:35:57 +0600 Subject: [facebook] Fix extraction (Closes #7252) --- youtube_dl/extractor/facebook.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 178a7ca4c..f53c51615 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -14,7 +14,6 @@ from ..compat import ( ) from ..utils import ( ExtractorError, - int_or_none, limit_length, urlencode_postdata, get_element_by_id, @@ -142,16 +141,20 @@ class FacebookIE(InfoExtractor): data = dict(json.loads(m.group(1))) params_raw = compat_urllib_parse_unquote(data['params']) params = json.loads(params_raw) - video_data = params['video_data'][0] formats = [] - for quality in ['sd', 'hd']: - src = video_data.get('%s_src' % quality) - if src is not None: - formats.append({ - 'format_id': quality, - 'url': src, - }) + for format_id, f in params['video_data'].items(): + if not f or not isinstance(f, list): + continue + for quality in ('sd', 'hd'): + for src_type in ('src', 'src_no_ratelimit'): + src = f[0].get('%s_%s' % (quality, src_type)) + if src: + formats.append({ + 'format_id': '%s_%s_%s' % (format_id, quality, src_type), + 'url': src, + 'preference': -10 if format_id == 'progressive' else 0, + }) if not formats: raise ExtractorError('Cannot find video formats') @@ -171,7 +174,5 @@ class FacebookIE(InfoExtractor): 'id': video_id, 'title': video_title, 'formats': formats, - 'duration': int_or_none(video_data.get('video_duration')), - 'thumbnail': video_data.get('thumbnail_src'), 'uploader': uploader, } -- cgit v1.2.3 From 8c3533ba976af15ca9fac8acd68547b195dc4e8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 21 Oct 2015 23:57:23 +0200 Subject: [adultswim] Don't default to the native m3u8 downloader (closes #7243) Some of the streams are encrypted, which is not supported . --- youtube_dl/extractor/adultswim.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 130afe791..3ae618e71 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -183,7 +183,7 @@ class AdultSwimIE(InfoExtractor): media_url = file_el.text if determine_ext(media_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( - media_url, segment_title, 'mp4', 'm3u8_native', preference=0, m3u8_id='hls')) + media_url, segment_title, 'mp4', preference=0, m3u8_id='hls')) else: formats.append({ 'format_id': '%s_%s' % (bitrate, ftype), -- cgit v1.2.3 From 89d5fbf354cf0b49098582a19f76cef67358d375 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 22 Oct 2015 17:47:11 +0800 Subject: [iqiyi] Update key --- youtube_dl/extractor/iqiyi.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 0e53cb154..2df1da3f0 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -205,9 +205,9 @@ class IqiyiIE(InfoExtractor): def get_enc_key(self, swf_url, video_id): # TODO: automatic key extraction - # last update at 2015-10-10 for Zombie::bite - # '7239670519b6ac209a0bee4ef0446a6b24894b8ac2751506e42116212a0d0272e505'[2:66][1::2] - enc_key = '97596c0abee04ab49ba25564161ad225' + # last update at 2015-10-22 for Zombie::bite + # '7223c67061dbea1259d0ceb44f44b6d62288f4f80c972170de5201d2321060270e05'[2:66][0::2] + enc_key = '2c76de15dcb44bd28ff0927d50d31620' return enc_key def _real_extract(self, url): -- cgit v1.2.3 From 7033bc1a5117068c493931cb736d53e68d50f9a1 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 22 Oct 2015 21:12:29 +0800 Subject: [bbc] Fix test_BBC_9 --- youtube_dl/extractor/bbc.py | 1 + 1 file changed, 1 insertion(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 1b3a33e4e..ea67e3f2d 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -625,6 +625,7 @@ class BBCIE(BBCCoUkIE): 'id': 'p02xycnp', 'ext': 'mp4', 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', + 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.', 'duration': 140, }, 'params': { -- cgit v1.2.3 From a65402ef42c42477f78469f0a6c4af1583d97a31 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 22 Oct 2015 21:13:03 +0800 Subject: [bbc.co.uk:article] Add new extractor (#7257) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/bbc.py | 34 ++++++++++++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index eac5e7d5e..6318ac4a2 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -45,6 +45,7 @@ from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE from .bbc import ( BBCCoUkIE, + BBCCoUkArticleIE, BBCIE, ) from .beeg import BeegIE diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index ea67e3f2d..2cdce1eb9 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -20,7 +20,7 @@ from ..compat import compat_HTTPError class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P[\da-z]{8})' + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:programmes/(?!articles/)|iplayer(?:/[^/]+)?/(?:episode/|playlist/))|music/clips[/#])(?P[\da-z]{8})' _MEDIASELECTOR_URLS = [ # Provides HQ HLS streams with even better quality that pc mediaset but fails @@ -652,7 +652,7 @@ class BBCIE(BBCCoUkIE): @classmethod def suitable(cls, url): - return False if BBCCoUkIE.suitable(url) else super(BBCIE, cls).suitable(url) + return False if BBCCoUkIE.suitable(url) or BBCCoUkArticleIE.suitable(url) else super(BBCIE, cls).suitable(url) def _extract_from_media_meta(self, media_meta, video_id): # Direct links to media in media metadata (e.g. @@ -903,3 +903,33 @@ class BBCIE(BBCCoUkIE): }) return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) + + +class BBCCoUkArticleIE(InfoExtractor): + _VALID_URL = 'http://www.bbc.co.uk/programmes/articles/(?P[a-zA-Z0-9]+)' + IE_NAME = 'bbc.co.uk:article' + IE_DESC = 'BBC articles' + + _TEST = { + 'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer', + 'info_dict': { + 'id': '3jNQLTMrPlYGTBn0WV6M2MS', + 'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four', + 'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.', + }, + 'playlist_count': 4, + 'add_ie': ['BBCCoUk'], + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage).strip() + + entries = [self.url_result(programme_url) for programme_url in re.findall( + r']+typeof="Clip"[^>]+resource="([^"]+)"', webpage)] + + return self.playlist_result(entries, playlist_id, title, description) -- cgit v1.2.3 From 769078755318896fa1a6c5c8aba6f76d6aeddf78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 22 Oct 2015 20:34:11 +0600 Subject: [crunchyroll] Improve subtitle regex (Closes #7262) --- youtube_dl/extractor/crunchyroll.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index cecd0c784..f8ce10111 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -245,7 +245,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text def _get_subtitles(self, video_id, webpage): subtitles = {} - for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage): + for sub_id, sub_name in re.findall(r'\bssid=([0-9]+)"[^>]+?\btitle="([^"]+)', webpage): sub_page = self._download_webpage( 'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id=' + sub_id, video_id, note='Downloading subtitles for ' + sub_name) -- cgit v1.2.3 From 9170ca5b16f3420892ff06bbe5cccf1679eb75e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 23 Oct 2015 14:16:08 +0200 Subject: [youtube:channel] Fix test --- youtube_dl/extractor/youtube.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 08e821362..bae1b1117 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1644,7 +1644,8 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', 'playlist_mincount': 91, 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'Uploads from lex will', } }] -- cgit v1.2.3 From 5c43afd40f8ba101e0cf90b8fcb5713b378a62c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 23 Oct 2015 14:23:45 +0200 Subject: [youtube:channel] Support age restricted channels (fixes #7277) --- youtube_dl/extractor/youtube.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index bae1b1117..d7eda7aa7 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1647,6 +1647,15 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w', 'title': 'Uploads from lex will', } + }, { + 'note': 'Age restricted channel', + # from https://www.youtube.com/user/DeusExOfficial + 'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w', + 'playlist_mincount': 64, + 'info_dict': { + 'id': 'UUs0ifCMCm1icqRbqhUINa0w', + 'title': 'Uploads from Deus Ex', + }, }] def _real_extract(self, url): @@ -1667,7 +1676,7 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): 'channelId', channel_page, 'channel id', default=None) if not channel_playlist_id: channel_playlist_id = self._search_regex( - r'data-channel-external-id="([^"]+)"', + r'data-(?:channel-external-|yt)id="([^"]+)"', channel_page, 'channel id', default=None) if channel_playlist_id and channel_playlist_id.startswith('UC'): playlist_id = 'UU' + channel_playlist_id[2:] -- cgit v1.2.3 From c93153852f342ef26005b37649d58fa944c53fe3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 24 Oct 2015 12:10:53 +0200 Subject: [mitele] Don't encode the URL query (closes #7280) This seems to produce sporadic errors when trying to access the URL, because on python 3.x when you do '%s' % b'somedata' you get "b'somedata'". --- youtube_dl/extractor/mitele.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 54993e2c9..ccb5c1467 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -56,7 +56,7 @@ class MiTeleIE(InfoExtractor): 'sta': '0', } media = self._download_json( - '%s/?%s' % (gat, compat_urllib_parse.urlencode(encode_dict(token_data)).encode('utf-8')), + '%s/?%s' % (gat, compat_urllib_parse.urlencode(encode_dict(token_data))), display_id, 'Downloading %s JSON' % location['loc']) file_ = media.get('file') if not file_: -- cgit v1.2.3 From 6856139705aea86ab1f950c08e605dd47f839be0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 24 Oct 2015 12:13:26 +0200 Subject: [mitele] Fix test checksum --- youtube_dl/extractor/mitele.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index ccb5c1467..3142fcde2 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -15,7 +15,7 @@ class MiTeleIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', - 'md5': 'ace7635b2a0b286aaa37d3ff192d2a8a', + 'md5': '757b0b66cbd7e0a97226d7d3156cb3e9', 'info_dict': { 'id': '0NF1jJnxS1Wu3pHrmvFyw2', 'display_id': 'programa-144', -- cgit v1.2.3 From 0198807ef95338bec69cfdcd67249c007e4d4141 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20D=C3=B6pmann?= Date: Sat, 24 Oct 2015 11:35:41 +0200 Subject: [spiegeltv] Fix Accept-Encoding issue (server chokes on gzip) --- youtube_dl/extractor/spiegeltv.py | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py index 27f4033c5..a85305281 100644 --- a/youtube_dl/extractor/spiegeltv.py +++ b/youtube_dl/extractor/spiegeltv.py @@ -83,6 +83,10 @@ class SpiegeltvIE(InfoExtractor): preference=1, # Prefer hls since it allows to workaround georestriction m3u8_id='hls', fatal=False) if m3u8_formats is not False: + for m3u8_format in m3u8_formats: + m3u8_format['http_headers'] = { + 'Accept-Encoding': 'deflate', # gzip causes trouble on the server side + } formats.extend(m3u8_formats) else: formats.append({ -- cgit v1.2.3 From 50f01302d347738647262f9442bc4f5d06f013c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 24 Oct 2015 16:24:08 +0600 Subject: [spiegeltv] Do not extract m3u8 formats since it's already a format --- youtube_dl/extractor/spiegeltv.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py index a85305281..d976bf33c 100644 --- a/youtube_dl/extractor/spiegeltv.py +++ b/youtube_dl/extractor/spiegeltv.py @@ -77,17 +77,16 @@ class SpiegeltvIE(InfoExtractor): 'rtmp_live': True, }) elif determine_ext(endpoint) == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - endpoint.replace('[video]', play_path), - video_id, 'm4v', - preference=1, # Prefer hls since it allows to workaround georestriction - m3u8_id='hls', fatal=False) - if m3u8_formats is not False: - for m3u8_format in m3u8_formats: - m3u8_format['http_headers'] = { - 'Accept-Encoding': 'deflate', # gzip causes trouble on the server side - } - formats.extend(m3u8_formats) + formats.append({ + 'url': endpoint.replace('[video]', play_path), + 'ext': 'm4v', + 'format_id': 'hls', # Prefer hls since it allows to workaround georestriction + 'protocol': 'm3u8', + 'preference': 1, + 'http_headers': { + 'Accept-Encoding': 'deflate', # gzip causes trouble on the server side + }, + }) else: formats.append({ 'url': endpoint, -- cgit v1.2.3 From 943a1e24b896a64c869bbd302f32fe5bd2afec96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 24 Oct 2015 16:25:04 +0600 Subject: [extractor/common] Use more generic URLError in _is_valid_url --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 6169fbbeb..720033ddf 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -842,7 +842,7 @@ class InfoExtractor(object): self._request_webpage(url, video_id, 'Checking %s URL' % item) return True except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): + if isinstance(e.cause, compat_urllib_error.URLError): self.to_screen( '%s: %s URL is invalid, skipping' % (video_id, item)) return False -- cgit v1.2.3 From ac21e7196856d7b689f74ed3f9953cbcbe90bee5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 24 Oct 2015 16:25:44 +0600 Subject: [spiegeltv] Check formats --- youtube_dl/extractor/spiegeltv.py | 1 + 1 file changed, 1 insertion(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py index d976bf33c..0981e325a 100644 --- a/youtube_dl/extractor/spiegeltv.py +++ b/youtube_dl/extractor/spiegeltv.py @@ -91,6 +91,7 @@ class SpiegeltvIE(InfoExtractor): formats.append({ 'url': endpoint, }) + self._check_formats(formats, video_id) thumbnails = [] for image in media_json['images']: -- cgit v1.2.3 From 865d1fbafc671815904b2ba3da76544d66c593c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 24 Oct 2015 12:39:23 +0200 Subject: [extractor/common] Remove unused import --- youtube_dl/extractor/common.py | 1 - 1 file changed, 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 720033ddf..04b699972 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -16,7 +16,6 @@ from ..compat import ( compat_cookiejar, compat_cookies, compat_getpass, - compat_HTTPError, compat_http_client, compat_urllib_error, compat_urllib_parse, -- cgit v1.2.3 From 36d72810374ef2dba0232706a461d6dc4aa292d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 24 Oct 2015 12:41:41 +0200 Subject: [spiegeltv] Fix style issue Use two spaces before comment. --- youtube_dl/extractor/spiegeltv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py index 0981e325a..034bd47ff 100644 --- a/youtube_dl/extractor/spiegeltv.py +++ b/youtube_dl/extractor/spiegeltv.py @@ -84,7 +84,7 @@ class SpiegeltvIE(InfoExtractor): 'protocol': 'm3u8', 'preference': 1, 'http_headers': { - 'Accept-Encoding': 'deflate', # gzip causes trouble on the server side + 'Accept-Encoding': 'deflate', # gzip causes trouble on the server side }, }) else: -- cgit v1.2.3 From 7687b354c59efea076fae762206c00de273fbe04 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 23 Oct 2015 07:09:41 +0100 Subject: [abc] add support for audio extraction --- youtube_dl/extractor/abc.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index f9a389f67..ae80dc529 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -36,6 +36,15 @@ class ABCIE(InfoExtractor): 'title': 'Marriage Equality: Warren Entsch introduces same sex marriage bill', }, 'add_ie': ['Youtube'], + }, { + 'url': 'http://www.abc.net.au/news/2015-10-23/nab-lifts-interest-rates-following-westpac-and-cba/6880080', + 'md5': 'b96eee7c9edf4fc5a358a0252881cc1f', + 'info_dict': { + 'id': '6880080', + 'ext': 'mp3', + 'title': 'NAB lifts interest rates, following Westpac and CBA', + 'description': 'md5:f13d8edc81e462fce4a0437c7dc04728', + }, }] def _real_extract(self, url): @@ -43,7 +52,7 @@ class ABCIE(InfoExtractor): webpage = self._download_webpage(url, video_id) mobj = re.search( - r'inline(?PVideo|YouTube)Data\.push\((?P[^)]+)\);', + r'inline(?PVideo|Audio|YouTube)Data\.push\((?P[^)]+)\);', webpage) if mobj is None: raise ExtractorError('Unable to extract video urls') @@ -60,11 +69,13 @@ class ABCIE(InfoExtractor): formats = [{ 'url': url_info['url'], + 'vcodec': url_info.get('codec') if mobj.group('type') == 'Video' else 'none', 'width': int_or_none(url_info.get('width')), 'height': int_or_none(url_info.get('height')), 'tbr': int_or_none(url_info.get('bitrate')), 'filesize': int_or_none(url_info.get('filesize')), } for url_info in urls_info] + self._sort_formats(formats) return { -- cgit v1.2.3 From d97da29da26c920ae31fde94bba5e3b4e1f5a36a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 24 Oct 2015 12:31:42 +0200 Subject: [abc] Support more URL formats --- youtube_dl/extractor/abc.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index ae80dc529..c0e5d1abf 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -12,7 +12,7 @@ from ..utils import ( class ABCIE(InfoExtractor): IE_NAME = 'abc.net.au' - _VALID_URL = r'http://www\.abc\.net\.au/news/[^/]+/[^/]+/(?P\d+)' + _VALID_URL = r'http://www\.abc\.net\.au/news/(?:[^/]+/){1,2}(?P\d+)' _TESTS = [{ 'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334', @@ -45,6 +45,9 @@ class ABCIE(InfoExtractor): 'title': 'NAB lifts interest rates, following Westpac and CBA', 'description': 'md5:f13d8edc81e462fce4a0437c7dc04728', }, + }, { + 'url': 'http://www.abc.net.au/news/2015-10-19/6866214', + 'only_matching': True, }] def _real_extract(self, url): -- cgit v1.2.3 From 50b936936dcf53b448557c35a90e4678239aaf81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 24 Oct 2015 14:22:47 +0200 Subject: [tutv] Fix test --- youtube_dl/extractor/tutv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py index fad720b68..822372ea1 100644 --- a/youtube_dl/extractor/tutv.py +++ b/youtube_dl/extractor/tutv.py @@ -10,10 +10,10 @@ class TutvIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tu\.tv/videos/(?P[^/?]+)' _TEST = { 'url': 'http://tu.tv/videos/robots-futbolistas', - 'md5': '627c7c124ac2a9b5ab6addb94e0e65f7', + 'md5': '0cd9e28ad270488911b0d2a72323395d', 'info_dict': { 'id': '2973058', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Robots futbolistas', }, } -- cgit v1.2.3 From 3711304510d3be6a5f9b2b18084aad8687e78001 Mon Sep 17 00:00:00 2001 From: remitamine Date: Tue, 8 Sep 2015 19:35:41 +0100 Subject: [extractor/common] get the redirected m3u8_url in _extract_m3u8_formats --- youtube_dl/extractor/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 04b699972..10c0d5d1f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -943,13 +943,14 @@ class InfoExtractor(object): if re.match(r'^https?://', u) else compat_urlparse.urljoin(m3u8_url, u)) - m3u8_doc = self._download_webpage( + m3u8_doc, urlh = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', errnote=errnote or 'Failed to download m3u8 information', fatal=fatal) if m3u8_doc is False: return m3u8_doc + m3u8_url = urlh.geturl() last_info = None last_media = None kv_rex = re.compile( -- cgit v1.2.3 From ec29539e06e156a2bb589af774a80d156b2c2f76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 24 Oct 2015 21:03:45 +0600 Subject: [senateisvp] Pass extra param as query segment without `?` --- youtube_dl/extractor/senateisvp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py index 9c53704ea..474ebb49b 100644 --- a/youtube_dl/extractor/senateisvp.py +++ b/youtube_dl/extractor/senateisvp.py @@ -121,9 +121,9 @@ class SenateISVPIE(InfoExtractor): 'url': compat_urlparse.urljoin(domain, filename) + '?v=3.1.0&fp=&r=&g=', }] else: - hdcore_sign = '?hdcore=3.1.0' + hdcore_sign = 'hdcore=3.1.0' url_params = (domain, video_id, stream_num) - f4m_url = '%s/z/%s_1@%s/manifest.f4m' % url_params + hdcore_sign + f4m_url = '%s/z/%s_1@%s/manifest.f4m?' % url_params + hdcore_sign m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'): # URLs without the extra param induce an 404 error -- cgit v1.2.3 From 8e82ecfe8f0dc2b9dfb6a2cda68e7b5f7926b0e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 24 Oct 2015 21:04:09 +0600 Subject: [dailymotion] Extract f4m formats --- youtube_dl/extractor/dailymotion.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 9cd9ff17d..bc7823931 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -141,9 +141,17 @@ class DailymotionIE(DailymotionBaseInfoExtractor): type_ = media.get('type') if type_ == 'application/vnd.lumberjack.manifest': continue - if type_ == 'application/x-mpegURL' or determine_ext(media_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - media_url, video_id, 'mp4', m3u8_id='hls')) + ext = determine_ext(media_url) + if type_ == 'application/x-mpegURL' or ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + media_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + elif type_ == 'application/f4m' or ext == 'f4m': + f4m_formats = self._extract_f4m_formats( + media_url, video_id, preference=-1, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) else: f = { 'url': media_url, -- cgit v1.2.3 From 7e0dc61334cfe9c79e92fd79d9996d191990a80a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 25 Oct 2015 20:48:29 +0600 Subject: [njoy] Add support for URLs without display id --- youtube_dl/extractor/ndr.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index e3cc6fde8..ba06d8a98 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -14,7 +14,8 @@ from ..utils import ( class NDRBaseIE(InfoExtractor): def _real_extract(self, url): - display_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + display_id = next(group for group in mobj.groups() if group) webpage = self._download_webpage(url, display_id) return self._extract_embed(webpage, display_id) @@ -101,7 +102,7 @@ class NDRIE(NDRBaseIE): class NJoyIE(NDRBaseIE): IE_NAME = 'njoy' IE_DESC = 'N-JOY' - _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)+(?P[^/?#]+),[\da-z]+\.html' + _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)+(?:(?P[^/?#]+),)?(?P[\da-z]+)\.html' _TESTS = [{ # httpVideo, same content id 'url': 'http://www.n-joy.de/entertainment/comedy/comedy_contest/Benaissa-beim-NDR-Comedy-Contest,comedycontest2480.html', @@ -136,6 +137,9 @@ class NJoyIE(NDRBaseIE): 'params': { 'skip_download': True, }, + }, { + 'url': 'http://www.n-joy.de/radio/webradio/morningshow209.html', + 'only_matching': True, }] def _extract_embed(self, webpage, display_id): -- cgit v1.2.3 From e572a1010b81f5864e610808c86848db4d6ed8e4 Mon Sep 17 00:00:00 2001 From: Erik Date: Sat, 17 Oct 2015 19:22:47 +0200 Subject: [youporn] Fix extraction [youporn] Added description and thumbnail [youporn] Added uploader and date [youporn] Removed Try and Except lines [youporn] Fixed date, fatal, formats and /s* [youporn] Undid removing comment about video url components & Undid and fixed removal of encrypted URL detection [youporn] Fix: Add encrypted link to links array only if not already in it [youporn] Fix: Add encrypted link to links array only if not already in it [youporn] Fix: cleanup --- youtube_dl/extractor/youporn.py | 55 +++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 32 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 4ba7c36db..546985f3a 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import json import re import sys +import datetime from .common import InfoExtractor from ..compat import ( @@ -27,10 +28,11 @@ class YouPornIE(InfoExtractor): 'info_dict': { 'id': '505835', 'ext': 'mp4', - 'upload_date': '20101221', - 'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?', - 'uploader': 'Ask Dan And Jennifer', 'title': 'Sex Ed: Is It Safe To Masturbate Daily?', + 'description': 'Watch Sex Ed: Is It Safe To Masturbate Daily? at YouPorn.com - YouPorn is the biggest free porn tube site on the net!', + 'uploader': 'Ask Dan And Jennifer', + 'thumbnail': 'http://cdn5.image.youporn.phncdn.com/201012/17/505835/640x480/8/sex-ed-is-it-safe-to-masturbate-daily-8.jpg', + 'date': '20101221', 'age_limit': 18, } } @@ -45,45 +47,34 @@ class YouPornIE(InfoExtractor): webpage = self._download_webpage(req, video_id) age_limit = self._rta_search(webpage) - # Get JSON parameters - json_params = self._search_regex( - [r'videoJa?son\s*=\s*({.+})', - r'var\s+currentVideo\s*=\s*new\s+Video\((.+?)\)[,;]'], - webpage, 'JSON parameters') - try: - params = json.loads(json_params) - except ValueError: - raise ExtractorError('Invalid JSON') - self.report_extraction(video_id) - try: - video_title = params['title'] - upload_date = unified_strdate(params['release_date_f']) - video_description = params['description'] - video_uploader = params['submitted_by'] - thumbnail = params['thumbnails'][0]['image'] - except KeyError: - raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1]) + video_title = self._html_search_regex(r'page_params.video_title = \'(.+?)\';', webpage, 'video URL', fatal=False) + video_description = self._html_search_meta('description', webpage, 'video DESC', fatal=False) + video_thumbnail = self._html_search_regex(r'page_params.imageurl\t=\t"(.+?)";', webpage, 'video THUMB', fatal=False) + video_uploader = self._html_search_regex(r"
By:
\n]+\">(.+?)", webpage, 'video UPLOADER', fatal=False) + video_date = self._html_search_regex(r"
\n (.+?)\n
", webpage, 'video DATE', fatal=False) + video_date = datetime.datetime.strptime(video_date, '%B %d, %Y').strftime('%Y%m%d') # Get all of the links from the page - DOWNLOAD_LIST_RE = r'(?s)
    (?P.*?)
' - download_list_html = self._search_regex(DOWNLOAD_LIST_RE, - webpage, 'download list').strip() - LINK_RE = r'' + DOWNLOAD_LIST_RE = r'(?s)sources: {\n(?P.*?)}' + download_list_html = self._search_regex(DOWNLOAD_LIST_RE, webpage, 'download list').strip() + LINK_RE = r': \'(.+?)\',' links = re.findall(LINK_RE, download_list_html) # Get all encrypted links - encrypted_links = re.findall(r'var encryptedQuality[0-9]{3}URL = \'([a-zA-Z0-9+/]+={0,2})\';', webpage) + encrypted_links = re.findall(r'page_params.encryptedQuality[0-9]{3,4}URL\s=\s\'([a-zA-Z0-9+/]+={0,2})\';', webpage) for encrypted_link in encrypted_links: link = aes_decrypt_text(encrypted_link, video_title, 32).decode('utf-8') - links.append(link) + # it's unclear if encryted links still differ from normal ones, so only include in links array if it's unique + if link not in links: + links.append(link) formats = [] for link in links: # A link looks like this: - # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0 + # http://cdn2b.public.youporn.phncdn.com/201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4?rs=200&ri=2500&s=1445599900&e=1445773500&h=5345d19ce9944ec52eb167abf24af248 # A path looks like this: - # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4 + # 201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 video_url = unescapeHTML(link) path = compat_urllib_parse_urlparse(video_url).path format_parts = path.split('/')[4].split('_')[:2] @@ -111,11 +102,11 @@ class YouPornIE(InfoExtractor): return { 'id': video_id, - 'uploader': video_uploader, - 'upload_date': upload_date, 'title': video_title, - 'thumbnail': thumbnail, 'description': video_description, + 'thumbnail': video_thumbnail, + 'uploader': video_uploader, + 'date': video_date, 'age_limit': age_limit, 'formats': formats, } -- cgit v1.2.3 From 589c33dadeec18a9d50713a4a200e3e2d9e297bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 25 Oct 2015 22:56:35 +0600 Subject: [youporn] Improve and make more robust (Closes #6888, closes #7214) --- youtube_dl/extractor/youporn.py | 177 +++++++++++++++++++++++----------------- 1 file changed, 104 insertions(+), 73 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 546985f3a..d10ebb0bf 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -1,112 +1,143 @@ from __future__ import unicode_literals - -import json import re -import sys -import datetime from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlparse, - compat_urllib_request, -) +from ..compat import compat_urllib_request from ..utils import ( - ExtractorError, + int_or_none, + str_to_int, unescapeHTML, unified_strdate, ) -from ..aes import ( - aes_decrypt_text -) +from ..aes import aes_decrypt_text class YouPornIE(InfoExtractor): - _VALID_URL = r'^(?Phttps?://)(?:www\.)?(?Pyouporn\.com/watch/(?P[0-9]+)/(?P[^/]+))' + _VALID_URL = r'https?://(?:www\.)?youporn\.com/watch/(?P<id>\d+)/(?P<display_id>[^/?#&]+)' _TEST = { 'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', 'info_dict': { 'id': '505835', + 'display_id': 'sex-ed-is-it-safe-to-masturbate-daily', 'ext': 'mp4', 'title': 'Sex Ed: Is It Safe To Masturbate Daily?', - 'description': 'Watch Sex Ed: Is It Safe To Masturbate Daily? at YouPorn.com - YouPorn is the biggest free porn tube site on the net!', + 'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?', + 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'Ask Dan And Jennifer', - 'thumbnail': 'http://cdn5.image.youporn.phncdn.com/201012/17/505835/640x480/8/sex-ed-is-it-safe-to-masturbate-daily-8.jpg', - 'date': '20101221', + 'upload_date': '20101221', + 'average_rating': int, + 'view_count': int, + 'categories': list, + 'tags': list, 'age_limit': 18, } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('videoid') - url = mobj.group('proto') + 'www.' + mobj.group('url') + video_id = mobj.group('id') + display_id = mobj.group('display_id') - req = compat_urllib_request.Request(url) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) - age_limit = self._rta_search(webpage) + request = compat_urllib_request.Request(url) + request.add_header('Cookie', 'age_verified=1') + webpage = self._download_webpage(request, display_id) + + title = self._search_regex( + [r'(?:video_titles|videoTitle)\s*[:=]\s*(["\'])(?P<title>.+?)\1', + r'<h1[^>]+class=["\']heading\d?["\'][^>]*>([^<])<'], + webpage, 'title', group='title') - self.report_extraction(video_id) - video_title = self._html_search_regex(r'page_params.video_title = \'(.+?)\';', webpage, 'video URL', fatal=False) - video_description = self._html_search_meta('description', webpage, 'video DESC', fatal=False) - video_thumbnail = self._html_search_regex(r'page_params.imageurl\t=\t"(.+?)";', webpage, 'video THUMB', fatal=False) - video_uploader = self._html_search_regex(r"<div class=\'videoInfoBy\'>By:</div>\n<a href=\"[^>]+\">(.+?)</a>", webpage, 'video UPLOADER', fatal=False) - video_date = self._html_search_regex(r"<div class='videoInfoTime'>\n<i class='icon-clock'></i> (.+?)\n</div>", webpage, 'video DATE', fatal=False) - video_date = datetime.datetime.strptime(video_date, '%B %d, %Y').strftime('%Y%m%d') - - # Get all of the links from the page - DOWNLOAD_LIST_RE = r'(?s)sources: {\n(?P<download_list>.*?)}' - download_list_html = self._search_regex(DOWNLOAD_LIST_RE, webpage, 'download list').strip() - LINK_RE = r': \'(.+?)\',' - links = re.findall(LINK_RE, download_list_html) - - # Get all encrypted links - encrypted_links = re.findall(r'page_params.encryptedQuality[0-9]{3,4}URL\s=\s\'([a-zA-Z0-9+/]+={0,2})\';', webpage) - for encrypted_link in encrypted_links: - link = aes_decrypt_text(encrypted_link, video_title, 32).decode('utf-8') - # it's unclear if encryted links still differ from normal ones, so only include in links array if it's unique - if link not in links: + links = [] + + sources = self._search_regex( + r'sources\s*:\s*({.+?})', webpage, 'sources', default=None) + if sources: + for _, link in re.findall(r'[^:]+\s*:\s*(["\'])(http.+?)\1', sources): links.append(link) + # Fallback #1 + for _, link in re.findall( + r'(?:videoUrl|videoSrc|videoIpadUrl|html5PlayerSrc)\s*[:=]\s*(["\'])(http.+?)\1', webpage): + links.append(link) + + # Fallback #2, this also contains extra low quality 180p format + for _, link in re.findall(r'<a[^>]+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage): + links.append(link) + + # Fallback #3, encrypted links + for _, encrypted_link in re.findall( + r'encryptedQuality\d{3,4}URL\s*=\s*(["\'])([\da-zA-Z+/=]+)\1', webpage): + links.append(aes_decrypt_text(encrypted_link, title, 32).decode('utf-8')) + formats = [] - for link in links: - # A link looks like this: - # http://cdn2b.public.youporn.phncdn.com/201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4?rs=200&ri=2500&s=1445599900&e=1445773500&h=5345d19ce9944ec52eb167abf24af248 - # A path looks like this: - # 201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 - video_url = unescapeHTML(link) - path = compat_urllib_parse_urlparse(video_url).path - format_parts = path.split('/')[4].split('_')[:2] - - dn = compat_urllib_parse_urlparse(video_url).netloc.partition('.')[0] - - resolution = format_parts[0] - height = int(resolution[:-len('p')]) - bitrate = int(format_parts[1][:-len('k')]) - format = '-'.join(format_parts) + '-' + dn - - formats.append({ + for video_url in set(unescapeHTML(link) for link in links): + f = { 'url': video_url, - 'format': format, - 'format_id': format, - 'height': height, - 'tbr': bitrate, - 'resolution': resolution, - }) - + } + # Video URL's path looks like this: + # /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 + # We will benefit from it by extracting some metadata + mobj = re.search(r'/(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+/', video_url) + if mobj: + height = int(mobj.group('height')) + bitrate = int(mobj.group('bitrate')) + f.update({ + 'format_id': '%dp-%dk' % (height, bitrate), + 'height': height, + 'tbr': bitrate, + }) + formats.append(f) self._sort_formats(formats) - if not formats: - raise ExtractorError('ERROR: no known formats available for video') + description = self._html_search_regex( + r'(?s)<div[^>]+class=["\']video-description["\'][^>]*>(.+?)</div>', + webpage, 'description', fatal=False) + thumbnail = self._search_regex( + r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1', + webpage, 'thumbnail', fatal=False, group='thumbnail') + + uploader = self._search_regex( + r'<div[^>]+class=["\']videoInfoBy["\'][^>]*>\s*By:\s*</div>\s*<a[^>]+href="[^"]*">([^<]+)</a>', + webpage, 'uploader', fatal=False) + upload_date = unified_strdate(self._html_search_regex( + r'(?s)<div[^>]+class=["\']videoInfoTime["\'][^>]*>(.+?)</div>', + webpage, 'upload date', fatal=False)) + + age_limit = self._rta_search(webpage) + + average_rating = int_or_none(self._search_regex( + r'<div[^>]+class=["\']videoInfoRating["\'][^>]*>\s*<div[^>]+class=["\']videoRatingPercentage["\'][^>]*>(\d+)%</div>', + webpage, 'average rating', fatal=False)) + + view_count = str_to_int(self._search_regex( + r'(?s)<div[^>]+class=["\']videoInfoViews["\'][^>]*>.*?([\d,.]+)\s*</div>', + webpage, 'view count', fatal=False)) + + def extract_tag_box(title): + tag_box = self._search_regex( + (r'<div[^>]+class=["\']tagBoxTitle["\'][^>]*>\s*%s\b.*?</div>\s*' + '<div[^>]+class=["\']tagBoxContent["\']>(.+?)</div>') % re.escape(title), + webpage, '%s tag box' % title, default=None) + if not tag_box: + return [] + return re.findall(r'<a[^>]+href=[^>]+>([^<]+)', tag_box) + + categories = extract_tag_box('Category') + tags = extract_tag_box('Tags') return { 'id': video_id, - 'title': video_title, - 'description': video_description, - 'thumbnail': video_thumbnail, - 'uploader': video_uploader, - 'date': video_date, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'upload_date': upload_date, + 'average_rating': average_rating, + 'view_count': view_count, + 'categories': categories, + 'tags': tags, 'age_limit': age_limit, 'formats': formats, } -- cgit v1.2.3 From feb7711cf58863a19cae770a878d22a8424e3c61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 25 Oct 2015 23:01:12 +0600 Subject: [youporn] Make description optional Some videos does not contain any description --- youtube_dl/extractor/youporn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index d10ebb0bf..db5d049d2 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -92,7 +92,7 @@ class YouPornIE(InfoExtractor): description = self._html_search_regex( r'(?s)<div[^>]+class=["\']video-description["\'][^>]*>(.+?)</div>', - webpage, 'description', fatal=False) + webpage, 'description', default=None) thumbnail = self._search_regex( r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1', webpage, 'thumbnail', fatal=False, group='thumbnail') -- cgit v1.2.3 From 4f13f8f798be06bc2b3c0c42818bb0785e4cde64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 25 Oct 2015 23:12:12 +0600 Subject: [youporn] Improve uploader extraction --- youtube_dl/extractor/youporn.py | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index db5d049d2..b39fbb5fc 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -15,8 +15,9 @@ from ..aes import aes_decrypt_text class YouPornIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?youporn\.com/watch/(?P<id>\d+)/(?P<display_id>[^/?#&]+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', + 'md5': '71ec5fcfddacf80f495efa8b6a8d9a89', 'info_dict': { 'id': '505835', 'display_id': 'sex-ed-is-it-safe-to-masturbate-daily', @@ -31,8 +32,29 @@ class YouPornIE(InfoExtractor): 'categories': list, 'tags': list, 'age_limit': 18, - } - } + }, + }, { + # Anonymous User uploader + 'url': 'http://www.youporn.com/watch/561726/big-tits-awesome-brunette-on-amazing-webcam-show/?from=related3&al=2&from_id=561726&pos=4', + 'info_dict': { + 'id': '561726', + 'display_id': 'big-tits-awesome-brunette-on-amazing-webcam-show', + 'ext': 'mp4', + 'title': 'Big Tits Awesome Brunette On amazing webcam show', + 'description': 'http://sweetlivegirls.com Big Tits Awesome Brunette On amazing webcam show.mp4', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Anonymous User', + 'upload_date': '20111125', + 'average_rating': int, + 'view_count': int, + 'categories': list, + 'tags': list, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -97,8 +119,8 @@ class YouPornIE(InfoExtractor): r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1', webpage, 'thumbnail', fatal=False, group='thumbnail') - uploader = self._search_regex( - r'<div[^>]+class=["\']videoInfoBy["\'][^>]*>\s*By:\s*</div>\s*<a[^>]+href="[^"]*">([^<]+)</a>', + uploader = self._html_search_regex( + r'(?s)<div[^>]+class=["\']videoInfoBy["\'][^>]*>\s*By:\s*</div>(.+?)</(?:a|div)>', webpage, 'uploader', fatal=False) upload_date = unified_strdate(self._html_search_regex( r'(?s)<div[^>]+class=["\']videoInfoTime["\'][^>]*>(.+?)</div>', -- cgit v1.2.3 From 7b3a19e5339344037a872574780c39f334cea90e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 25 Oct 2015 23:17:23 +0600 Subject: [stitcher] Remove origEpisodeURL It's always 404 --- youtube_dl/extractor/stitcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py index 971a1c466..d5c852f52 100644 --- a/youtube_dl/extractor/stitcher.py +++ b/youtube_dl/extractor/stitcher.py @@ -64,7 +64,7 @@ class StitcherIE(InfoExtractor): 'url': episode[episode_key], 'ext': determine_ext(episode[episode_key]) or 'mp3', 'vcodec': 'none', - } for episode_key in ('origEpisodeURL', 'episodeURL') if episode.get(episode_key)] + } for episode_key in ('episodeURL',) if episode.get(episode_key)] description = self._search_regex( r'Episode Info:\s*</span>([^<]+)<', webpage, 'description', fatal=False) duration = int_or_none(episode.get('duration')) -- cgit v1.2.3 From 755ff8d22ca5607400c1232b194e20a004e4e9eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 25 Oct 2015 23:41:10 +0600 Subject: [youporn] Extract comment count --- youtube_dl/extractor/youporn.py | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index b39fbb5fc..9bf8d1eeb 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -29,6 +29,7 @@ class YouPornIE(InfoExtractor): 'upload_date': '20101221', 'average_rating': int, 'view_count': int, + 'comment_count': int, 'categories': list, 'tags': list, 'age_limit': 18, @@ -47,6 +48,7 @@ class YouPornIE(InfoExtractor): 'upload_date': '20111125', 'average_rating': int, 'view_count': int, + 'comment_count': int, 'categories': list, 'tags': list, 'age_limit': 18, @@ -135,6 +137,9 @@ class YouPornIE(InfoExtractor): view_count = str_to_int(self._search_regex( r'(?s)<div[^>]+class=["\']videoInfoViews["\'][^>]*>.*?([\d,.]+)\s*</div>', webpage, 'view count', fatal=False)) + comment_count = str_to_int(self._search_regex( + r'>All [Cc]omments? \(([\d,.]+)\)', + webpage, 'comment count', fatal=False)) def extract_tag_box(title): tag_box = self._search_regex( @@ -158,6 +163,7 @@ class YouPornIE(InfoExtractor): 'upload_date': upload_date, 'average_rating': average_rating, 'view_count': view_count, + 'comment_count': comment_count, 'categories': categories, 'tags': tags, 'age_limit': age_limit, -- cgit v1.2.3 From 36e6f62cd0883f0f486d1666d010e5d9e6d515bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 25 Oct 2015 20:04:55 +0100 Subject: Use a wrapper around xml.etree.ElementTree.fromstring in python 2.x (#7178) Attributes aren't unicode objects, so they couldn't be directly used in info_dict fields (for example '--write-description' doesn't work with bytes). --- youtube_dl/extractor/bbc.py | 8 +++++--- youtube_dl/extractor/bilibili.py | 6 ++++-- youtube_dl/extractor/brightcove.py | 4 ++-- youtube_dl/extractor/common.py | 4 ++-- youtube_dl/extractor/crunchyroll.py | 4 ++-- youtube_dl/extractor/vevo.py | 6 +++--- 6 files changed, 18 insertions(+), 14 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 2cdce1eb9..a55a6dbc9 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -14,7 +13,10 @@ from ..utils import ( remove_end, unescapeHTML, ) -from ..compat import compat_HTTPError +from ..compat import ( + compat_etree_fromstring, + compat_HTTPError, +) class BBCCoUkIE(InfoExtractor): @@ -344,7 +346,7 @@ class BBCCoUkIE(InfoExtractor): url, programme_id, 'Downloading media selection XML') except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8')) + media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8')) else: raise return self._process_media_selector(media_selection, programme_id) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index ecc17ebeb..6c66a1236 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -4,9 +4,11 @@ from __future__ import unicode_literals import re import itertools import json -import xml.etree.ElementTree as ET from .common import InfoExtractor +from ..compat import ( + compat_etree_fromstring, +) from ..utils import ( int_or_none, unified_strdate, @@ -88,7 +90,7 @@ class BiliBiliIE(InfoExtractor): except ValueError: pass - lq_doc = ET.fromstring(lq_page) + lq_doc = compat_etree_fromstring(lq_page) lq_durls = lq_doc.findall('./durl') hq_doc = self._download_xml( diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 4721c2293..1686cdde1 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -3,10 +3,10 @@ from __future__ import unicode_literals import re import json -import xml.etree.ElementTree from .common import InfoExtractor from ..compat import ( + compat_etree_fromstring, compat_parse_qs, compat_str, compat_urllib_parse, @@ -119,7 +119,7 @@ class BrightcoveIE(InfoExtractor): object_str = fix_xml_ampersands(object_str) try: - object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8')) + object_doc = compat_etree_fromstring(object_str.encode('utf-8')) except compat_xml_parse_error: return diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 10c0d5d1f..52523d7b2 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -10,7 +10,6 @@ import re import socket import sys import time -import xml.etree.ElementTree from ..compat import ( compat_cookiejar, @@ -23,6 +22,7 @@ from ..compat import ( compat_urllib_request, compat_urlparse, compat_str, + compat_etree_fromstring, ) from ..utils import ( NO_DEFAULT, @@ -461,7 +461,7 @@ class InfoExtractor(object): return xml_string if transform_source: xml_string = transform_source(xml_string) - return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) + return compat_etree_fromstring(xml_string.encode('utf-8')) def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index f8ce10111..0c9b8ca02 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -5,12 +5,12 @@ import re import json import base64 import zlib -import xml.etree.ElementTree from hashlib import sha1 from math import pow, sqrt, floor from .common import InfoExtractor from ..compat import ( + compat_etree_fromstring, compat_urllib_parse, compat_urllib_parse_unquote, compat_urllib_request, @@ -234,7 +234,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text return output def _extract_subtitles(self, subtitle): - sub_root = xml.etree.ElementTree.fromstring(subtitle) + sub_root = compat_etree_fromstring(subtitle) return [{ 'ext': 'srt', 'data': self._convert_subtitles_to_srt(sub_root), diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index c17094f81..4c0de354f 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -1,10 +1,10 @@ from __future__ import unicode_literals import re -import xml.etree.ElementTree from .common import InfoExtractor from ..compat import ( + compat_etree_fromstring, compat_urllib_request, ) from ..utils import ( @@ -97,7 +97,7 @@ class VevoIE(InfoExtractor): if last_version['version'] == -1: raise ExtractorError('Unable to extract last version of the video') - renditions = xml.etree.ElementTree.fromstring(last_version['data']) + renditions = compat_etree_fromstring(last_version['data']) formats = [] # Already sorted from worst to best quality for rend in renditions.findall('rendition'): @@ -114,7 +114,7 @@ class VevoIE(InfoExtractor): def _formats_from_smil(self, smil_xml): formats = [] - smil_doc = xml.etree.ElementTree.fromstring(smil_xml.encode('utf-8')) + smil_doc = compat_etree_fromstring(smil_xml.encode('utf-8')) els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video') for el in els: src = el.attrib['src'] -- cgit v1.2.3 From 5f9f87c06fa819c75e59f5d0d491d191c229abbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= <lukas@oxygene.sk> Date: Mon, 26 Oct 2015 14:42:17 +0100 Subject: [vidme] Check for deleted videos --- youtube_dl/extractor/vidme.py | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index eb5cde761..3d63ed4f0 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -101,6 +101,10 @@ class VidmeIE(InfoExtractor): # suspended 'url': 'https://vid.me/Ox3G', 'only_matching': True, + }, { + # deleted + 'url': 'https://vid.me/KTPm', + 'only_matching': True, }, { # no formats in the API response 'url': 'https://vid.me/e5g', @@ -143,6 +147,11 @@ class VidmeIE(InfoExtractor): video = response['video'] + if video.get('state') == 'deleted': + raise ExtractorError( + 'Vidme said: Sorry, this video has been deleted.', + expected=True) + if video.get('state') in ('user-disabled', 'suspended'): raise ExtractorError( 'Vidme said: This video has been suspended either due to a copyright claim, ' -- cgit v1.2.3 From 5dadae079bd053c822353b081e94d9daff333208 Mon Sep 17 00:00:00 2001 From: Frans de Jonge <fransdejonge@gmail.com> Date: Mon, 26 Oct 2015 15:11:09 +0100 Subject: [francetv] Add subtitles support --- youtube_dl/extractor/francetv.py | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 129984a5f..eaaa43958 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -83,6 +83,16 @@ class FranceTVBaseInfoExtractor(InfoExtractor): if subtitle: title += ' - %s' % subtitle + subtitles = {} + for subtitle_accessibilite in info['subtitles']: + if subtitle_accessibilite['url'] is not '': + if not subtitles: + subtitles['fr'] = [] + subtitles['fr'].append({ + 'ext': subtitle_accessibilite['format'], + 'url': subtitle_accessibilite['url'], + }) + return { 'id': video_id, 'title': title, @@ -91,6 +101,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor): 'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']), 'timestamp': int_or_none(info['diffusion']['timestamp']), 'formats': formats, + 'subtitles': subtitles, } -- cgit v1.2.3 From 6e4b8b28916aaafc6d1b4b4d69a6f667e35d413f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 26 Oct 2015 20:35:28 +0600 Subject: [francetv] Make subtitles more robust (Closes #7298) --- youtube_dl/extractor/francetv.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index eaaa43958..07115b9d4 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -84,14 +84,12 @@ class FranceTVBaseInfoExtractor(InfoExtractor): title += ' - %s' % subtitle subtitles = {} - for subtitle_accessibilite in info['subtitles']: - if subtitle_accessibilite['url'] is not '': - if not subtitles: - subtitles['fr'] = [] - subtitles['fr'].append({ - 'ext': subtitle_accessibilite['format'], - 'url': subtitle_accessibilite['url'], - }) + subtitles_list = [{ + 'url': subtitle['url'], + 'ext': subtitle.get('format'), + } for subtitle in info.get('subtitles', []) if subtitle.get('url')] + if subtitles_list: + subtitles['fr'] = subtitles_list return { 'id': video_id, -- cgit v1.2.3 From c137cc0d33fee8369b857c8f12a9116379248127 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 26 Oct 2015 20:35:45 +0600 Subject: [francetv] Add subtitles test --- youtube_dl/extractor/francetv.py | 3 +++ 1 file changed, 3 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 07115b9d4..a31cc3c97 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -129,6 +129,9 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): 'title': 'Soir 3', 'upload_date': '20130826', 'timestamp': 1377548400, + 'subtitles': { + 'fr': 'mincount:2', + }, }, }, { 'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html', -- cgit v1.2.3 From f78546272cf7c4b10c8003870728ab69bec982fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 26 Oct 2015 16:41:24 +0100 Subject: [compat] compat_etree_fromstring: also decode the text attribute Deletes parse_xml from utils, because it also does it. --- youtube_dl/extractor/ard.py | 4 ++-- youtube_dl/extractor/generic.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 6f465789b..73be6d204 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -14,8 +14,8 @@ from ..utils import ( parse_duration, unified_strdate, xpath_text, - parse_xml, ) +from ..compat import compat_etree_fromstring class ARDMediathekIE(InfoExtractor): @@ -161,7 +161,7 @@ class ARDMediathekIE(InfoExtractor): raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True) if re.search(r'[\?&]rss($|[=&])', url): - doc = parse_xml(webpage) + doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': return GenericIE()._extract_rss(url, video_id, doc) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ca5fbafb2..1de96b268 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -9,6 +9,7 @@ import sys from .common import InfoExtractor from .youtube import YoutubeIE from ..compat import ( + compat_etree_fromstring, compat_urllib_parse_unquote, compat_urllib_request, compat_urlparse, @@ -21,7 +22,6 @@ from ..utils import ( HEADRequest, is_html, orderedSet, - parse_xml, smuggle_url, unescapeHTML, unified_strdate, @@ -1237,7 +1237,7 @@ class GenericIE(InfoExtractor): # Is it an RSS feed, a SMIL file or a XSPF playlist? try: - doc = parse_xml(webpage) + doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': return self._extract_rss(url, video_id, doc) elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): -- cgit v1.2.3 From a526167d40983e47231d10c09c9f9064e0298604 Mon Sep 17 00:00:00 2001 From: Pierre Fenoll <pierrefenoll@gmail.com> Date: Tue, 27 Oct 2015 11:58:59 +0100 Subject: [francetv] Accept mobile URLs --- youtube_dl/extractor/francetv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index a31cc3c97..d63dc4d7c 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -105,7 +105,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor): class PluzzIE(FranceTVBaseInfoExtractor): IE_NAME = 'pluzz.francetv.fr' - _VALID_URL = r'https?://pluzz\.francetv\.fr/videos/(.*?)\.html' + _VALID_URL = r'https?://(?:m\.)?pluzz\.francetv\.fr/videos/(.*?)\.html' # Can't use tests, videos expire in 7 days -- cgit v1.2.3 From 0a192fbea798c843ad6fef37106901d431f39b6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 27 Oct 2015 21:43:29 +0600 Subject: [pluzz] Fix mobile support and modernize (Closes #7305) --- youtube_dl/extractor/francetv.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index d63dc4d7c..00a80ba61 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -105,15 +105,21 @@ class FranceTVBaseInfoExtractor(InfoExtractor): class PluzzIE(FranceTVBaseInfoExtractor): IE_NAME = 'pluzz.francetv.fr' - _VALID_URL = r'https?://(?:m\.)?pluzz\.francetv\.fr/videos/(.*?)\.html' + _VALID_URL = r'https?://(?:m\.)?pluzz\.francetv\.fr/videos/(?P<id>.+?)\.html' # Can't use tests, videos expire in 7 days def _real_extract(self, url): - title = re.match(self._VALID_URL, url).group(1) - webpage = self._download_webpage(url, title) - video_id = self._search_regex( - r'data-diffusion="(\d+)"', webpage, 'ID') + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_id = self._html_search_meta( + 'id_video', webpage, 'video id', default=None) + if not video_id: + video_id = self._search_regex( + r'data-diffusion=["\'](\d+)', webpage, 'video id') + return self._extract_video(video_id, 'Pluzz') -- cgit v1.2.3 From 7ccb2b84ddb65f41d01037b5b62301886be9d22c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 28 Oct 2015 08:22:04 +0100 Subject: [francetv] fix style issues reported by flake8 * Don't redefine variable in list comprehension * Line missing indentation --- youtube_dl/extractor/francetv.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 00a80ba61..8e60cf60f 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -85,9 +85,9 @@ class FranceTVBaseInfoExtractor(InfoExtractor): subtitles = {} subtitles_list = [{ - 'url': subtitle['url'], - 'ext': subtitle.get('format'), - } for subtitle in info.get('subtitles', []) if subtitle.get('url')] + 'url': subformat['url'], + 'ext': subformat.get('format'), + } for subformat in info.get('subtitles', []) if subformat.get('url')] if subtitles_list: subtitles['fr'] = subtitles_list @@ -118,7 +118,7 @@ class PluzzIE(FranceTVBaseInfoExtractor): 'id_video', webpage, 'video id', default=None) if not video_id: video_id = self._search_regex( - r'data-diffusion=["\'](\d+)', webpage, 'video id') + r'data-diffusion=["\'](\d+)', webpage, 'video id') return self._extract_video(video_id, 'Pluzz') -- cgit v1.2.3 From 4e16c1f80b009001aaea6f8baca5dfbfa9b58c11 Mon Sep 17 00:00:00 2001 From: Cian Ruane <CianLR@users.noreply.github.com> Date: Fri, 16 Oct 2015 01:23:09 +0100 Subject: [clyp] Add extractor Update __init__.py [clyp.it] Extract ID idiomatically and make duration and description optional --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/clyp.py | 57 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 youtube_dl/extractor/clyp.py (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6318ac4a2..f98e6487e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -90,6 +90,7 @@ from .cliphunter import CliphunterIE from .clipsyndicate import ClipsyndicateIE from .cloudy import CloudyIE from .clubic import ClubicIE +from .clyp import ClypIE from .cmt import CMTIE from .cnet import CNETIE from .cnn import ( diff --git a/youtube_dl/extractor/clyp.py b/youtube_dl/extractor/clyp.py new file mode 100644 index 000000000..906729b30 --- /dev/null +++ b/youtube_dl/extractor/clyp.py @@ -0,0 +1,57 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class ClypIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?clyp\.it/(?P<id>[a-z0-9]+)' + + _TESTS = [{ + 'url': 'https://clyp.it/ojz2wfah', + 'md5': '1d4961036c41247ecfdcc439c0cddcbb', + 'info_dict': { + 'id': 'ojz2wfah', + 'ext': 'mp3', + 'title': 'Krisson80 - bits wip wip', + 'description': '#Krisson80BitsWipWip #chiptune\n#wip', + }, + }, { + 'url': 'https://clyp.it/ojz2wfah', + 'only_matching': True, + }] + + def _real_extract(self, url): + audio_id = self._match_id(url) + api_url = 'https://api.clyp.it/' + audio_id + metadata = self._download_json(api_url, audio_id) + + title = metadata['Title'] + + description = None + if metadata['Description']: description = metadata['Description'] + + duration = None + if metadata['Duration']: duration = int(metadata['Duration']) + + formats = [ + { + 'url': metadata['OggUrl'], + 'format_id': 'ogg', + 'preference': -2 + },{ + 'url': metadata['Mp3Url'], + 'format_id': 'mp3', + 'preference': -1 + }] + + return { + 'id': audio_id, + 'title': title, + 'formats': formats, + 'description': description, + 'duration': duration + } -- cgit v1.2.3 From 03c2c162f9eaac6b474a1be9e985621f5b7b8c10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 28 Oct 2015 21:42:01 +0600 Subject: [clyp] Improve and cleanup (Closes #7194) --- youtube_dl/extractor/clyp.py | 62 ++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 31 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/clyp.py b/youtube_dl/extractor/clyp.py index 906729b30..57e643799 100644 --- a/youtube_dl/extractor/clyp.py +++ b/youtube_dl/extractor/clyp.py @@ -1,16 +1,15 @@ -# coding: utf-8 - from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..utils import ( + float_or_none, + parse_iso8601, +) class ClypIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?clyp\.it/(?P<id>[a-z0-9]+)' - - _TESTS = [{ + _TEST = { 'url': 'https://clyp.it/ojz2wfah', 'md5': '1d4961036c41247ecfdcc439c0cddcbb', 'info_dict': { @@ -18,40 +17,41 @@ class ClypIE(InfoExtractor): 'ext': 'mp3', 'title': 'Krisson80 - bits wip wip', 'description': '#Krisson80BitsWipWip #chiptune\n#wip', + 'duration': 263.21, + 'timestamp': 1443515251, + 'upload_date': '20150929', }, - }, { - 'url': 'https://clyp.it/ojz2wfah', - 'only_matching': True, - }] + } def _real_extract(self, url): audio_id = self._match_id(url) - api_url = 'https://api.clyp.it/' + audio_id - metadata = self._download_json(api_url, audio_id) - title = metadata['Title'] + metadata = self._download_json( + 'https://api.clyp.it/%s' % audio_id, audio_id) + + formats = [] + for secure in ('', 'Secure'): + for ext in ('Ogg', 'Mp3'): + format_id = '%s%s' % (secure, ext) + format_url = metadata.get('%sUrl' % format_id) + if format_url: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'vcodec': 'none', + }) + self._sort_formats(formats) - description = None - if metadata['Description']: description = metadata['Description'] - - duration = None - if metadata['Duration']: duration = int(metadata['Duration']) - - formats = [ - { - 'url': metadata['OggUrl'], - 'format_id': 'ogg', - 'preference': -2 - },{ - 'url': metadata['Mp3Url'], - 'format_id': 'mp3', - 'preference': -1 - }] + title = metadata['Title'] + description = metadata.get('Description') + duration = float_or_none(metadata.get('Duration')) + timestamp = parse_iso8601(metadata.get('DateCreated')) return { 'id': audio_id, 'title': title, - 'formats': formats, 'description': description, - 'duration': duration + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, } -- cgit v1.2.3 From 6fb8ace671db2f2bdcc9cd7ac6b9f81fbd356791 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 29 Oct 2015 22:44:01 +0600 Subject: [moniker] Add support for builtin embedded videos (Closes #7244) --- youtube_dl/extractor/moniker.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/moniker.py b/youtube_dl/extractor/moniker.py index 69e4bcd1a..204c03c4a 100644 --- a/youtube_dl/extractor/moniker.py +++ b/youtube_dl/extractor/moniker.py @@ -17,7 +17,7 @@ from ..utils import ( class MonikerIE(InfoExtractor): IE_DESC = 'allmyvideos.net and vidspot.net' - _VALID_URL = r'https?://(?:www\.)?(?:allmyvideos|vidspot)\.net/(?P<id>[a-zA-Z0-9_-]+)' + _VALID_URL = r'https?://(?:www\.)?(?:allmyvideos|vidspot)\.net/(?:(?:2|v)/v-)?(?P<id>[a-zA-Z0-9_-]+)' _TESTS = [{ 'url': 'http://allmyvideos.net/jih3nce3x6wn', @@ -64,18 +64,30 @@ class MonikerIE(InfoExtractor): raise ExtractorError( '%s returned error: %s' % (self.IE_NAME, error), expected=True) - fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', orig_webpage) - data = dict(fields) + builtin_url = self._search_regex( + r'<iframe[^>]+src=(["\'])(?P<url>.+?/builtin-.+?)\1', + orig_webpage, 'builtin URL', default=None, group='url') - post = compat_urllib_parse.urlencode(data) - headers = { - b'Content-Type': b'application/x-www-form-urlencoded', - } - req = compat_urllib_request.Request(url, post, headers) - webpage = self._download_webpage( - req, video_id, note='Downloading video page ...') + if builtin_url: + req = compat_urllib_request.Request(builtin_url) + req.add_header('Referer', url) + webpage = self._download_webpage(req, video_id, 'Downloading builtin page') + title = self._og_search_title(orig_webpage).strip() + description = self._og_search_description(orig_webpage).strip() + else: + fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', orig_webpage) + data = dict(fields) + + post = compat_urllib_parse.urlencode(data) + headers = { + b'Content-Type': b'application/x-www-form-urlencoded', + } + req = compat_urllib_request.Request(url, post, headers) + webpage = self._download_webpage( + req, video_id, note='Downloading video page ...') - title = os.path.splitext(data['fname'])[0] + title = os.path.splitext(data['fname'])[0] + description = None # Could be several links with different quality links = re.findall(r'"file" : "?(.+?)",', webpage) @@ -89,5 +101,6 @@ class MonikerIE(InfoExtractor): return { 'id': video_id, 'title': title, + 'description': description, 'formats': formats, } -- cgit v1.2.3 From 721f5a277ca0012ee72c9d4b3e5550e52a0a596d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 29 Oct 2015 22:47:18 +0600 Subject: [moniker] Add tests for #7244 --- youtube_dl/extractor/moniker.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/moniker.py b/youtube_dl/extractor/moniker.py index 204c03c4a..7c0c4e50e 100644 --- a/youtube_dl/extractor/moniker.py +++ b/youtube_dl/extractor/moniker.py @@ -46,6 +46,18 @@ class MonikerIE(InfoExtractor): }, { 'url': 'https://www.vidspot.net/l2ngsmhs8ci5', 'only_matching': True, + }, { + 'url': 'http://vidspot.net/2/v-ywDf99', + 'md5': '5f8254ce12df30479428b0152fb8e7ba', + 'info_dict': { + 'id': 'ywDf99', + 'ext': 'mp4', + 'title': 'IL FAIT LE MALIN EN PORSHE CAYENNE ( mais pas pour longtemps)', + 'description': 'IL FAIT LE MALIN EN PORSHE CAYENNE.', + }, + }, { + 'url': 'http://allmyvideos.net/v/v-HXZm5t', + 'only_matching': True, }] def _real_extract(self, url): -- cgit v1.2.3 From 6722ebd43720e836af8217fa078fa1a604b98229 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 30 Oct 2015 21:00:36 +0600 Subject: [anitube] Relax key regex (Closes #7303) Another variant seen http://anitubebr.xpg.uol.com.br/embed/ --- youtube_dl/extractor/anitube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/anitube.py b/youtube_dl/extractor/anitube.py index 31f0d417c..23f942ae2 100644 --- a/youtube_dl/extractor/anitube.py +++ b/youtube_dl/extractor/anitube.py @@ -26,8 +26,8 @@ class AnitubeIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - key = self._html_search_regex( - r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)', webpage, 'key') + key = self._search_regex( + r'src=["\']https?://[^/]+/embed/([A-Za-z0-9_-]+)', webpage, 'key') config_xml = self._download_xml( 'http://www.anitube.se/nuevo/econfig.php?key=%s' % key, key) -- cgit v1.2.3 From 47f2d01a5ac074f6959aa12e8bc00310f18a54e8 Mon Sep 17 00:00:00 2001 From: Lucas <mikotosc@gmail.com> Date: Thu, 24 Sep 2015 22:19:09 +0200 Subject: Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/kika.py | 115 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 youtube_dl/extractor/kika.py (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f98e6487e..5ad4e9c36 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -274,6 +274,7 @@ from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE +from .kika import KikaIE from .keek import KeekIE from .kontrtube import KontrTubeIE from .krasview import KrasViewIE diff --git a/youtube_dl/extractor/kika.py b/youtube_dl/extractor/kika.py new file mode 100644 index 000000000..db0f333ff --- /dev/null +++ b/youtube_dl/extractor/kika.py @@ -0,0 +1,115 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class KikaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?kika\.de/(?:[a-z-]+/)*(?:video|sendung)(?P<id>\d+).*' + + _TESTS = [ + { + 'url': 'http://www.kika.de/baumhaus/videos/video9572.html', + 'md5': '94fc748cf5d64916571d275a07ffe2d5', + 'info_dict': { + 'id': '9572', + 'ext': 'mp4', + 'title': 'Baumhaus vom 29. Oktober 2014', + 'description': None + } + }, + { + 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', + 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', + 'info_dict': { + 'id': '8182', + 'ext': 'mp4', + 'title': 'Beutolomäus und der geheime Weihnachtswunsch', + 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd' + } + }, + { + 'url': 'http://www.kika.de/videos/allevideos/video9572_zc-32ca94ad_zs-3f535991.html', + 'md5': '94fc748cf5d64916571d275a07ffe2d5', + 'info_dict': { + 'id': '9572', + 'ext': 'mp4', + 'title': 'Baumhaus vom 29. Oktober 2014', + 'description': None + } + }, + { + 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/sendung81244_zc-81d703f8_zs-f82d5e31.html', + 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', + 'info_dict': { + 'id': '8182', + 'ext': 'mp4', + 'title': 'Beutolomäus und der geheime Weihnachtswunsch', + 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd' + } + } + ] + + def _real_extract(self, url): + # broadcast_id may be the same as the video_id + broadcast_id = self._match_id(url) + webpage = self._download_webpage(url, broadcast_id) + + xml_re = r'sectionArticle[ "](?:(?!sectionA[ "])(?:.|\n))*?dataURL:\'(?:/[a-z-]+?)*?/video(\d+)-avCustom\.xml' + video_id = self._search_regex(xml_re, webpage, "xml_url", default=None) + if not video_id: + # Video is not available online + err_msg = 'Video %s is not available online' % broadcast_id + raise ExtractorError(err_msg, expected=True) + + xml_url = 'http://www.kika.de/video%s-avCustom.xml' % (video_id) + xml_tree = self._download_xml(xml_url, video_id) + + title = xml_tree.find('title').text + webpage_url = xml_tree.find('htmlUrl').text + + # Try to get the description, not available for all videos + try: + broadcast_elem = xml_tree.find('broadcast') + description = broadcast_elem.find('broadcastDescription').text + except AttributeError: + # No description available + description = None + + # duration string format is mm:ss (even if it is >= 1 hour, e.g. 78:42) + tmp = xml_tree.find('duration').text.split(':') + duration = int(tmp[0]) * 60 + int(tmp[1]) + + formats_list = [] + for elem in xml_tree.find('assets'): + format_dict = {} + format_dict['url'] = elem.find('progressiveDownloadUrl').text + format_dict['ext'] = elem.find('mediaType').text.lower() + format_dict['format'] = elem.find('profileName').text + width = int(elem.find('frameWidth').text) + height = int(elem.find('frameHeight').text) + format_dict['width'] = width + format_dict['height'] = height + format_dict['resolution'] = '%dx%d' % (width, height) + format_dict['abr'] = int(elem.find('bitrateAudio').text) + format_dict['vbr'] = int(elem.find('bitrateVideo').text) + format_dict['tbr'] = format_dict['abr'] + format_dict['vbr'] + format_dict['filesize'] = int(elem.find('fileSize').text) + + # append resolution and dict for sorting by resolution + formats_list.append((width * height, format_dict)) + + # Sort by resolution (=quality) + formats_list.sort() + + out_list = [x[1] for x in formats_list] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': out_list, + 'duration': duration, + 'webpage_url': webpage_url + } -- cgit v1.2.3 From 892015b088fa21915270b0a05937fcc7063ccdd2 Mon Sep 17 00:00:00 2001 From: Lucas <mikotosc@gmail.com> Date: Mon, 28 Sep 2015 22:00:56 +0200 Subject: replaced inefficient code --- youtube_dl/extractor/kika.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/kika.py b/youtube_dl/extractor/kika.py index db0f333ff..871e4ea44 100644 --- a/youtube_dl/extractor/kika.py +++ b/youtube_dl/extractor/kika.py @@ -87,29 +87,25 @@ class KikaIE(InfoExtractor): format_dict['url'] = elem.find('progressiveDownloadUrl').text format_dict['ext'] = elem.find('mediaType').text.lower() format_dict['format'] = elem.find('profileName').text - width = int(elem.find('frameWidth').text) - height = int(elem.find('frameHeight').text) - format_dict['width'] = width - format_dict['height'] = height - format_dict['resolution'] = '%dx%d' % (width, height) + format_dict['width'] = int(elem.find('frameWidth').text) + format_dict['height'] = int(elem.find('frameHeight').text) + format_dict['resolution'] = '%dx%d' % (format_dict['width'], + format_dict['height']) format_dict['abr'] = int(elem.find('bitrateAudio').text) format_dict['vbr'] = int(elem.find('bitrateVideo').text) format_dict['tbr'] = format_dict['abr'] + format_dict['vbr'] format_dict['filesize'] = int(elem.find('fileSize').text) - # append resolution and dict for sorting by resolution - formats_list.append((width * height, format_dict)) + formats_list.append(format_dict) # Sort by resolution (=quality) - formats_list.sort() - - out_list = [x[1] for x in formats_list] + formats_list.sort(key=lambda x: x['width'] * x['height']) return { 'id': video_id, 'title': title, 'description': description, - 'formats': out_list, + 'formats': formats_list, 'duration': duration, 'webpage_url': webpage_url } -- cgit v1.2.3 From 78d7ee19dc417b16b26fe2fa1101124866727a85 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 31 Oct 2015 22:21:52 +0800 Subject: [democracynow] Fix _TESTS --- youtube_dl/extractor/democracynow.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index 973bb437b..05cfc7502 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -36,10 +36,9 @@ class DemocracynowIE(InfoExtractor): if display_id == '': display_id = 'home' webpage = self._download_webpage(url, display_id) - re_desc = re.search(r'<meta property=.og:description. content=(["\'])(.+?)\1', webpage, re.DOTALL) - description = re_desc.group(2) if re_desc else '' + description = self._og_search_description(webpage) - jstr = self._search_regex(r'({.+?"related_video_xml".+?})', webpage, 'json', default=None) + jstr = self._search_regex(r'<script[^>]+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json') js = self._parse_json(jstr, display_id) video_id = None formats = [] @@ -56,7 +55,7 @@ class DemocracynowIE(InfoExtractor): 'ext': ext, 'url': url, }] - for key in ('file', 'audio'): + for key in ('file', 'audio', 'video'): url = js.get(key, '') if url == '' or url is None: continue -- cgit v1.2.3 From 8c1aa28c27af204c4996260cdc70359e83c2c3d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 31 Oct 2015 16:14:36 +0100 Subject: [kika] Replace non working tests and recognize 'einzelsendung' urls. --- youtube_dl/extractor/kika.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/kika.py b/youtube_dl/extractor/kika.py index 871e4ea44..c9169076a 100644 --- a/youtube_dl/extractor/kika.py +++ b/youtube_dl/extractor/kika.py @@ -6,16 +6,16 @@ from ..utils import ExtractorError class KikaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?kika\.de/(?:[a-z-]+/)*(?:video|sendung)(?P<id>\d+).*' + _VALID_URL = r'https?://(?:www\.)?kika\.de/(?:[a-z-]+/)*(?:video|(?:einzel)?sendung)(?P<id>\d+).*' _TESTS = [ { - 'url': 'http://www.kika.de/baumhaus/videos/video9572.html', - 'md5': '94fc748cf5d64916571d275a07ffe2d5', + 'url': 'http://www.kika.de/baumhaus/videos/video19636.html', + 'md5': '4930515e36b06c111213e80d1e4aad0e', 'info_dict': { - 'id': '9572', + 'id': '19636', 'ext': 'mp4', - 'title': 'Baumhaus vom 29. Oktober 2014', + 'title': 'Baumhaus vom 30. Oktober 2015', 'description': None } }, @@ -30,17 +30,17 @@ class KikaIE(InfoExtractor): } }, { - 'url': 'http://www.kika.de/videos/allevideos/video9572_zc-32ca94ad_zs-3f535991.html', - 'md5': '94fc748cf5d64916571d275a07ffe2d5', + 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', + 'md5': '4930515e36b06c111213e80d1e4aad0e', 'info_dict': { - 'id': '9572', + 'id': '19636', 'ext': 'mp4', - 'title': 'Baumhaus vom 29. Oktober 2014', + 'title': 'Baumhaus vom 30. Oktober 2015', 'description': None } }, { - 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/sendung81244_zc-81d703f8_zs-f82d5e31.html', + 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', 'info_dict': { 'id': '8182', -- cgit v1.2.3 From c3040bd00a43e111dab0d1ab903df03ac7d19a00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 31 Oct 2015 16:32:35 +0100 Subject: [kika] Cleanup Closes #6957. --- youtube_dl/extractor/kika.py | 54 ++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 32 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/kika.py b/youtube_dl/extractor/kika.py index c9169076a..5337ac439 100644 --- a/youtube_dl/extractor/kika.py +++ b/youtube_dl/extractor/kika.py @@ -16,8 +16,8 @@ class KikaIE(InfoExtractor): 'id': '19636', 'ext': 'mp4', 'title': 'Baumhaus vom 30. Oktober 2015', - 'description': None - } + 'description': None, + }, }, { 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', @@ -26,8 +26,8 @@ class KikaIE(InfoExtractor): 'id': '8182', 'ext': 'mp4', 'title': 'Beutolomäus und der geheime Weihnachtswunsch', - 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd' - } + 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', + }, }, { 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', @@ -36,8 +36,8 @@ class KikaIE(InfoExtractor): 'id': '19636', 'ext': 'mp4', 'title': 'Baumhaus vom 30. Oktober 2015', - 'description': None - } + 'description': None, + }, }, { 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', @@ -46,9 +46,9 @@ class KikaIE(InfoExtractor): 'id': '8182', 'ext': 'mp4', 'title': 'Beutolomäus und der geheime Weihnachtswunsch', - 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd' - } - } + 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', + }, + }, ] def _real_extract(self, url): @@ -59,7 +59,6 @@ class KikaIE(InfoExtractor): xml_re = r'sectionArticle[ "](?:(?!sectionA[ "])(?:.|\n))*?dataURL:\'(?:/[a-z-]+?)*?/video(\d+)-avCustom\.xml' video_id = self._search_regex(xml_re, webpage, "xml_url", default=None) if not video_id: - # Video is not available online err_msg = 'Video %s is not available online' % broadcast_id raise ExtractorError(err_msg, expected=True) @@ -74,38 +73,29 @@ class KikaIE(InfoExtractor): broadcast_elem = xml_tree.find('broadcast') description = broadcast_elem.find('broadcastDescription').text except AttributeError: - # No description available description = None # duration string format is mm:ss (even if it is >= 1 hour, e.g. 78:42) tmp = xml_tree.find('duration').text.split(':') duration = int(tmp[0]) * 60 + int(tmp[1]) - formats_list = [] - for elem in xml_tree.find('assets'): - format_dict = {} - format_dict['url'] = elem.find('progressiveDownloadUrl').text - format_dict['ext'] = elem.find('mediaType').text.lower() - format_dict['format'] = elem.find('profileName').text - format_dict['width'] = int(elem.find('frameWidth').text) - format_dict['height'] = int(elem.find('frameHeight').text) - format_dict['resolution'] = '%dx%d' % (format_dict['width'], - format_dict['height']) - format_dict['abr'] = int(elem.find('bitrateAudio').text) - format_dict['vbr'] = int(elem.find('bitrateVideo').text) - format_dict['tbr'] = format_dict['abr'] + format_dict['vbr'] - format_dict['filesize'] = int(elem.find('fileSize').text) - - formats_list.append(format_dict) - - # Sort by resolution (=quality) - formats_list.sort(key=lambda x: x['width'] * x['height']) + formats = [{ + 'url': elem.find('progressiveDownloadUrl').text, + 'ext': elem.find('mediaType').text.lower(), + 'format': elem.find('profileName').text, + 'width': int(elem.find('frameWidth').text), + 'height': int(elem.find('frameHeight').text), + 'abr': int(elem.find('bitrateAudio').text), + 'vbr': int(elem.find('bitrateVideo').text), + 'filesize': int(elem.find('fileSize').text), + } for elem in xml_tree.find('assets')] + self._sort_formats(formats) return { 'id': video_id, 'title': title, 'description': description, - 'formats': formats_list, + 'formats': formats, 'duration': duration, - 'webpage_url': webpage_url + 'webpage_url': webpage_url, } -- cgit v1.2.3 From 2b1b2d83cacfdce19cae5eea2f9bbfd142efc7f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 31 Oct 2015 22:17:09 +0600 Subject: [mdr] Modernize and include kika.de --- youtube_dl/extractor/__init__.py | 1 - youtube_dl/extractor/kika.py | 101 ----------------------- youtube_dl/extractor/mdr.py | 174 +++++++++++++++++++++++++++++---------- 3 files changed, 132 insertions(+), 144 deletions(-) delete mode 100644 youtube_dl/extractor/kika.py (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5ad4e9c36..f98e6487e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -274,7 +274,6 @@ from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE -from .kika import KikaIE from .keek import KeekIE from .kontrtube import KontrTubeIE from .krasview import KrasViewIE diff --git a/youtube_dl/extractor/kika.py b/youtube_dl/extractor/kika.py deleted file mode 100644 index 5337ac439..000000000 --- a/youtube_dl/extractor/kika.py +++ /dev/null @@ -1,101 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ExtractorError - - -class KikaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?kika\.de/(?:[a-z-]+/)*(?:video|(?:einzel)?sendung)(?P<id>\d+).*' - - _TESTS = [ - { - 'url': 'http://www.kika.de/baumhaus/videos/video19636.html', - 'md5': '4930515e36b06c111213e80d1e4aad0e', - 'info_dict': { - 'id': '19636', - 'ext': 'mp4', - 'title': 'Baumhaus vom 30. Oktober 2015', - 'description': None, - }, - }, - { - 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', - 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', - 'info_dict': { - 'id': '8182', - 'ext': 'mp4', - 'title': 'Beutolomäus und der geheime Weihnachtswunsch', - 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', - }, - }, - { - 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', - 'md5': '4930515e36b06c111213e80d1e4aad0e', - 'info_dict': { - 'id': '19636', - 'ext': 'mp4', - 'title': 'Baumhaus vom 30. Oktober 2015', - 'description': None, - }, - }, - { - 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', - 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', - 'info_dict': { - 'id': '8182', - 'ext': 'mp4', - 'title': 'Beutolomäus und der geheime Weihnachtswunsch', - 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', - }, - }, - ] - - def _real_extract(self, url): - # broadcast_id may be the same as the video_id - broadcast_id = self._match_id(url) - webpage = self._download_webpage(url, broadcast_id) - - xml_re = r'sectionArticle[ "](?:(?!sectionA[ "])(?:.|\n))*?dataURL:\'(?:/[a-z-]+?)*?/video(\d+)-avCustom\.xml' - video_id = self._search_regex(xml_re, webpage, "xml_url", default=None) - if not video_id: - err_msg = 'Video %s is not available online' % broadcast_id - raise ExtractorError(err_msg, expected=True) - - xml_url = 'http://www.kika.de/video%s-avCustom.xml' % (video_id) - xml_tree = self._download_xml(xml_url, video_id) - - title = xml_tree.find('title').text - webpage_url = xml_tree.find('htmlUrl').text - - # Try to get the description, not available for all videos - try: - broadcast_elem = xml_tree.find('broadcast') - description = broadcast_elem.find('broadcastDescription').text - except AttributeError: - description = None - - # duration string format is mm:ss (even if it is >= 1 hour, e.g. 78:42) - tmp = xml_tree.find('duration').text.split(':') - duration = int(tmp[0]) * 60 + int(tmp[1]) - - formats = [{ - 'url': elem.find('progressiveDownloadUrl').text, - 'ext': elem.find('mediaType').text.lower(), - 'format': elem.find('profileName').text, - 'width': int(elem.find('frameWidth').text), - 'height': int(elem.find('frameHeight').text), - 'abr': int(elem.find('bitrateAudio').text), - 'vbr': int(elem.find('bitrateVideo').text), - 'filesize': int(elem.find('fileSize').text), - } for elem in xml_tree.find('assets')] - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'formats': formats, - 'duration': duration, - 'webpage_url': webpage_url, - } diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index fc7499958..541ddd909 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -1,64 +1,154 @@ +# coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + determine_ext, + int_or_none, + parse_duration, + parse_iso8601, + xpath_text, +) class MDRIE(InfoExtractor): - _VALID_URL = r'^(?P<domain>https?://(?:www\.)?mdr\.de)/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)(?:_|\.html)' + IE_DESC = 'MDR.DE and KiKA' + _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z]+(?P<id>\d+)(?:_.+?)?\.html' - # No tests, MDR regularily deletes its videos - _TEST = { + _TESTS = [{ + # MDR regularily deletes its videos 'url': 'http://www.mdr.de/fakt/video189002.html', 'only_matching': True, - } + }, { + 'url': 'http://www.kika.de/baumhaus/videos/video19636.html', + 'md5': '4930515e36b06c111213e80d1e4aad0e', + 'info_dict': { + 'id': '19636', + 'ext': 'mp4', + 'title': 'Baumhaus vom 30. Oktober 2015', + 'duration': 134, + 'uploader': 'KIKA', + }, + }, { + 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', + 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', + 'info_dict': { + 'id': '8182', + 'ext': 'mp4', + 'title': 'Beutolomäus und der geheime Weihnachtswunsch', + 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', + 'timestamp': 1419047100, + 'upload_date': '20141220', + 'duration': 4628, + 'uploader': 'KIKA', + }, + }, { + 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', + 'only_matching': True, + }, { + 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', + 'only_matching': True, + }] def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('video_id') - domain = m.group('domain') + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) - # determine title and media streams from webpage - html = self._download_webpage(url, video_id) + data_url = self._search_regex( + r'dataURL\s*:\s*(["\'])(?P<url>/.+/(?:video|audio)[0-9]+-avCustom\.xml)\1', + webpage, 'data url', group='url') - title = self._html_search_regex(r'<h[12]>(.*?)</h[12]>', html, 'title') - xmlurl = self._search_regex( - r'dataURL:\'(/(?:.+)/(?:video|audio)[0-9]+-avCustom.xml)', html, 'XML URL') + doc = self._download_xml( + compat_urlparse.urljoin(url, data_url), video_id) + + title = (xpath_text(doc, './title', 'title', default=None) or + xpath_text(doc, './broadcast/broadcastName', 'title')) - doc = self._download_xml(domain + xmlurl, video_id) formats = [] - for a in doc.findall('./assets/asset'): - url_el = a.find('./progressiveDownloadUrl') - if url_el is None: - continue - abr = int(a.find('bitrateAudio').text) // 1000 - media_type = a.find('mediaType').text - format = { - 'abr': abr, - 'filesize': int(a.find('fileSize').text), - 'url': url_el.text, - } - - vbr_el = a.find('bitrateVideo') - if vbr_el is None: - format.update({ - 'vcodec': 'none', - 'format_id': '%s-%d' % (media_type, abr), - }) - else: - vbr = int(vbr_el.text) // 1000 - format.update({ - 'vbr': vbr, - 'width': int(a.find('frameWidth').text), - 'height': int(a.find('frameHeight').text), - 'format_id': '%s-%d' % (media_type, vbr), - }) - formats.append(format) + processed_urls = [] + for asset in doc.findall('./assets/asset'): + for source in ( + 'progressiveDownload', + 'dynamicHttpStreamingRedirector', + 'adaptiveHttpStreamingRedirector'): + url_el = asset.find('./%sUrl' % source) + if url_el is None: + continue + + video_url = url_el.text + if video_url in processed_urls: + continue + + processed_urls.append(video_url) + + vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000) + abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) + + url_formats = [] + + ext = determine_ext(url_el.text) + if ext == 'm3u8': + url_formats = self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + preference=0, m3u8_id='HLS', fatal=False) + elif ext == 'f4m': + url_formats = self._extract_f4m_formats( + video_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, + preference=0, f4m_id='HDS', fatal=False) + else: + media_type = xpath_text(asset, './mediaType', 'media type', default='MP4') + vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000) + abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) + filesize = int_or_none(xpath_text(asset, './fileSize', 'file size')) + + f = { + 'url': video_url, + 'format_id': '%s-%d' % (media_type, vbr or abr), + 'filesize': filesize, + 'abr': abr, + 'preference': 1, + } + + if vbr: + width = int_or_none(xpath_text(asset, './frameWidth', 'width')) + height = int_or_none(xpath_text(asset, './frameHeight', 'height')) + f.update({ + 'vbr': vbr, + 'width': width, + 'height': height, + }) + + url_formats.append(f) + + if not vbr: + for f in url_formats: + abr = f.get('tbr') or abr + if 'tbr' in f: + del f['tbr'] + f.update({ + 'abr': abr, + 'vcodec': 'none', + }) + + if url_formats: + formats.extend(url_formats) self._sort_formats(formats) + description = xpath_text(doc, './broadcast/broadcastDescription', 'description') + timestamp = parse_iso8601( + xpath_text(doc, './broadcast/broadcastDate', 'timestamp', default=None) or + xpath_text(doc, './broadcast/broadcastStartDate', 'timestamp', default=None)) + duration = parse_duration(xpath_text(doc, './duration', 'duration')) + uploader = xpath_text(doc, './rights', 'uploader') + return { 'id': video_id, 'title': title, + 'description': description, + 'timestamp': timestamp, + 'duration': duration, + 'uploader': uploader, 'formats': formats, } -- cgit v1.2.3 From 8cdb5c845336ad3dc48c85a0558a38bd42972b00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 31 Oct 2015 22:24:21 +0600 Subject: [mdr] Add audio test --- youtube_dl/extractor/mdr.py | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index 541ddd909..e05577496 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -20,6 +20,17 @@ class MDRIE(InfoExtractor): # MDR regularily deletes its videos 'url': 'http://www.mdr.de/fakt/video189002.html', 'only_matching': True, + }, { + # audio + 'url': 'http://www.mdr.de/kultur/audio1312272_zc-15948bad_zs-86171fdd.html', + 'md5': '64c4ee50f0a791deb9479cd7bbe9d2fa', + 'info_dict': { + 'id': '1312272', + 'ext': 'mp3', + 'title': 'Feuilleton vom 30. Oktober 2015', + 'duration': 250, + 'uploader': 'MITTELDEUTSCHER RUNDFUNK', + }, }, { 'url': 'http://www.kika.de/baumhaus/videos/video19636.html', 'md5': '4930515e36b06c111213e80d1e4aad0e', -- cgit v1.2.3 From 11465da70257663ee52c7be50debe1c1e825ec67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 31 Oct 2015 22:45:45 +0600 Subject: [mdr] Simplify xpath --- youtube_dl/extractor/mdr.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index e05577496..a63257c56 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -74,8 +74,7 @@ class MDRIE(InfoExtractor): doc = self._download_xml( compat_urlparse.urljoin(url, data_url), video_id) - title = (xpath_text(doc, './title', 'title', default=None) or - xpath_text(doc, './broadcast/broadcastName', 'title')) + title = xpath_text(doc, ['./title', './broadcast/broadcastName'], 'title', fatal=True) formats = [] processed_urls = [] @@ -149,8 +148,12 @@ class MDRIE(InfoExtractor): description = xpath_text(doc, './broadcast/broadcastDescription', 'description') timestamp = parse_iso8601( - xpath_text(doc, './broadcast/broadcastDate', 'timestamp', default=None) or - xpath_text(doc, './broadcast/broadcastStartDate', 'timestamp', default=None)) + xpath_text( + doc, [ + './broadcast/broadcastDate', + './broadcast/broadcastStartDate', + './broadcast/broadcastEndDate'], + 'timestamp', default=None)) duration = parse_duration(xpath_text(doc, './duration', 'duration')) uploader = xpath_text(doc, './rights', 'uploader') -- cgit v1.2.3 From 82b69a5cbb1455d31916be4f19ab327ae63f313c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 31 Oct 2015 23:00:36 +0600 Subject: [mdr] PEP 8 --- youtube_dl/extractor/mdr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index a63257c56..a566c6a2c 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -20,7 +20,7 @@ class MDRIE(InfoExtractor): # MDR regularily deletes its videos 'url': 'http://www.mdr.de/fakt/video189002.html', 'only_matching': True, - }, { + }, { # audio 'url': 'http://www.mdr.de/kultur/audio1312272_zc-15948bad_zs-86171fdd.html', 'md5': '64c4ee50f0a791deb9479cd7bbe9d2fa', -- cgit v1.2.3 From e327b736ca6a6a1c880b93e09a3b310c354c2c7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 31 Oct 2015 23:05:30 +0600 Subject: [generic] Update test --- youtube_dl/extractor/generic.py | 1 + 1 file changed, 1 insertion(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ca5fbafb2..a84135032 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -141,6 +141,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Automatics, robotics and biocybernetics', 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', + 'upload_date': '20130627', 'formats': 'mincount:16', 'subtitles': 'mincount:1', }, -- cgit v1.2.3 From dc519b5421366a8cac681455a817ae25f7f4aa83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 31 Oct 2015 23:12:57 +0600 Subject: [extractor/common] Make ie_key and IE_NAME return unicode string --- youtube_dl/extractor/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 10c0d5d1f..59c3fa8dc 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -310,11 +310,11 @@ class InfoExtractor(object): @classmethod def ie_key(cls): """A string for getting the InfoExtractor with get_info_extractor""" - return cls.__name__[:-2] + return compat_str(cls.__name__[:-2]) @property def IE_NAME(self): - return type(self).__name__[:-2] + return compat_str(type(self).__name__[:-2]) def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns the response handle """ -- cgit v1.2.3 From 76f0c50d3d3e2eb5903b61da08829699e902916d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 1 Nov 2015 00:01:08 +0600 Subject: [mdr] Fix failed formats processing --- youtube_dl/extractor/mdr.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index a566c6a2c..88334889e 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -96,8 +96,6 @@ class MDRIE(InfoExtractor): vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000) abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) - url_formats = [] - ext = determine_ext(url_el.text) if ext == 'm3u8': url_formats = self._extract_m3u8_formats( @@ -130,7 +128,10 @@ class MDRIE(InfoExtractor): 'height': height, }) - url_formats.append(f) + url_formats = [f] + + if not url_formats: + continue if not vbr: for f in url_formats: @@ -142,8 +143,8 @@ class MDRIE(InfoExtractor): 'vcodec': 'none', }) - if url_formats: - formats.extend(url_formats) + formats.extend(url_formats) + self._sort_formats(formats) description = xpath_text(doc, './broadcast/broadcastDescription', 'description') -- cgit v1.2.3 From dbd82a1d4fff1655920e111cc25a7fd526d7bf9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 1 Nov 2015 00:01:34 +0600 Subject: [extractor/common] Fix m3u8 extraction on failure --- youtube_dl/extractor/common.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 59c3fa8dc..1f09fbb47 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -943,13 +943,14 @@ class InfoExtractor(object): if re.match(r'^https?://', u) else compat_urlparse.urljoin(m3u8_url, u)) - m3u8_doc, urlh = self._download_webpage_handle( + res = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', errnote=errnote or 'Failed to download m3u8 information', fatal=fatal) - if m3u8_doc is False: - return m3u8_doc + if res is False: + return res + m3u8_doc, urlh = res m3u8_url = urlh.geturl() last_info = None last_media = None -- cgit v1.2.3 From 9550ca506fccf9c9d795816cc0a7817ff262ef45 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 31 Oct 2015 19:36:04 +0100 Subject: [utils] change extract_attributes to work in python 2 --- youtube_dl/extractor/brightcove.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index b41cee91b..c6ad1d065 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -383,8 +383,7 @@ class BrightcoveInPageEmbedIE(InfoExtractor): return None def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - account_id, player_id, embed, video_id = mobj.groups() + account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage('http://players.brightcove.net/%s/%s_%s/index.min.js' % (account_id, player_id, embed), video_id) -- cgit v1.2.3 From 80dcee5cd5cbe623a53e0c582e3e3ae170c63e8d Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 31 Oct 2015 04:02:49 +0100 Subject: [eitb] fix info extraction --- youtube_dl/extractor/eitb.py | 65 ++++++++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 21 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py index 2cba82532..fc8f15544 100644 --- a/youtube_dl/extractor/eitb.py +++ b/youtube_dl/extractor/eitb.py @@ -1,39 +1,62 @@ # encoding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from .brightcove import BrightcoveIE -from ..utils import ExtractorError +from ..compat import compat_urllib_request +from ..utils import ( + int_or_none, + unified_strdate, +) class EitbIE(InfoExtractor): IE_NAME = 'eitb.tv' - _VALID_URL = r'https?://www\.eitb\.tv/(eu/bideoa|es/video)/[^/]+/(?P<playlist_id>\d+)/(?P<chapter_id>\d+)' + _VALID_URL = r'https?://www\.eitb\.tv/(eu/bideoa|es/video)/[^/]+/\d+/(?P<id>\d+)' _TEST = { - 'add_ie': ['Brightcove'], - 'url': 'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/2677100210001/2743577154001/lasa-y-zabala-30-anos/', + 'url': 'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/4104995148001/4090227752001/lasa-y-zabala-30-anos/', 'md5': 'edf4436247185adee3ea18ce64c47998', 'info_dict': { - 'id': '2743577154001', + 'id': '4090227752001', 'ext': 'mp4', 'title': '60 minutos (Lasa y Zabala, 30 años)', - # All videos from eitb has this description in the brightcove info - 'description': '.', - 'uploader': 'Euskal Telebista', + 'description': '', + 'duration': 3996760, + 'upload_date': '20131014', }, } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - chapter_id = mobj.group('chapter_id') - webpage = self._download_webpage(url, chapter_id) - bc_url = BrightcoveIE._extract_brightcove_url(webpage) - if bc_url is None: - raise ExtractorError('Could not extract the Brightcove url') - # The BrightcoveExperience object doesn't contain the video id, we set - # it manually - bc_url += '&%40videoPlayer={0}'.format(chapter_id) - return self.url_result(bc_url, BrightcoveIE.ie_key()) + video_id = self._match_id(url) + video_data = self._download_json('http://mam.eitb.eus/mam/REST/ServiceMultiweb/Video/MULTIWEBTV/%s/' % video_id, video_id)['web_media'][0] + + formats = [] + for rendition in video_data['RENDITIONS']: + formats.append({ + 'url': rendition['PMD_URL'], + 'width': int_or_none(rendition.get('FRAME_WIDTH')), + 'height': int_or_none(rendition.get('FRAME_HEIGHT')), + 'tbr': int_or_none(rendition.get('ENCODING_RATE')), + }) + + # TODO: parse f4m manifest + request = compat_urllib_request.Request( + 'http://mam.eitb.eus/mam/REST/ServiceMultiweb/DomainRestrictedSecurity/TokenAuth/', + headers={'Referer': url}) + token_data = self._download_json(request, video_id, fatal=False) + if token_data: + m3u8_formats = self._extract_m3u8_formats('%s?hdnts=%s' % (video_data['HLS_SURL'], token_data['token']), video_id, m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_data['NAME_ES'], + 'description': video_data.get('SHORT_DESC_ES'), + 'thumbnail': video_data.get('STILL_URL'), + 'duration': int_or_none(video_data.get('LENGTH')), + 'upload_date': unified_strdate(video_data.get('BROADCST_DATE')), + 'formats': formats, + } -- cgit v1.2.3 From 8a06999ba0f9c948f8d2a1ef89c73eedbfb09cfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 1 Nov 2015 01:52:33 +0600 Subject: [eitb] Improve, make more robust and extract f4m formats (Closes #7328) --- youtube_dl/extractor/eitb.py | 71 +++++++++++++++++++++++++++++++------------- 1 file changed, 50 insertions(+), 21 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py index fc8f15544..0de8d3dc6 100644 --- a/youtube_dl/extractor/eitb.py +++ b/youtube_dl/extractor/eitb.py @@ -4,14 +4,15 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urllib_request from ..utils import ( + float_or_none, int_or_none, - unified_strdate, + parse_iso8601, ) class EitbIE(InfoExtractor): IE_NAME = 'eitb.tv' - _VALID_URL = r'https?://www\.eitb\.tv/(eu/bideoa|es/video)/[^/]+/\d+/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?eitb\.tv/(?:eu/bideoa|es/video)/[^/]+/\d+/(?P<id>\d+)' _TEST = { 'url': 'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/4104995148001/4090227752001/lasa-y-zabala-30-anos/', @@ -20,43 +21,71 @@ class EitbIE(InfoExtractor): 'id': '4090227752001', 'ext': 'mp4', 'title': '60 minutos (Lasa y Zabala, 30 años)', - 'description': '', - 'duration': 3996760, + 'description': 'Programa de reportajes de actualidad.', + 'duration': 3996.76, + 'timestamp': 1381789200, 'upload_date': '20131014', + 'tags': list, }, } def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json('http://mam.eitb.eus/mam/REST/ServiceMultiweb/Video/MULTIWEBTV/%s/' % video_id, video_id)['web_media'][0] + + video = self._download_json( + 'http://mam.eitb.eus/mam/REST/ServiceMultiweb/Video/MULTIWEBTV/%s/' % video_id, + video_id, 'Downloading video JSON') + + media = video['web_media'][0] formats = [] - for rendition in video_data['RENDITIONS']: + for rendition in media['RENDITIONS']: + video_url = rendition.get('PMD_URL') + if not video_url: + continue + tbr = float_or_none(rendition.get('ENCODING_RATE'), 1000) + format_id = 'http' + if tbr: + format_id += '-%d' % int(tbr) formats.append({ 'url': rendition['PMD_URL'], + 'format_id': format_id, 'width': int_or_none(rendition.get('FRAME_WIDTH')), 'height': int_or_none(rendition.get('FRAME_HEIGHT')), - 'tbr': int_or_none(rendition.get('ENCODING_RATE')), + 'tbr': tbr, }) - # TODO: parse f4m manifest - request = compat_urllib_request.Request( - 'http://mam.eitb.eus/mam/REST/ServiceMultiweb/DomainRestrictedSecurity/TokenAuth/', - headers={'Referer': url}) - token_data = self._download_json(request, video_id, fatal=False) - if token_data: - m3u8_formats = self._extract_m3u8_formats('%s?hdnts=%s' % (video_data['HLS_SURL'], token_data['token']), video_id, m3u8_id='hls', fatal=False) - if m3u8_formats: - formats.extend(m3u8_formats) + hls_url = media.get('HLS_SURL') + if hls_url: + request = compat_urllib_request.Request( + 'http://mam.eitb.eus/mam/REST/ServiceMultiweb/DomainRestrictedSecurity/TokenAuth/', + headers={'Referer': url}) + token_data = self._download_json( + request, video_id, 'Downloading auth token', fatal=False) + if token_data: + token = token_data.get('token') + if token: + m3u8_formats = self._extract_m3u8_formats( + '%s?hdnts=%s' % (hls_url, token), video_id, m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + + hds_url = media.get('HDS_SURL').replace('euskalsvod', 'euskalvod') + if hds_url: + f4m_formats = self._extract_f4m_formats( + '%s?hdcore=3.7.0' % hds_url, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) self._sort_formats(formats) return { 'id': video_id, - 'title': video_data['NAME_ES'], - 'description': video_data.get('SHORT_DESC_ES'), - 'thumbnail': video_data.get('STILL_URL'), - 'duration': int_or_none(video_data.get('LENGTH')), - 'upload_date': unified_strdate(video_data.get('BROADCST_DATE')), + 'title': media.get('NAME_ES') or media.get('name') or media['NAME_EU'], + 'description': media.get('SHORT_DESC_ES') or video.get('desc_group') or media.get('SHORT_DESC_EU'), + 'thumbnail': media.get('STILL_URL') or media.get('THUMBNAIL_URL'), + 'duration': float_or_none(media.get('LENGTH'), 1000), + 'timestamp': parse_iso8601(media.get('BROADCST_DATE'), ' '), + 'tags': media.get('TAGS'), 'formats': formats, } -- cgit v1.2.3 From 999079b4543b4cd5e71a235865fbfefd349eb064 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 1 Nov 2015 15:49:11 +0600 Subject: [eitb] Improve hds extraction --- youtube_dl/extractor/eitb.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py index 0de8d3dc6..357a2196c 100644 --- a/youtube_dl/extractor/eitb.py +++ b/youtube_dl/extractor/eitb.py @@ -70,10 +70,11 @@ class EitbIE(InfoExtractor): if m3u8_formats: formats.extend(m3u8_formats) - hds_url = media.get('HDS_SURL').replace('euskalsvod', 'euskalvod') + hds_url = media.get('HDS_SURL') if hds_url: f4m_formats = self._extract_f4m_formats( - '%s?hdcore=3.7.0' % hds_url, video_id, f4m_id='hds', fatal=False) + '%s?hdcore=3.7.0' % hds_url.replace('euskalsvod', 'euskalvod'), + video_id, f4m_id='hds', fatal=False) if f4m_formats: formats.extend(f4m_formats) -- cgit v1.2.3 From eb97f46e8bd9cb04f0fe5f8a5c13aeeaabeefef5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 2 Nov 2015 12:46:10 +0100 Subject: [mitele] Fix extraction and update test checksum (fixes #7343) --- youtube_dl/extractor/mitele.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 3142fcde2..c595f2077 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -1,7 +1,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import ( + compat_urllib_parse, + compat_urlparse, +) from ..utils import ( encode_dict, get_element_by_attribute, @@ -15,7 +18,7 @@ class MiTeleIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', - 'md5': '757b0b66cbd7e0a97226d7d3156cb3e9', + 'md5': '0ff1a13aebb35d9bc14081ff633dd324', 'info_dict': { 'id': '0NF1jJnxS1Wu3pHrmvFyw2', 'display_id': 'programa-144', @@ -34,6 +37,7 @@ class MiTeleIE(InfoExtractor): config_url = self._search_regex( r'data-config\s*=\s*"([^"]+)"', webpage, 'data config url') + config_url = compat_urlparse.urljoin(url, config_url) config = self._download_json( config_url, display_id, 'Downloading config JSON') -- cgit v1.2.3 From c514b0ec655b23e7804eb18df04daa863d973f32 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sun, 1 Nov 2015 22:12:20 +0100 Subject: [videofy.me] fix info extraction Closes #7339. --- youtube_dl/extractor/videofyme.py | 40 ++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/videofyme.py b/youtube_dl/extractor/videofyme.py index 94f9e9be9..cd3f50a63 100644 --- a/youtube_dl/extractor/videofyme.py +++ b/youtube_dl/extractor/videofyme.py @@ -2,8 +2,8 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - find_xpath_attr, int_or_none, + parse_iso8601, ) @@ -18,33 +18,35 @@ class VideofyMeIE(InfoExtractor): 'id': '1100701', 'ext': 'mp4', 'title': 'This is VideofyMe', - 'description': None, + 'description': '', + 'upload_date': '20130326', + 'timestamp': 1364288959, 'uploader': 'VideofyMe', 'uploader_id': 'thisisvideofyme', 'view_count': int, + 'likes': int, + 'comment_count': int, }, - } def _real_extract(self, url): video_id = self._match_id(url) - config = self._download_xml('http://sunshine.videofy.me/?videoId=%s' % video_id, - video_id) - video = config.find('video') - sources = video.find('sources') - url_node = next(node for node in [find_xpath_attr(sources, 'source', 'id', 'HQ %s' % key) - for key in ['on', 'av', 'off']] if node is not None) - video_url = url_node.find('url').text - view_count = int_or_none(self._search_regex( - r'([0-9]+)', video.find('views').text, 'view count', fatal=False)) + + config = self._download_json('http://vf-player-info-loader.herokuapp.com/%s.json' % video_id, video_id)['videoinfo'] + + video = config.get('video') + blog = config.get('blog', {}) return { 'id': video_id, - 'title': video.find('title').text, - 'url': video_url, - 'thumbnail': video.find('thumb').text, - 'description': video.find('description').text, - 'uploader': config.find('blog/name').text, - 'uploader_id': video.find('identifier').text, - 'view_count': view_count, + 'title': video['title'], + 'url': video['sources']['source']['url'], + 'thumbnail': video.get('thumb'), + 'description': video.get('description'), + 'timestamp': parse_iso8601(video.get('date')), + 'uploader': blog.get('name'), + 'uploader_id': blog.get('identifier'), + 'view_count': int_or_none(self._search_regex(r'([0-9]+)', video.get('views'), 'view count', fatal=False)), + 'likes': int_or_none(video.get('likes')), + 'comment_count': int_or_none(video.get('nrOfComments')), } -- cgit v1.2.3 From dde9fe9788f23f168e0bddaf8ab0470f469165fa Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 3 Nov 2015 18:36:54 +0800 Subject: [democracynow] Simplify --- youtube_dl/extractor/democracynow.py | 80 ++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 39 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index 05cfc7502..824b8e2c5 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -2,11 +2,18 @@ from __future__ import unicode_literals import re +import os.path + from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + url_basename, + remove_start, +) class DemocracynowIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?democracynow.org/?(?P<id>[^\?]*)' + _VALID_URL = r'https?://(?:www\.)?democracynow.org/(?P<id>[^\?]*)' IE_NAME = 'democracynow' _TESTS = [{ 'url': 'http://www.democracynow.org/shows/2015/7/3', @@ -14,9 +21,7 @@ class DemocracynowIE(InfoExtractor): 'id': '2015-0703-001', 'ext': 'mp4', 'title': 'July 03, 2015 - Democracy Now!', - 'description': 'A daily independent global news hour with Amy Goodman & Juan Gonz\xe1lez "What to the Slave is 4th of July?": James Earl Jones Reads Frederick Douglass\u2019 Historic Speech : "This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag : "We Shall Overcome": Remembering Folk Icon, Activist Pete Seeger in His Own Words & Songs', - 'uploader': 'Democracy Now', - 'upload_date': None, + 'description': 'A daily independent global news hour with Amy Goodman & Juan González "What to the Slave is 4th of July?": James Earl Jones Reads Frederick Douglass\u2019 Historic Speech : "This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag : "We Shall Overcome": Remembering Folk Icon, Activist Pete Seeger in His Own Words & Songs', }, }, { 'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree', @@ -25,60 +30,57 @@ class DemocracynowIE(InfoExtractor): 'ext': 'mp4', 'title': '"This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag', 'description': 'md5:4d2bc4f0d29f5553c2210a4bc7761a21', - 'uploader': 'Democracy Now', - 'upload_date': None, }, }] def _real_extract(self, url): display_id = self._match_id(url) - base_host = re.search(r'^(.+?://[^/]+)', url).group(1) - if display_id == '': - display_id = 'home' webpage = self._download_webpage(url, display_id) description = self._og_search_description(webpage) - jstr = self._search_regex(r'<script[^>]+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json') - js = self._parse_json(jstr, display_id) + js = self._parse_json(self._search_regex( + r'<script[^>]+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json'), + display_id) video_id = None formats = [] + + default_lang = 'en' + subtitles = {} - for key in ('caption_file', '.......'): - # ....... = pending vtt support that doesn't clobber srt 'chapter_file': - url = js.get(key, '') - if url == '' or url is None: - continue - if not re.match(r'^https?://', url): - url = base_host + url - ext = re.search(r'\.([^\.]+)$', url).group(1) - subtitles['eng'] = [{ - 'ext': ext, - 'url': url, - }] + + def add_subtitle_item(lang, info_dict): + if lang not in subtitles: + subtitles[lang] = [] + subtitles[lang].append(info_dict) + + # chapter_file are not subtitles + if 'caption_file' in js: + add_subtitle_item(default_lang, { + 'url': compat_urlparse.urljoin(url, js['caption_file']), + }) + + for subtitle_item in js.get('captions', []): + lang = subtitle_item.get('language', '').lower() or default_lang + add_subtitle_item(lang, { + 'url': compat_urlparse.urljoin(url, subtitle_item['url']), + }) + for key in ('file', 'audio', 'video'): - url = js.get(key, '') - if url == '' or url is None: + media_url = js.get(key, '') + if not media_url: continue - if not re.match(r'^https?://', url): - url = base_host + url - purl = re.search(r'/(?P<dir>[^/]+)/(?:dn)?(?P<fn>[^/]+?)\.(?P<ext>[^\.\?]+)(?P<hasparams>\?|$)', url) - if video_id is None: - video_id = purl.group('fn') - if js.get('start') is not None: - url += '&' if purl.group('hasparams') == '?' else '?' - url = url + 'start=' + str(js.get('start')) + media_url = re.sub(r'\?.*', '', compat_urlparse.urljoin(url, media_url)) + video_id = video_id or remove_start(os.path.splitext(url_basename(media_url))[0], 'dn') formats.append({ - 'format_id': purl.group('dir'), - 'ext': purl.group('ext'), - 'url': url, + 'url': media_url, }) + self._sort_formats(formats) - ret = { + + return { 'id': video_id, 'title': js.get('title'), 'description': description, - 'uploader': 'Democracy Now', 'subtitles': subtitles, 'formats': formats, } - return ret -- cgit v1.2.3 From fc68d52bb95dc81ed3d05a5c5397cd3f35ee093a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 3 Nov 2015 21:24:10 +0800 Subject: [democracynow] Add MD5 sums --- youtube_dl/extractor/democracynow.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index 824b8e2c5..70c364e8b 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -17,6 +17,7 @@ class DemocracynowIE(InfoExtractor): IE_NAME = 'democracynow' _TESTS = [{ 'url': 'http://www.democracynow.org/shows/2015/7/3', + 'md5': 'fbb8fe3d7a56a5e12431ce2f9b2fab0d', 'info_dict': { 'id': '2015-0703-001', 'ext': 'mp4', @@ -25,6 +26,7 @@ class DemocracynowIE(InfoExtractor): }, }, { 'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree', + 'md5': 'fbb8fe3d7a56a5e12431ce2f9b2fab0d', 'info_dict': { 'id': '2015-0703-001', 'ext': 'mp4', -- cgit v1.2.3 From 852fad922ffa931b3c90b0b9fdb2fa1c7f965ab4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 3 Nov 2015 20:53:17 +0600 Subject: [vimeo] Fix non-ASCII video passwords (Closes #7352) --- youtube_dl/extractor/vimeo.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 2437ae1eb..cc0d337e8 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -13,6 +13,7 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( + encode_dict, ExtractorError, InAdvancePagedList, int_or_none, @@ -208,10 +209,10 @@ class VimeoIE(VimeoBaseInfoExtractor): if password is None: raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata({ + data = urlencode_postdata(encode_dict({ 'password': password, 'token': token, - }) + })) if url.startswith('http://'): # vimeo only supports https now, but the user can give an http url url = url.replace('http://', 'https://') -- cgit v1.2.3 From 0a0110fc6bbd21850e25541fd0bd4b602ce194e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 3 Nov 2015 21:01:09 +0600 Subject: [vimeo] Fix non-ASCII video passwords (2) --- youtube_dl/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index cc0d337e8..fa07bd59c 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -228,7 +228,7 @@ class VimeoIE(VimeoBaseInfoExtractor): password = self._downloader.params.get('videopassword', None) if password is None: raise ExtractorError('This video is protected by a password, use the --video-password option') - data = compat_urllib_parse.urlencode({'password': password}) + data = urlencode_postdata(encode_dict({'password': password})) pass_url = url + '/check-password' password_request = compat_urllib_request.Request(pass_url, data) password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') -- cgit v1.2.3 From 3fa3ff1bc36aaf82ac0a5e880304cb7aae217b9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 3 Nov 2015 21:06:36 +0600 Subject: [vimeo] Fix non-ASCII login --- youtube_dl/extractor/vimeo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index fa07bd59c..46fb36f21 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -41,13 +41,13 @@ class VimeoBaseInfoExtractor(InfoExtractor): self.report_login() webpage = self._download_webpage(self._LOGIN_URL, None, False) token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata({ + data = urlencode_postdata(encode_dict({ 'action': 'login', 'email': username, 'password': password, 'service': 'vimeo', 'token': token, - }) + })) login_request = compat_urllib_request.Request(self._LOGIN_URL, data) login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') login_request.add_header('Cookie', 'vuid=%s' % vuid) -- cgit v1.2.3 From bfdf891fd36811909aa5d83dc0614eacbb634fcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 3 Nov 2015 21:09:24 +0600 Subject: [vimeo] Fix non-ASCII album passwords --- youtube_dl/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 46fb36f21..b608740b8 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -489,7 +489,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): token, vuid = self._extract_xsrft_and_vuid(webpage) fields['token'] = token fields['password'] = password - post = urlencode_postdata(fields) + post = urlencode_postdata(encode_dict(fields)) password_path = self._search_regex( r'action="([^"]+)"', login_form, 'password URL') password_url = compat_urlparse.urljoin(page_url, password_path) -- cgit v1.2.3 From fd8102820c4d14fdb1ff7e090553211717012f67 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 4 Nov 2015 00:09:55 +0800 Subject: [democracynow] Rename js to json_data --- youtube_dl/extractor/democracynow.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index 70c364e8b..72fc75d80 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -40,7 +40,7 @@ class DemocracynowIE(InfoExtractor): webpage = self._download_webpage(url, display_id) description = self._og_search_description(webpage) - js = self._parse_json(self._search_regex( + json_data = self._parse_json(self._search_regex( r'<script[^>]+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json'), display_id) video_id = None @@ -56,19 +56,19 @@ class DemocracynowIE(InfoExtractor): subtitles[lang].append(info_dict) # chapter_file are not subtitles - if 'caption_file' in js: + if 'caption_file' in json_data: add_subtitle_item(default_lang, { - 'url': compat_urlparse.urljoin(url, js['caption_file']), + 'url': compat_urlparse.urljoin(url, json_data['caption_file']), }) - for subtitle_item in js.get('captions', []): + for subtitle_item in json_data.get('captions', []): lang = subtitle_item.get('language', '').lower() or default_lang add_subtitle_item(lang, { 'url': compat_urlparse.urljoin(url, subtitle_item['url']), }) for key in ('file', 'audio', 'video'): - media_url = js.get(key, '') + media_url = json_data.get(key, '') if not media_url: continue media_url = re.sub(r'\?.*', '', compat_urlparse.urljoin(url, media_url)) @@ -81,7 +81,7 @@ class DemocracynowIE(InfoExtractor): return { 'id': video_id, - 'title': js.get('title'), + 'title': json_data.get('title'), 'description': description, 'subtitles': subtitles, 'formats': formats, -- cgit v1.2.3 From 0aeb9a106e1aad37967e0ee666ed816a7d5eb7c2 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 4 Nov 2015 00:13:00 +0800 Subject: [democracynow] Prevent required fields to be None --- youtube_dl/extractor/democracynow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index 72fc75d80..6cd395e11 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -80,8 +80,8 @@ class DemocracynowIE(InfoExtractor): self._sort_formats(formats) return { - 'id': video_id, - 'title': json_data.get('title'), + 'id': video_id or display_id, + 'title': json_data['title'], 'description': description, 'subtitles': subtitles, 'formats': formats, -- cgit v1.2.3 From ad607563a2fbb5275ea39f7a052c09ffa232e271 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 16:46:26 +0600 Subject: [globo] Separate article extractor --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/globo.py | 140 +++++++++++++++++++++------------------ 2 files changed, 79 insertions(+), 66 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 10286aa88..94150a28f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -212,7 +212,10 @@ from .gfycat import GfycatIE from .giantbomb import GiantBombIE from .giga import GigaIE from .glide import GlideIE -from .globo import GloboIE +from .globo import ( + GloboIE, + GloboArticleIE, +) from .godtube import GodTubeIE from .goldenmoustache import GoldenMoustacheIE from .golem import GolemIE diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 33d6432a6..828e40d76 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -18,75 +18,52 @@ from ..utils import ( class GloboIE(InfoExtractor): - _VALID_URL = 'https?://.+?\.globo\.com/(?P<id>.+)' + _VALID_URL = '(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P<id>\d{7,})' _API_URL_TEMPLATE = 'http://api.globovideos.com/videos/%s/playlist' _SECURITY_URL_TEMPLATE = 'http://security.video.globo.com/videos/%s/hash?player=flash&version=17.0.0.132&resource_id=%s' - _VIDEOID_REGEXES = [ - r'\bdata-video-id="(\d+)"', - r'\bdata-player-videosids="(\d+)"', - r'<div[^>]+\bid="(\d+)"', - ] - _RESIGN_EXPIRATION = 86400 - _TESTS = [ - { - 'url': 'http://globotv.globo.com/sportv/futebol-nacional/v/os-gols-de-atletico-mg-3-x-2-santos-pela-24a-rodada-do-brasileirao/3654973/', - 'md5': '03ebf41cb7ade43581608b7d9b71fab0', - 'info_dict': { - 'id': '3654973', - 'ext': 'mp4', - 'title': 'Os gols de Atlético-MG 3 x 2 Santos pela 24ª rodada do Brasileirão', - 'duration': 251.585, - 'uploader': 'SporTV', - 'uploader_id': 698, - 'like_count': int, - } - }, - { - 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', - 'md5': 'b3ccc801f75cd04a914d51dadb83a78d', - 'info_dict': { - 'id': '3607726', - 'ext': 'mp4', - 'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa', - 'duration': 103.204, - 'uploader': 'Globo.com', - 'uploader_id': 265, - 'like_count': int, - } - }, - { - 'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html', - 'md5': '307fdeae4390ccfe6ba1aa198cf6e72b', - 'info_dict': { - 'id': '3652183', - 'ext': 'mp4', - 'title': 'Receita Federal explica como vai fiscalizar bagagens de quem retorna ao Brasil de avião', - 'duration': 110.711, - 'uploader': 'Rede Globo', - 'uploader_id': 196, - 'like_count': int, - } - }, - { - 'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/', - 'md5': 'c1defca721ce25b2354e927d3e4b3dec', - 'info_dict': { - 'id': '3928201', - 'ext': 'mp4', - 'title': 'Ator e diretor argentino, Ricado Darín fala sobre utopias e suas perdas', - 'duration': 1472.906, - 'uploader': 'Canal Brasil', - 'uploader_id': 705, - 'like_count': int, - } - }, - ] + _TESTS = [{ + 'url': 'http://globotv.globo.com/sportv/futebol-nacional/v/os-gols-de-atletico-mg-3-x-2-santos-pela-24a-rodada-do-brasileirao/3654973/', + 'md5': '03ebf41cb7ade43581608b7d9b71fab0', + 'info_dict': { + 'id': '3654973', + 'ext': 'mp4', + 'title': 'Os gols de Atlético-MG 3 x 2 Santos pela 24ª rodada do Brasileirão', + 'duration': 251.585, + 'uploader': 'SporTV', + 'uploader_id': 698, + 'like_count': int, + } + }, { + 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', + 'md5': 'b3ccc801f75cd04a914d51dadb83a78d', + 'info_dict': { + 'id': '3607726', + 'ext': 'mp4', + 'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa', + 'duration': 103.204, + 'uploader': 'Globo.com', + 'uploader_id': 265, + 'like_count': int, + } + }, { + 'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/', + 'md5': 'c1defca721ce25b2354e927d3e4b3dec', + 'info_dict': { + 'id': '3928201', + 'ext': 'mp4', + 'title': 'Ator e diretor argentino, Ricado Darín fala sobre utopias e suas perdas', + 'duration': 1472.906, + 'uploader': 'Canal Brasil', + 'uploader_id': 705, + 'like_count': int, + } + }] - class MD5(): + class MD5: HEX_FORMAT_LOWERCASE = 0 HEX_FORMAT_UPPERCASE = 1 BASE64_PAD_CHARACTER_DEFAULT_COMPLIANCE = '' @@ -353,9 +330,6 @@ class GloboIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_id = self._search_regex(self._VIDEOID_REGEXES, webpage, 'video id') - video = self._download_json( self._API_URL_TEMPLATE % video_id, video_id)['videos'][0] @@ -417,3 +391,39 @@ class GloboIE(InfoExtractor): 'like_count': like_count, 'formats': formats } + + +class GloboArticleIE(InfoExtractor): + _VALID_URL = 'https?://.+?\.globo\.com/(?:[^/]+/)*(?P<id>[^/]+)\.html' + + _VIDEOID_REGEXES = [ + r'\bdata-video-id=["\'](\d{7,})', + r'\bdata-player-videosids=["\'](\d{7,})', + r'\bvideosIDs\s*:\s*["\'](\d{7,})', + r'\bdata-id=["\'](\d{7,})', + r'<div[^>]+\bid=["\'](\d{7,})', + ] + + _TEST = { + 'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html', + 'md5': '307fdeae4390ccfe6ba1aa198cf6e72b', + 'info_dict': { + 'id': '3652183', + 'ext': 'mp4', + 'title': 'Receita Federal explica como vai fiscalizar bagagens de quem retorna ao Brasil de avião', + 'duration': 110.711, + 'uploader': 'Rede Globo', + 'uploader_id': 196, + 'like_count': int, + } + } + + @classmethod + def suitable(cls, url): + return False if GloboIE.suitable(url) else super(GloboArticleIE, cls).suitable(url) + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex(self._VIDEOID_REGEXES, webpage, 'video id') + return self.url_result('globo:%s' % video_id, 'Globo') -- cgit v1.2.3 From e3778cce0e912f803ea10cb806406f7fcafe840f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 16:51:19 +0600 Subject: [globo] Improve m3u8 extraction --- youtube_dl/extractor/globo.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 828e40d76..c28899011 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -367,7 +367,10 @@ class GloboIE(InfoExtractor): resource_url = resource['url'] signed_url = '%s?h=%s&k=%s' % (resource_url, signed_hash, 'flash') if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(signed_url, resource_id, 'mp4')) + m3u8_formats = self._extract_m3u8_formats( + signed_url, resource_id, 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) else: formats.append({ 'url': signed_url, -- cgit v1.2.3 From c3459d24f16056e8ae8f982db2a10871ef18e80a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 16:53:21 +0600 Subject: [globo] Skip unsupported smooth streaming --- youtube_dl/extractor/globo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index c28899011..ec451bb07 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -338,7 +338,7 @@ class GloboIE(InfoExtractor): formats = [] for resource in video['resources']: resource_id = resource.get('_id') - if not resource_id: + if not resource_id or resource_id.endswith('manifest'): continue security = self._download_json( -- cgit v1.2.3 From 5d235ca7f66af1f82c1a4d753d238f48fc3afa40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 16:55:39 +0600 Subject: [globo] Prefer native m3u8 --- youtube_dl/extractor/globo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index ec451bb07..2a805cbb2 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -368,7 +368,8 @@ class GloboIE(InfoExtractor): signed_url = '%s?h=%s&k=%s' % (resource_url, signed_hash, 'flash') if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'): m3u8_formats = self._extract_m3u8_formats( - signed_url, resource_id, 'mp4', m3u8_id='hls', fatal=False) + signed_url, resource_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) if m3u8_formats: formats.extend(m3u8_formats) else: -- cgit v1.2.3 From b4ef6a0038657c1adde565df947e42ad1e1b4195 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 17:01:27 +0600 Subject: [globo] Remove non available test --- youtube_dl/extractor/globo.py | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 2a805cbb2..8aada01dc 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -26,18 +26,6 @@ class GloboIE(InfoExtractor): _RESIGN_EXPIRATION = 86400 _TESTS = [{ - 'url': 'http://globotv.globo.com/sportv/futebol-nacional/v/os-gols-de-atletico-mg-3-x-2-santos-pela-24a-rodada-do-brasileirao/3654973/', - 'md5': '03ebf41cb7ade43581608b7d9b71fab0', - 'info_dict': { - 'id': '3654973', - 'ext': 'mp4', - 'title': 'Os gols de Atlético-MG 3 x 2 Santos pela 24ª rodada do Brasileirão', - 'duration': 251.585, - 'uploader': 'SporTV', - 'uploader_id': 698, - 'like_count': int, - } - }, { 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', 'md5': 'b3ccc801f75cd04a914d51dadb83a78d', 'info_dict': { -- cgit v1.2.3 From aebb42d32b608eaffb424e5e7c22f1b68a491e3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 17:01:55 +0600 Subject: [globo] Remove like count It's no longer provided --- youtube_dl/extractor/globo.py | 5 ----- 1 file changed, 5 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 8aada01dc..dc89e46ac 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -35,7 +35,6 @@ class GloboIE(InfoExtractor): 'duration': 103.204, 'uploader': 'Globo.com', 'uploader_id': 265, - 'like_count': int, } }, { 'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/', @@ -47,7 +46,6 @@ class GloboIE(InfoExtractor): 'duration': 1472.906, 'uploader': 'Canal Brasil', 'uploader_id': 705, - 'like_count': int, } }] @@ -370,7 +368,6 @@ class GloboIE(InfoExtractor): self._sort_formats(formats) duration = float_or_none(video.get('duration'), 1000) - like_count = int_or_none(video.get('likes')) uploader = video.get('channel') uploader_id = video.get('channel_id') @@ -380,7 +377,6 @@ class GloboIE(InfoExtractor): 'duration': duration, 'uploader': uploader, 'uploader_id': uploader_id, - 'like_count': like_count, 'formats': formats } @@ -406,7 +402,6 @@ class GloboArticleIE(InfoExtractor): 'duration': 110.711, 'uploader': 'Rede Globo', 'uploader_id': 196, - 'like_count': int, } } -- cgit v1.2.3 From a4a6b7b80f18680ee0a8bba50a24c58edd3f2a73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 17:03:45 +0600 Subject: [globo] Improve http formats --- youtube_dl/extractor/globo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index dc89e46ac..64622aa5c 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -361,8 +361,8 @@ class GloboIE(InfoExtractor): else: formats.append({ 'url': signed_url, - 'format_id': resource_id, - 'height': resource.get('height'), + 'format_id': 'http-%s' % resource_id, + 'height': int_or_none(resource.get('height')), }) self._sort_formats(formats) -- cgit v1.2.3 From 264cd00fff4f6d7063d43e1d476de46901bd9c5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 17:10:45 +0600 Subject: [globo] Update tests --- youtube_dl/extractor/globo.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 64622aa5c..0337256ed 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -35,18 +35,30 @@ class GloboIE(InfoExtractor): 'duration': 103.204, 'uploader': 'Globo.com', 'uploader_id': 265, - } + }, }, { - 'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/', - 'md5': 'c1defca721ce25b2354e927d3e4b3dec', + 'url': 'http://globoplay.globo.com/v/4581987/', + 'md5': 'f36a1ecd6a50da1577eee6dd17f67eff', 'info_dict': { - 'id': '3928201', + 'id': '4581987', 'ext': 'mp4', - 'title': 'Ator e diretor argentino, Ricado Darín fala sobre utopias e suas perdas', - 'duration': 1472.906, - 'uploader': 'Canal Brasil', - 'uploader_id': 705, - } + 'title': 'Acidentes de trânsito estão entre as maiores causas de queda de energia em SP', + 'duration': 137.973, + 'uploader': 'Rede Globo', + 'uploader_id': 196, + }, + }, { + 'url': 'http://canalbrasil.globo.com/programas/sangue-latino/videos/3928201.html', + 'only_matching': True, + }, { + 'url': 'http://globosatplay.globo.com/globonews/v/4472924/', + 'only_matching': True, + }, { + 'url': 'http://globotv.globo.com/t/programa/v/clipe-sexo-e-as-negas-adeus/3836166/', + 'only_matching': True, + }, { + 'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/', + 'only_matching': True, }] class MD5: -- cgit v1.2.3 From e7d34c03f200e178e9d6dfe4ae3f6856e382a4b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 17:12:42 +0600 Subject: [globo] Force uploader id to be string --- youtube_dl/extractor/globo.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 0337256ed..6c0fc54de 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -14,6 +14,7 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + str_or_none, ) @@ -34,7 +35,7 @@ class GloboIE(InfoExtractor): 'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa', 'duration': 103.204, 'uploader': 'Globo.com', - 'uploader_id': 265, + 'uploader_id': '265', }, }, { 'url': 'http://globoplay.globo.com/v/4581987/', @@ -45,7 +46,7 @@ class GloboIE(InfoExtractor): 'title': 'Acidentes de trânsito estão entre as maiores causas de queda de energia em SP', 'duration': 137.973, 'uploader': 'Rede Globo', - 'uploader_id': 196, + 'uploader_id': '196', }, }, { 'url': 'http://canalbrasil.globo.com/programas/sangue-latino/videos/3928201.html', @@ -381,7 +382,7 @@ class GloboIE(InfoExtractor): duration = float_or_none(video.get('duration'), 1000) uploader = video.get('channel') - uploader_id = video.get('channel_id') + uploader_id = str_or_none(video.get('channel_id')) return { 'id': video_id, -- cgit v1.2.3 From c13722480bebfb1fc33169516790df2e99b3e499 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 17:13:35 +0600 Subject: [globo:article] Fix test --- youtube_dl/extractor/globo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 6c0fc54de..5883be704 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -414,7 +414,7 @@ class GloboArticleIE(InfoExtractor): 'title': 'Receita Federal explica como vai fiscalizar bagagens de quem retorna ao Brasil de avião', 'duration': 110.711, 'uploader': 'Rede Globo', - 'uploader_id': 196, + 'uploader_id': '196', } } -- cgit v1.2.3 From 5d501a0901c36695c9d6ca3958ac4ccfdea90954 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 17:42:11 +0600 Subject: [globo] Add more tests --- youtube_dl/extractor/globo.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 5883be704..c65ef6bcf 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -60,6 +60,9 @@ class GloboIE(InfoExtractor): }, { 'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/', 'only_matching': True, + }, { + 'url': 'http://canaloff.globo.com/programas/desejar-profundo/videos/4518560.html', + 'only_matching': True, }] class MD5: @@ -405,7 +408,7 @@ class GloboArticleIE(InfoExtractor): r'<div[^>]+\bid=["\'](\d{7,})', ] - _TEST = { + _TESTS = [{ 'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html', 'md5': '307fdeae4390ccfe6ba1aa198cf6e72b', 'info_dict': { @@ -416,7 +419,13 @@ class GloboArticleIE(InfoExtractor): 'uploader': 'Rede Globo', 'uploader_id': '196', } - } + }, { + 'url': 'http://gq.globo.com/Prazeres/Poder/noticia/2015/10/all-o-desafio-assista-ao-segundo-capitulo-da-serie.html', + 'only_matching': True, + }, { + 'url': 'http://gshow.globo.com/programas/tv-xuxa/O-Programa/noticia/2014/01/xuxa-e-junno-namoram-muuuito-em-luau-de-zeze-di-camargo-e-luciano.html', + 'only_matching': True, + }] @classmethod def suitable(cls, url): -- cgit v1.2.3 From 17d1900581ffd12866e56640080ce340d99149a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 17:57:46 +0600 Subject: [vk] Fix view count extraction (Closes #7353) --- youtube_dl/extractor/vk.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 765e9e6fd..01960b827 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -281,9 +281,13 @@ class VKIE(InfoExtractor): mobj.group(1) + ' ' + mobj.group(2) upload_date = unified_strdate(mobj.group(1) + ' ' + mobj.group(2)) - view_count = str_to_int(self._search_regex( - r'"mv_views_count_number"[^>]*>([\d,.]+) views<', - info_page, 'view count', fatal=False)) + view_count = None + views = self._html_search_regex( + r'"mv_views_count_number"[^>]*>(.+?\bviews?)<', + info_page, 'view count', fatal=False) + if views: + view_count = str_to_int(self._search_regex( + r'([\d,.]+)', views, 'view count', fatal=False)) formats = [{ 'format_id': k, -- cgit v1.2.3 From cb5a470635ea2ad91f18d33e391979aabb0755fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 4 Nov 2015 16:18:51 +0100 Subject: [vimeo] Remove unused import --- youtube_dl/extractor/vimeo.py | 1 - 1 file changed, 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index b608740b8..ca716c8f5 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -8,7 +8,6 @@ import itertools from .common import InfoExtractor from ..compat import ( compat_HTTPError, - compat_urllib_parse, compat_urllib_request, compat_urlparse, ) -- cgit v1.2.3 From 44b2264feae331eeb34e83eed1387def3d61a437 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 22:12:24 +0600 Subject: [youtube] Prefer video_info with token available --- youtube_dl/extractor/youtube.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d7eda7aa7..5eeb3c663 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1107,6 +1107,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not video_info: video_info = get_video_info if 'token' in get_video_info: + if 'token' not in video_info: + video_info = get_video_info break if 'token' not in video_info: if 'reason' in video_info: -- cgit v1.2.3 From 89ea063eebae84792a7ccb968533ff8bf6a41d56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 22:49:23 +0600 Subject: [youtube] Clarify rationale for preferring a video info with token (#7362) --- youtube_dl/extractor/youtube.py | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5eeb3c663..e2a43299f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1107,6 +1107,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not video_info: video_info = get_video_info if 'token' in get_video_info: + # Different get_video_info requests may report different results, e.g. + # some may report video unavailability, but some may serve it without + # any complaint (see https://github.com/rg3/youtube-dl/issues/7362, + # the original webpage as well as el=info and el=embedded get_video_info + # requests report video unavailability due to geo restriction while + # el=detailpage succeeds and returns valid data). This is probably + # due to YouTube measures against IP ranges of hosting providers. + # Working around by preferring the first succeeded video_info containing + # the token if no such video_info yet was found. if 'token' not in video_info: video_info = get_video_info break -- cgit v1.2.3 From f93ded98522cc1272a8d2210738937132292afc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 5 Nov 2015 01:54:49 +0600 Subject: [prosiebensat1] Add support for .ch domains (Closes #7365) --- youtube_dl/extractor/prosiebensat1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index effcf1db3..baa54a3af 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -20,7 +20,7 @@ from ..utils import ( class ProSiebenSat1IE(InfoExtractor): IE_NAME = 'prosiebensat1' IE_DESC = 'ProSiebenSat.1 Digital' - _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at)|ran\.de|fem\.com)/(?P<id>.+)' + _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at|ch)|ran\.de|fem\.com)/(?P<id>.+)' _TESTS = [ { -- cgit v1.2.3 From b15c44cd36831f175e9dd4081b82beb8075790b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 5 Nov 2015 02:51:30 +0600 Subject: [periscope] Add support for videos with broadcast_id (Closes #7359) --- youtube_dl/extractor/periscope.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 8ad936758..0f9d7576f 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -27,9 +27,10 @@ class PeriscopeIE(InfoExtractor): 'skip': 'Expires in 24 hours', } - def _call_api(self, method, token): + def _call_api(self, method, value): + attribute = 'token' if len(value) > 13 else 'broadcast_id' return self._download_json( - 'https://api.periscope.tv/api/v2/%s?token=%s' % (method, token), token) + 'https://api.periscope.tv/api/v2/%s?%s=%s' % (method, attribute, value), value) def _real_extract(self, url): token = self._match_id(url) -- cgit v1.2.3 From 2549e113b8750a493917436d4fd15ed74a1a4983 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 5 Nov 2015 02:55:53 +0600 Subject: [periscope] Add test for broadcast_id based URL --- youtube_dl/extractor/periscope.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 0f9d7576f..7621d9e99 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -12,7 +12,7 @@ from ..utils import parse_iso8601 class PeriscopeIE(InfoExtractor): IE_DESC = 'Periscope' _VALID_URL = r'https?://(?:www\.)?periscope\.tv/w/(?P<id>[^/?#]+)' - _TEST = { + _TESTS = [{ 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', 'md5': '65b57957972e503fcbbaeed8f4fa04ca', 'info_dict': { @@ -25,7 +25,10 @@ class PeriscopeIE(InfoExtractor): 'uploader_id': '1465763', }, 'skip': 'Expires in 24 hours', - } + }, { + 'url': 'https://www.periscope.tv/w/1ZkKzPbMVggJv', + 'only_matching': True, + }] def _call_api(self, method, value): attribute = 'token' if len(value) > 13 else 'broadcast_id' -- cgit v1.2.3 From 53472df85793cc89deb779c2ffc3ae1f47292fd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 5 Nov 2015 02:56:44 +0600 Subject: [periscope] Add note on where to find alive example URLs --- youtube_dl/extractor/periscope.py | 1 + 1 file changed, 1 insertion(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 7621d9e99..887c8020d 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -12,6 +12,7 @@ from ..utils import parse_iso8601 class PeriscopeIE(InfoExtractor): IE_DESC = 'Periscope' _VALID_URL = r'https?://(?:www\.)?periscope\.tv/w/(?P<id>[^/?#]+)' + # Alive example URLs can be found here http://onperiscope.com/ _TESTS = [{ 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', 'md5': '65b57957972e503fcbbaeed8f4fa04ca', -- cgit v1.2.3 From 6953d8e95a78e83f087693b7353baab96b09fbdd Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 6 Nov 2015 02:09:55 +0100 Subject: [miomio] fix info extraction (fixes #7366) --- youtube_dl/extractor/miomio.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/miomio.py b/youtube_dl/extractor/miomio.py index a784fc5fb..3f812e005 100644 --- a/youtube_dl/extractor/miomio.py +++ b/youtube_dl/extractor/miomio.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import random from .common import InfoExtractor +from ..compat import compat_urllib_request from ..utils import ( xpath_text, int_or_none, @@ -60,10 +61,12 @@ class MioMioIE(InfoExtractor): 'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/xml.php?id=%s&r=%s' % (id, random.randint(100, 999)), video_id) - # the following xml contains the actual configuration information on the video file(s) - vid_config = self._download_xml( + vid_config_request = compat_urllib_request.Request( 'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/sina.php?{0}'.format(xml_config), - video_id) + headers={'Referer': 'http://www.miomio.tv/mioplayer/mioplayer-v3.0.swf'}) + + # the following xml contains the actual configuration information on the video file(s) + vid_config = self._download_xml(vid_config_request, video_id) http_headers = { 'Referer': 'http://www.miomio.tv%s' % mioplayer_path, -- cgit v1.2.3 From e68dd1921ad7528d225a8571066f99b9934b6a06 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 6 Nov 2015 06:33:05 +0100 Subject: [miomio] use the formats urls headers for downloading xml --- youtube_dl/extractor/miomio.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/miomio.py b/youtube_dl/extractor/miomio.py index 3f812e005..6f40bf1b9 100644 --- a/youtube_dl/extractor/miomio.py +++ b/youtube_dl/extractor/miomio.py @@ -52,6 +52,8 @@ class MioMioIE(InfoExtractor): mioplayer_path = self._search_regex( r'src="(/mioplayer/[^"]+)"', webpage, 'ref_path') + http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path,} + xml_config = self._search_regex( r'flashvars="type=(?:sina|video)&(.+?)&', webpage, 'xml config') @@ -63,15 +65,11 @@ class MioMioIE(InfoExtractor): vid_config_request = compat_urllib_request.Request( 'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/sina.php?{0}'.format(xml_config), - headers={'Referer': 'http://www.miomio.tv/mioplayer/mioplayer-v3.0.swf'}) + headers=http_headers) # the following xml contains the actual configuration information on the video file(s) vid_config = self._download_xml(vid_config_request, video_id) - http_headers = { - 'Referer': 'http://www.miomio.tv%s' % mioplayer_path, - } - if not int_or_none(xpath_text(vid_config, 'timelength')): raise ExtractorError('Unable to load videos!', expected=True) -- cgit v1.2.3 From 5003e4283b35acb82ea9793d91bc3cd1ee679f86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 21:06:44 +0600 Subject: [ndr] Relax _VALID_URL (Closes #7383) --- youtube_dl/extractor/ndr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index ba06d8a98..a2b51ccb3 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -23,7 +23,7 @@ class NDRBaseIE(InfoExtractor): class NDRIE(NDRBaseIE): IE_NAME = 'ndr' IE_DESC = 'NDR.de - Norddeutscher Rundfunk' - _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)+(?P<id>[^/?#]+),[\da-z]+\.html' + _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)*(?P<id>[^/?#]+),[\da-z]+\.html' _TESTS = [{ # httpVideo, same content id 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', -- cgit v1.2.3 From 01003d072c20c2ed095930d87c5ce3d5610e66b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 21:07:52 +0600 Subject: [ndr] Add test for #7383 --- youtube_dl/extractor/ndr.py | 3 +++ 1 file changed, 3 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index a2b51ccb3..0be866681 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -78,6 +78,9 @@ class NDRIE(NDRBaseIE): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html', + 'only_matching': True, }] def _extract_embed(self, webpage, display_id): -- cgit v1.2.3 From 1e2eb4b40d46f39d15c067657ecad16fa3b2121d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 21:08:21 +0600 Subject: [njoy] Relax _VALID_URL --- youtube_dl/extractor/ndr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 0be866681..7043c7e0f 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -105,7 +105,7 @@ class NDRIE(NDRBaseIE): class NJoyIE(NDRBaseIE): IE_NAME = 'njoy' IE_DESC = 'N-JOY' - _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)+(?:(?P<display_id>[^/?#]+),)?(?P<id>[\da-z]+)\.html' + _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)*(?:(?P<display_id>[^/?#]+),)?(?P<id>[\da-z]+)\.html' _TESTS = [{ # httpVideo, same content id 'url': 'http://www.n-joy.de/entertainment/comedy/comedy_contest/Benaissa-beim-NDR-Comedy-Contest,comedycontest2480.html', -- cgit v1.2.3 From 81413c01651eddcc5180af379f2ce3689a376051 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 21:08:52 +0600 Subject: [ndr:embed] Relax _VALID_URL --- youtube_dl/extractor/ndr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 7043c7e0f..477ce4e6b 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -238,7 +238,7 @@ class NDREmbedBaseIE(InfoExtractor): class NDREmbedIE(NDREmbedBaseIE): IE_NAME = 'ndr:embed' - _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)+(?P<id>[\da-z]+)-(?:player|externalPlayer)\.html' + _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:player|externalPlayer)\.html' _TESTS = [{ 'url': 'http://www.ndr.de/fernsehen/sendungen/ndr_aktuell/ndraktuell28488-player.html', 'md5': '8b9306142fe65bbdefb5ce24edb6b0a9', -- cgit v1.2.3 From 92366d189ef280b8ba0057930c54aa14b0ecdd24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 21:09:17 +0600 Subject: [njoy:embed] Relax _VALID_URL --- youtube_dl/extractor/ndr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 477ce4e6b..16213eed9 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -332,7 +332,7 @@ class NDREmbedIE(NDREmbedBaseIE): class NJoyEmbedIE(NDREmbedBaseIE): IE_NAME = 'njoy:embed' - _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)+(?P<id>[\da-z]+)-(?:player|externalPlayer)_[^/]+\.html' + _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:player|externalPlayer)_[^/]+\.html' _TESTS = [{ # httpVideo 'url': 'http://www.n-joy.de/events/reeperbahnfestival/doku948-player_image-bc168e87-5263-4d6d-bd27-bb643005a6de_theme-n-joy.html', -- cgit v1.2.3 From 179ffab69c3359ab7d0a7b0a2b63c94d8c70af67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 23:06:13 +0600 Subject: [lynda:course] Force log out (Closes #7361) --- youtube_dl/extractor/lynda.py | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 5c973e75c..67f2025de 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -82,6 +82,11 @@ class LyndaBaseIE(InfoExtractor): expected=True) raise ExtractorError('Unable to log in') + def _logout(self): + self._download_webpage( + 'http://www.lynda.com/ajax/logout.aspx', None, + 'Logging out', 'Unable to log out', fatal=False) + class LyndaIE(LyndaBaseIE): IE_NAME = 'lynda' @@ -210,6 +215,8 @@ class LyndaCourseIE(LyndaBaseIE): course_id, 'Downloading course JSON') course_json = json.loads(page) + self._logout() + if 'Status' in course_json and course_json['Status'] == 'NotFound': raise ExtractorError( 'Course %s does not exist' % course_id, expected=True) -- cgit v1.2.3 From 71bb016160744a80fecaadf5b75b0dc2b1e8089b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 23:10:07 +0600 Subject: [lynda:course] Modernize and make more robust --- youtube_dl/extractor/lynda.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 67f2025de..98474ded9 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -210,14 +210,13 @@ class LyndaCourseIE(LyndaBaseIE): course_path = mobj.group('coursepath') course_id = mobj.group('courseid') - page = self._download_webpage( + course = self._download_json( 'http://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id, course_id, 'Downloading course JSON') - course_json = json.loads(page) self._logout() - if 'Status' in course_json and course_json['Status'] == 'NotFound': + if course.get('Status') == 'NotFound': raise ExtractorError( 'Course %s does not exist' % course_id, expected=True) @@ -227,12 +226,14 @@ class LyndaCourseIE(LyndaBaseIE): # Might want to extract videos right here from video['Formats'] as it seems 'Formats' is not provided # by single video API anymore - for chapter in course_json['Chapters']: - for video in chapter['Videos']: - if video['HasAccess'] is False: + for chapter in course['Chapters']: + for video in chapter.get('Videos', []): + if video.get('HasAccess') is False: unaccessible_videos += 1 continue - videos.append(video['ID']) + video_id = video.get('ID') + if video_id: + videos.append(video_id) if unaccessible_videos > 0: self._downloader.report_warning( @@ -245,6 +246,6 @@ class LyndaCourseIE(LyndaBaseIE): 'Lynda') for video_id in videos] - course_title = course_json['Title'] + course_title = course.get('Title') return self.playlist_result(entries, course_id, course_title) -- cgit v1.2.3 From ea8ed40b2fb70fc2f01aba475128821078873d46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 23:24:39 +0600 Subject: [lynda] Modernize and make more robust --- youtube_dl/extractor/lynda.py | 54 ++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 29 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 98474ded9..c8a16842e 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -113,51 +113,47 @@ class LyndaIE(LyndaBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - page = self._download_webpage( + video = self._download_json( 'http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id, video_id, 'Downloading video JSON') - video_json = json.loads(page) - if 'Status' in video_json: + if 'Status' in video: raise ExtractorError( - 'lynda returned error: %s' % video_json['Message'], expected=True) + 'lynda returned error: %s' % video['Message'], expected=True) - if video_json['HasAccess'] is False: + if video.get('HasAccess') is False: self.raise_login_required('Video %s is only available for members' % video_id) - video_id = compat_str(video_json['ID']) - duration = video_json['DurationInSeconds'] - title = video_json['Title'] + video_id = compat_str(video.get('ID') or video_id) + duration = int_or_none(video.get('DurationInSeconds')) + title = video['Title'] formats = [] - fmts = video_json.get('Formats') + fmts = video.get('Formats') if fmts: - formats.extend([ - { - 'url': fmt['Url'], - 'ext': fmt['Extension'], - 'width': fmt['Width'], - 'height': fmt['Height'], - 'filesize': fmt['FileSize'], - 'format_id': str(fmt['Resolution']) - } for fmt in fmts]) - - prioritized_streams = video_json.get('PrioritizedStreams') + formats.extend([{ + 'url': f['Url'], + 'ext': f.get('Extension'), + 'width': int_or_none(f.get('Width')), + 'height': int_or_none(f.get('Height')), + 'filesize': int_or_none(f.get('FileSize')), + 'format_id': compat_str(f.get('Resolution')) if f.get('Resolution') else None, + } for f in fmts if f.get('Url')]) + + prioritized_streams = video.get('PrioritizedStreams') if prioritized_streams: for prioritized_stream_id, prioritized_stream in prioritized_streams.items(): - formats.extend([ - { - 'url': video_url, - 'width': int_or_none(format_id), - 'format_id': '%s-%s' % (prioritized_stream_id, format_id), - } for format_id, video_url in prioritized_stream.items() - ]) + formats.extend([{ + 'url': video_url, + 'width': int_or_none(format_id), + 'format_id': '%s-%s' % (prioritized_stream_id, format_id), + } for format_id, video_url in prioritized_stream.items()]) self._check_formats(formats, video_id) self._sort_formats(formats) - subtitles = self.extract_subtitles(video_id, page) + subtitles = self.extract_subtitles(video_id) return { 'id': video_id, @@ -188,7 +184,7 @@ class LyndaIE(LyndaBaseIE): if srt: return srt - def _get_subtitles(self, video_id, webpage): + def _get_subtitles(self, video_id): url = 'http://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id subs = self._download_json(url, None, False) if subs: -- cgit v1.2.3 From ae4ddf9efae816f4d52fc584c93e4f0e3c79c410 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 23:27:38 +0600 Subject: [lynda] PEP 8 --- youtube_dl/extractor/lynda.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index c8a16842e..9a207b2cd 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -227,9 +227,8 @@ class LyndaCourseIE(LyndaBaseIE): if video.get('HasAccess') is False: unaccessible_videos += 1 continue - video_id = video.get('ID') - if video_id: - videos.append(video_id) + if video.get('ID'): + videos.append(video['ID']) if unaccessible_videos > 0: self._downloader.report_warning( -- cgit v1.2.3 From 472404953a22811cc8156da110ea872a924f1f18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 23:28:14 +0600 Subject: [miomio] PEP 8 --- youtube_dl/extractor/miomio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/miomio.py b/youtube_dl/extractor/miomio.py index 6f40bf1b9..ce391c759 100644 --- a/youtube_dl/extractor/miomio.py +++ b/youtube_dl/extractor/miomio.py @@ -52,7 +52,7 @@ class MioMioIE(InfoExtractor): mioplayer_path = self._search_regex( r'src="(/mioplayer/[^"]+)"', webpage, 'ref_path') - http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path,} + http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path} xml_config = self._search_regex( r'flashvars="type=(?:sina|video)&(.+?)&', -- cgit v1.2.3 From 0fa6b17dccd2347cb0611651fc04e36839d33a4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 23:45:26 +0600 Subject: [pbs] Simplify and speed up player URL search --- youtube_dl/extractor/pbs.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 3448736a2..7b868d057 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -191,9 +191,13 @@ class PBSIE(InfoExtractor): if media_id: return media_id, presumptive_id, upload_date - url = self._search_regex( - r'(?s)<iframe[^>]+?(?:[a-z-]+?=["\'].*?["\'][^>]+?)*?\bsrc=["\']([^\'"]+partnerplayer[^\'"]+)["\']', - webpage, 'player URL') + for iframe in re.findall(r'(?s)<iframe(.+?)></iframe>', webpage): + url = self._search_regex( + r'src=(["\'])(?P<url>.+?partnerplayer.+?)\1', iframe, + 'player URL', default=None, group='url') + if url: + break + mobj = re.match(self._VALID_URL, url) player_id = mobj.group('player_id') -- cgit v1.2.3 From 686f98816ecbbcb224d1336682688b05cdb051a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 7 Nov 2015 00:39:16 +0600 Subject: [pbs] Add support for flp frontlines (Closes #7369) --- youtube_dl/extractor/pbs.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 7b868d057..3169e9c3f 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -8,6 +8,7 @@ from ..utils import ( ExtractorError, determine_ext, int_or_none, + strip_jsonp, unified_strdate, US_RATINGS, ) @@ -191,6 +192,23 @@ class PBSIE(InfoExtractor): if media_id: return media_id, presumptive_id, upload_date + # Fronline video embedded via flp + video_id = self._search_regex( + r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid') + if video_id: + # pkg_id calculation is reverse engineered from + # http://www.pbs.org/wgbh/pages/frontline/js/flp2012.js + prg_id = self._search_regex( + r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid')[7:] + if 'q' in prg_id: + prg_id = prg_id.split('q')[1] + prg_id = int(prg_id, 16) + getdir = self._download_json( + 'http://www.pbs.org/wgbh/pages/frontline/.json/getdir/getdir%d.json' % prg_id, + presumptive_id, 'Downloading getdir JSON', + transform_source=strip_jsonp) + return getdir['mid'], presumptive_id, upload_date + for iframe in re.findall(r'(?s)<iframe(.+?)></iframe>', webpage): url = self._search_regex( r'src=(["\'])(?P<url>.+?partnerplayer.+?)\1', iframe, -- cgit v1.2.3 From 8b6d9406db1d3361b006016e6aace54b05cb6fea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 7 Nov 2015 00:42:30 +0600 Subject: [pbs] Add test for flp frontline embeds --- youtube_dl/extractor/pbs.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 3169e9c3f..a690f9c29 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -154,6 +154,22 @@ class PBSIE(InfoExtractor): 'params': { 'skip_download': True, # requires ffmpeg }, + }, + { + # Frontline video embedded via flp2012.js + 'url': 'http://www.pbs.org/wgbh/pages/frontline/the-atomic-artists', + 'info_dict': { + 'id': '2070868960', + 'display_id': 'the-atomic-artists', + 'ext': 'mp4', + 'title': 'FRONTLINE - The Atomic Artists', + 'description': 'md5:f5bfbefadf421e8bb8647602011caf8e', + 'duration': 723, + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, } ] _ERRORS = { -- cgit v1.2.3 From 21d0c33ecde573db961b97f5f0c37ba9d3c02ff3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 7 Nov 2015 01:08:40 +0600 Subject: [pbs] Make flp embed lookup non fatal --- youtube_dl/extractor/pbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index a690f9c29..8fb9b1849 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -210,7 +210,7 @@ class PBSIE(InfoExtractor): # Fronline video embedded via flp video_id = self._search_regex( - r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid') + r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid', default=None) if video_id: # pkg_id calculation is reverse engineered from # http://www.pbs.org/wgbh/pages/frontline/js/flp2012.js -- cgit v1.2.3 From ee223abb88263bdda2d92c4b2139d1dca60ba3ae Mon Sep 17 00:00:00 2001 From: Mister Hat <misterhat144@gmail.com> Date: Tue, 3 Nov 2015 19:13:27 -0600 Subject: [vidzi] fixed. finds url from hash and host in script Closes #7386. --- youtube_dl/extractor/vidzi.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index 08a5a7b8d..2ba9f31df 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -20,8 +20,14 @@ class VidziIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'{\s*file\s*:\s*"([^"]+)"\s*}', webpage, 'video url') + video_host = self._html_search_regex( + r'id=\'vplayer\'><img src="http://(.*?)/i', webpage, + 'video host') + video_hash = self._html_search_regex( + r'\|([a-z0-9]+)\|hls\|type', webpage, 'video_hash') + ext = self._html_search_regex( + r'\|tracks\|([a-z0-9]+)\|', webpage, 'video ext') + video_url = 'http://' + video_host + '/' + video_hash + '/v.' + ext title = self._html_search_regex( r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title') -- cgit v1.2.3 From 5d0f84d32cc038dd71673987cb6efaa85e953474 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 7 Nov 2015 06:23:00 +0600 Subject: [beeg] Skip empty URLs (Closes #7392) --- youtube_dl/extractor/beeg.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index e6c928699..61bc2f744 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -33,6 +33,8 @@ class BeegIE(InfoExtractor): formats = [] for format_id, video_url in video.items(): + if not video_url: + continue height = self._search_regex( r'^(\d+)[pP]$', format_id, 'height', default=None) if not height: -- cgit v1.2.3 From 5214f1e31d5e5ba692fb1ed4803ff71ef4e480e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 7 Nov 2015 19:25:59 +0600 Subject: [crunchyroll] Fix title extraction (Closes #7396) --- youtube_dl/extractor/crunchyroll.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 0c9b8ca02..4243f3e2e 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -287,7 +287,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text if 'To view this, please log in to verify you are 18 or older.' in webpage: self.raise_login_required() - video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, 'video_title', flags=re.DOTALL) + video_title = self._html_search_regex( + r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>', + webpage, 'video_title') video_title = re.sub(r' {2,}', ' ', video_title) video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='') if not video_description: -- cgit v1.2.3 From 2c740cf28d257d2a915195e7cc60f83e6690d2cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 7 Nov 2015 19:29:09 +0600 Subject: [crunchyroll] Simplify description extraction --- youtube_dl/extractor/crunchyroll.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 4243f3e2e..9aa5d58b4 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -291,9 +291,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>', webpage, 'video_title') video_title = re.sub(r' {2,}', ' ', video_title) - video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='') - if not video_description: - video_description = None + video_description = self._html_search_regex( + r'"description":"([^"]+)', webpage, 'video_description', default=None) video_upload_date = self._html_search_regex( [r'<div>Availability for free users:(.+?)</div>', r'<div>[^<>]+<span>\s*(.+?\d{4})\s*</span></div>'], webpage, 'video_upload_date', fatal=False, flags=re.DOTALL) -- cgit v1.2.3 From 6d02b9a392d39c114d3fb58bf7965f62196ccecd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 7 Nov 2015 20:02:39 +0600 Subject: [crunchyroll] Fix description extraction --- youtube_dl/extractor/crunchyroll.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 9aa5d58b4..6e5999c72 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -21,6 +21,7 @@ from ..utils import ( bytes_to_intlist, intlist_to_bytes, int_or_none, + lowercase_escape, remove_end, unified_strdate, urlencode_postdata, @@ -104,7 +105,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'id': '589804', 'ext': 'flv', 'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11', - 'description': 'md5:fe2743efedb49d279552926d0bd0cd9e', + 'description': 'md5:2fbc01f90b87e8e9137296f37b461c12', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'Danny Choo Network', 'upload_date': '20120213', @@ -292,7 +293,10 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text webpage, 'video_title') video_title = re.sub(r' {2,}', ' ', video_title) video_description = self._html_search_regex( - r'"description":"([^"]+)', webpage, 'video_description', default=None) + r'<script[^>]*>\s*.+?\[media_id=%s\].+?"description"\s*:\s*"([^"]+)' % video_id, + webpage, 'description', default=None) + if video_description: + video_description = lowercase_escape(video_description.replace(r'\r\n', '\n')) video_upload_date = self._html_search_regex( [r'<div>Availability for free users:(.+?)</div>', r'<div>[^<>]+<span>\s*(.+?\d{4})\s*</span></div>'], webpage, 'video_upload_date', fatal=False, flags=re.DOTALL) -- cgit v1.2.3 From cff551c0b0ed8eb55c1ab63ec669c07a51aa4998 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 7 Nov 2015 18:43:22 +0100 Subject: [googleplus] Fix extraction of formats --- youtube_dl/extractor/googleplus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index fcefe54cd..731bacd67 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -61,7 +61,7 @@ class GooglePlusIE(InfoExtractor): 'width': int(width), 'height': int(height), } for width, height, video_url in re.findall( - r'\d+,(\d+),(\d+),"(https?://redirector\.googlevideo\.com.*?)"', webpage)] + r'\d+,(\d+),(\d+),"(https?://[^.]+\.googleusercontent.com.*?)"', webpage)] self._sort_formats(formats) return { -- cgit v1.2.3 From ee4337d100f68bbb2ae795101d4c391b522ec753 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 6 Nov 2015 20:16:14 +0100 Subject: [videolecture] add support for multi part videos --- youtube_dl/extractor/videolecturesnet.py | 95 +++++++++++++++++++++++--------- 1 file changed, 70 insertions(+), 25 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py index 649ac9433..351706362 100644 --- a/youtube_dl/extractor/videolecturesnet.py +++ b/youtube_dl/extractor/videolecturesnet.py @@ -10,17 +10,19 @@ from ..compat import ( from ..utils import ( ExtractorError, parse_duration, + js_to_json, + parse_iso8601, ) class VideoLecturesNetIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)/*(?:[#?].*)?$' + _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?' IE_NAME = 'videolectures.net' _TESTS = [{ 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/', 'info_dict': { - 'id': 'promogram_igor_mekjavic_eng', + 'id': '20171_part1', 'ext': 'mp4', 'title': 'Automatics, robotics and biocybernetics', 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', @@ -32,7 +34,7 @@ class VideoLecturesNetIE(InfoExtractor): # video with invalid direct format links (HTTP 403) 'url': 'http://videolectures.net/russir2010_filippova_nlp/', 'info_dict': { - 'id': 'russir2010_filippova_nlp', + 'id': '14891_part1', 'ext': 'flv', 'title': 'NLP at Google', 'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3', @@ -46,37 +48,80 @@ class VideoLecturesNetIE(InfoExtractor): }, { 'url': 'http://videolectures.net/deeplearning2015_montreal/', 'info_dict': { - 'id': 'deeplearning2015_montreal', + 'id': '23181', 'title': 'Deep Learning Summer School, Montreal 2015', - 'description': 'md5:90121a40cc6926df1bf04dcd8563ed3b', + 'description': 'md5:0533a85e4bd918df52a01f0e1ebe87b7', + 'timestamp': 1438560000, }, 'playlist_count': 30, + }, { + # multi part lecture + 'url': 'http://videolectures.net/mlss09uk_bishop_ibi/', + 'info_dict': { + 'id': '9737', + 'title': 'Introduction To Bayesian Inference', + 'timestamp': 1251622800, + }, + 'playlist': [{ + 'info_dict': { + 'id': '9737_part1', + 'ext': 'wmv', + 'title': 'Introduction To Bayesian Inference', + }, + }, { + 'info_dict': { + 'id': '9737_part2', + 'ext': 'wmv', + 'title': 'Introduction To Bayesian Inference', + }, + }], + 'playlist_count': 2, }] def _real_extract(self, url): - video_id = self._match_id(url) + lecture_slug, part = re.match(self._VALID_URL, url).groups() + + webpage = self._download_webpage(url, lecture_slug) - smil_url = 'http://videolectures.net/%s/video/1/smil.xml' % video_id + cfg = self._parse_json(self._search_regex(r'cfg\s*:\s*({[^}]+})', webpage, 'cfg'), lecture_slug, js_to_json) - try: - smil = self._download_smil(smil_url, video_id) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: - # Probably a playlist - webpage = self._download_webpage(url, video_id) - entries = [ - self.url_result(compat_urlparse.urljoin(url, video_url), 'VideoLecturesNet') - for _, video_url in re.findall(r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', webpage)] - playlist_title = self._html_search_meta('title', webpage, 'title', fatal=True) - playlist_description = self._html_search_meta('description', webpage, 'description') - return self.playlist_result(entries, video_id, playlist_title, playlist_description) + lecture_id = str(cfg['obj_id']) - info = self._parse_smil(smil, smil_url, video_id) + lecture_data = self._download_json('%s/site/api/lecture/%s?format=json' % (self._proto_relative_url(cfg['livepipe'], 'http:'), lecture_id), lecture_id)['lecture'][0] - info['id'] = video_id + lecture_info = { + 'id': lecture_id, + 'display_id': lecture_slug, + 'title': lecture_data['title'], + 'timestamp': parse_iso8601(lecture_data.get('time')), + 'description': lecture_data.get('description_wiki'), + 'thumbnail': lecture_data.get('thumb'), + } - switch = smil.find('.//switch') - if switch is not None: - info['duration'] = parse_duration(switch.attrib.get('dur')) + entries = [] + parts = cfg.get('videos') + if parts: + if len(parts) == 1: + part = str(parts[0]) + if part: + smil_url = 'http://videolectures.net/%s/video/%s/smil.xml' % (lecture_slug, part) + smil = self._download_smil(smil_url, lecture_id) + info = self._parse_smil(smil, smil_url, lecture_id) + info['id'] = '%s_part%s' % (lecture_id, part) + switch = smil.find('.//switch') + if switch is not None: + info['duration'] = parse_duration(switch.attrib.get('dur')) + return info + else: + for part in parts: + entries.append(self.url_result('http://videolectures.net/%s/video/%s' % (lecture_slug, part), 'VideoLecturesNet')) + lecture_info['_type'] = 'multi_video' + else: + # Probably a playlist + entries = [ + self.url_result(compat_urlparse.urljoin(url, video_url), 'VideoLecturesNet') + for _, video_url in re.findall(r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', webpage)] + lecture_info['_type'] = 'playlist' - return info + lecture_info['entries'] = entries + return lecture_info -- cgit v1.2.3 From a06bf87a2c6009d82ec28afe566f653b3deb11bf Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 6 Nov 2015 21:23:41 +0100 Subject: [viidea] add support for sites using viidea service --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/videolecturesnet.py | 127 --------------------------- youtube_dl/extractor/viidea.py | 144 +++++++++++++++++++++++++++++++ 3 files changed, 145 insertions(+), 128 deletions(-) delete mode 100644 youtube_dl/extractor/videolecturesnet.py create mode 100644 youtube_dl/extractor/viidea.py (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 94150a28f..0a90da73c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -724,7 +724,6 @@ from .vh1 import VH1IE from .vice import ViceIE from .viddler import ViddlerIE from .videodetective import VideoDetectiveIE -from .videolecturesnet import VideoLecturesNetIE from .videofyme import VideofyMeIE from .videomega import VideoMegaIE from .videopremium import VideoPremiumIE @@ -734,6 +733,7 @@ from .vidme import VidmeIE from .vidzi import VidziIE from .vier import VierIE, VierVideosIE from .viewster import ViewsterIE +from .viidea import ViideaIE from .vimeo import ( VimeoIE, VimeoAlbumIE, diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py deleted file mode 100644 index 351706362..000000000 --- a/youtube_dl/extractor/videolecturesnet.py +++ /dev/null @@ -1,127 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_urlparse, -) -from ..utils import ( - ExtractorError, - parse_duration, - js_to_json, - parse_iso8601, -) - - -class VideoLecturesNetIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?' - IE_NAME = 'videolectures.net' - - _TESTS = [{ - 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/', - 'info_dict': { - 'id': '20171_part1', - 'ext': 'mp4', - 'title': 'Automatics, robotics and biocybernetics', - 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', - 'upload_date': '20130627', - 'duration': 565, - 'thumbnail': 're:http://.*\.jpg', - }, - }, { - # video with invalid direct format links (HTTP 403) - 'url': 'http://videolectures.net/russir2010_filippova_nlp/', - 'info_dict': { - 'id': '14891_part1', - 'ext': 'flv', - 'title': 'NLP at Google', - 'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3', - 'duration': 5352, - 'thumbnail': 're:http://.*\.jpg', - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - 'url': 'http://videolectures.net/deeplearning2015_montreal/', - 'info_dict': { - 'id': '23181', - 'title': 'Deep Learning Summer School, Montreal 2015', - 'description': 'md5:0533a85e4bd918df52a01f0e1ebe87b7', - 'timestamp': 1438560000, - }, - 'playlist_count': 30, - }, { - # multi part lecture - 'url': 'http://videolectures.net/mlss09uk_bishop_ibi/', - 'info_dict': { - 'id': '9737', - 'title': 'Introduction To Bayesian Inference', - 'timestamp': 1251622800, - }, - 'playlist': [{ - 'info_dict': { - 'id': '9737_part1', - 'ext': 'wmv', - 'title': 'Introduction To Bayesian Inference', - }, - }, { - 'info_dict': { - 'id': '9737_part2', - 'ext': 'wmv', - 'title': 'Introduction To Bayesian Inference', - }, - }], - 'playlist_count': 2, - }] - - def _real_extract(self, url): - lecture_slug, part = re.match(self._VALID_URL, url).groups() - - webpage = self._download_webpage(url, lecture_slug) - - cfg = self._parse_json(self._search_regex(r'cfg\s*:\s*({[^}]+})', webpage, 'cfg'), lecture_slug, js_to_json) - - lecture_id = str(cfg['obj_id']) - - lecture_data = self._download_json('%s/site/api/lecture/%s?format=json' % (self._proto_relative_url(cfg['livepipe'], 'http:'), lecture_id), lecture_id)['lecture'][0] - - lecture_info = { - 'id': lecture_id, - 'display_id': lecture_slug, - 'title': lecture_data['title'], - 'timestamp': parse_iso8601(lecture_data.get('time')), - 'description': lecture_data.get('description_wiki'), - 'thumbnail': lecture_data.get('thumb'), - } - - entries = [] - parts = cfg.get('videos') - if parts: - if len(parts) == 1: - part = str(parts[0]) - if part: - smil_url = 'http://videolectures.net/%s/video/%s/smil.xml' % (lecture_slug, part) - smil = self._download_smil(smil_url, lecture_id) - info = self._parse_smil(smil, smil_url, lecture_id) - info['id'] = '%s_part%s' % (lecture_id, part) - switch = smil.find('.//switch') - if switch is not None: - info['duration'] = parse_duration(switch.attrib.get('dur')) - return info - else: - for part in parts: - entries.append(self.url_result('http://videolectures.net/%s/video/%s' % (lecture_slug, part), 'VideoLecturesNet')) - lecture_info['_type'] = 'multi_video' - else: - # Probably a playlist - entries = [ - self.url_result(compat_urlparse.urljoin(url, video_url), 'VideoLecturesNet') - for _, video_url in re.findall(r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', webpage)] - lecture_info['_type'] = 'playlist' - - lecture_info['entries'] = entries - return lecture_info diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py new file mode 100644 index 000000000..71fb298e6 --- /dev/null +++ b/youtube_dl/extractor/viidea.py @@ -0,0 +1,144 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + parse_duration, + js_to_json, + parse_iso8601, +) + + +class ViideaIE(InfoExtractor): + _VALID_URL = r'''(?x)http://(?:www\.)?(?: + videolectures\.net| + flexilearn\.viidea\.net| + presentations\.ocwconsortium\.org| + video\.travel-zoom\.si| + video\.pomp-forum\.si| + tv\.nil\.si| + video\.hekovnik.com| + video\.szko\.si| + kpk\.viidea\.com| + inside\.viidea\.net| + video\.kiberpipa\.org| + bvvideo\.si| + kongres\.viidea\.net| + edemokracija\.viidea\.com + )(?:/lecture)?/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?''' + + _TESTS = [{ + 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/', + 'info_dict': { + 'id': '20171_part1', + 'ext': 'mp4', + 'title': 'Automatics, robotics and biocybernetics', + 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', + 'upload_date': '20130627', + 'duration': 565, + 'thumbnail': 're:http://.*\.jpg', + }, + }, { + # video with invalid direct format links (HTTP 403) + 'url': 'http://videolectures.net/russir2010_filippova_nlp/', + 'info_dict': { + 'id': '14891_part1', + 'ext': 'flv', + 'title': 'NLP at Google', + 'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3', + 'duration': 5352, + 'thumbnail': 're:http://.*\.jpg', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + 'url': 'http://videolectures.net/deeplearning2015_montreal/', + 'info_dict': { + 'id': '23181', + 'title': 'Deep Learning Summer School, Montreal 2015', + 'description': 'md5:0533a85e4bd918df52a01f0e1ebe87b7', + 'timestamp': 1438560000, + }, + 'playlist_count': 30, + }, { + # multi part lecture + 'url': 'http://videolectures.net/mlss09uk_bishop_ibi/', + 'info_dict': { + 'id': '9737', + 'title': 'Introduction To Bayesian Inference', + 'timestamp': 1251622800, + }, + 'playlist': [{ + 'info_dict': { + 'id': '9737_part1', + 'ext': 'wmv', + 'title': 'Introduction To Bayesian Inference', + }, + }, { + 'info_dict': { + 'id': '9737_part2', + 'ext': 'wmv', + 'title': 'Introduction To Bayesian Inference', + }, + }], + 'playlist_count': 2, + }] + + def _real_extract(self, url): + lecture_slug, part = re.match(self._VALID_URL, url).groups() + + webpage = self._download_webpage(url, lecture_slug) + + cfg = self._parse_json(self._search_regex(r'cfg\s*:\s*({[^}]+})', webpage, 'cfg'), lecture_slug, js_to_json) + + lecture_id = str(cfg['obj_id']) + + base_url = self._proto_relative_url(cfg['livepipe'], 'http:') + + lecture_data = self._download_json('%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), lecture_id)['lecture'][0] + + lecture_info = { + 'id': lecture_id, + 'display_id': lecture_slug, + 'title': lecture_data['title'], + 'timestamp': parse_iso8601(lecture_data.get('time')), + 'description': lecture_data.get('description_wiki'), + 'thumbnail': lecture_data.get('thumb'), + } + + entries = [] + parts = cfg.get('videos') + if parts: + if len(parts) == 1: + part = str(parts[0]) + if part: + smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part) + smil = self._download_smil(smil_url, lecture_id) + info = self._parse_smil(smil, smil_url, lecture_id) + info['id'] = '%s_part%s' % (lecture_id, part) + switch = smil.find('.//switch') + if switch is not None: + info['duration'] = parse_duration(switch.attrib.get('dur')) + return info + else: + for part in parts: + entries.append(self.url_result('%s/video/%s' % (base_url, lecture_id, part), 'Viidea')) + lecture_info['_type'] = 'multi_video' + else: + # Probably a playlist + playlist_webpage = self._download_webpage('%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id) + entries = [ + self.url_result(compat_urlparse.urljoin(url, video_url), 'Viidea') + for _, video_url in re.findall(r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', playlist_webpage)] + lecture_info['_type'] = 'playlist' + + lecture_info['entries'] = entries + return lecture_info -- cgit v1.2.3 From 8e3a2bd6200660f9fb9d485b1c924fa5462bd566 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 7 Nov 2015 17:43:23 +0100 Subject: [viidea] fix _VALID_URL regex and tests --- youtube_dl/extractor/viidea.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py index 71fb298e6..ae9a42737 100644 --- a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -31,7 +31,7 @@ class ViideaIE(InfoExtractor): bvvideo\.si| kongres\.viidea\.net| edemokracija\.viidea\.com - )(?:/lecture)?/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?''' + )(?:/lecture)?/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?/*(?:[#?].*)?$''' _TESTS = [{ 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/', @@ -130,7 +130,7 @@ class ViideaIE(InfoExtractor): return info else: for part in parts: - entries.append(self.url_result('%s/video/%s' % (base_url, lecture_id, part), 'Viidea')) + entries.append(self.url_result('%s/%s/video/%s' % (base_url, lecture_slug, part), 'Viidea')) lecture_info['_type'] = 'multi_video' else: # Probably a playlist -- cgit v1.2.3 From 6fdb39ded15c6276b49fa67cb517bf1fed63af35 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 7 Nov 2015 20:38:33 +0100 Subject: [viidia] Cleaup [viidea] extract playlist if lecture is an event [viidia] use compat_str --- youtube_dl/extractor/viidea.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py index ae9a42737..2541a36ed 100644 --- a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -4,11 +4,10 @@ import re from .common import InfoExtractor from ..compat import ( - compat_HTTPError, compat_urlparse, + compat_str, ) from ..utils import ( - ExtractorError, parse_duration, js_to_json, parse_iso8601, @@ -97,9 +96,9 @@ class ViideaIE(InfoExtractor): webpage = self._download_webpage(url, lecture_slug) - cfg = self._parse_json(self._search_regex(r'cfg\s*:\s*({[^}]+})', webpage, 'cfg'), lecture_slug, js_to_json) + cfg = self._parse_json(self._search_regex([r'cfg\s*:\s*({.+?}),[\da-zA-Z_]:\(?function', r'cfg\s*:\s*({[^}]+})'], webpage, 'cfg'), lecture_slug, js_to_json) - lecture_id = str(cfg['obj_id']) + lecture_id = compat_str(cfg['obj_id']) base_url = self._proto_relative_url(cfg['livepipe'], 'http:') @@ -118,7 +117,7 @@ class ViideaIE(InfoExtractor): parts = cfg.get('videos') if parts: if len(parts) == 1: - part = str(parts[0]) + part = compat_str(parts[0]) if part: smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part) smil = self._download_smil(smil_url, lecture_id) @@ -132,7 +131,7 @@ class ViideaIE(InfoExtractor): for part in parts: entries.append(self.url_result('%s/%s/video/%s' % (base_url, lecture_slug, part), 'Viidea')) lecture_info['_type'] = 'multi_video' - else: + if not parts or lecture_data.get('type') == 'evt': # Probably a playlist playlist_webpage = self._download_webpage('%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id) entries = [ -- cgit v1.2.3 From e8ce2375e0851e65c4882002297404825fe1045e Mon Sep 17 00:00:00 2001 From: Sergey M? <dstftw@gmail.com> Date: Sun, 8 Nov 2015 06:54:27 +0600 Subject: [viidea] Improve and cleanup (Closes #7390) * Optimize requests for multipart videos * Fix cfg regex * Improve titles and identifiers --- youtube_dl/extractor/viidea.py | 99 ++++++++++++++++++++++++++++++------------ 1 file changed, 72 insertions(+), 27 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py index 2541a36ed..525e303d4 100644 --- a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -35,35 +35,42 @@ class ViideaIE(InfoExtractor): _TESTS = [{ 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/', 'info_dict': { - 'id': '20171_part1', + 'id': '20171', + 'display_id': 'promogram_igor_mekjavic_eng', 'ext': 'mp4', 'title': 'Automatics, robotics and biocybernetics', 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', + 'thumbnail': 're:http://.*\.jpg', + 'timestamp': 1372349289, 'upload_date': '20130627', 'duration': 565, - 'thumbnail': 're:http://.*\.jpg', }, }, { # video with invalid direct format links (HTTP 403) 'url': 'http://videolectures.net/russir2010_filippova_nlp/', 'info_dict': { - 'id': '14891_part1', + 'id': '14891', + 'display_id': 'russir2010_filippova_nlp', 'ext': 'flv', 'title': 'NLP at Google', 'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3', - 'duration': 5352, 'thumbnail': 're:http://.*\.jpg', + 'timestamp': 1284375600, + 'upload_date': '20100913', + 'duration': 5352, }, 'params': { # rtmp download 'skip_download': True, }, }, { + # event playlist 'url': 'http://videolectures.net/deeplearning2015_montreal/', 'info_dict': { 'id': '23181', 'title': 'Deep Learning Summer School, Montreal 2015', 'description': 'md5:0533a85e4bd918df52a01f0e1ebe87b7', + 'thumbnail': 're:http://.*\.jpg', 'timestamp': 1438560000, }, 'playlist_count': 30, @@ -72,37 +79,54 @@ class ViideaIE(InfoExtractor): 'url': 'http://videolectures.net/mlss09uk_bishop_ibi/', 'info_dict': { 'id': '9737', + 'display_id': 'mlss09uk_bishop_ibi', 'title': 'Introduction To Bayesian Inference', + 'thumbnail': 're:http://.*\.jpg', 'timestamp': 1251622800, }, 'playlist': [{ 'info_dict': { 'id': '9737_part1', + 'display_id': 'mlss09uk_bishop_ibi_part1', 'ext': 'wmv', - 'title': 'Introduction To Bayesian Inference', + 'title': 'Introduction To Bayesian Inference (Part 1)', + 'thumbnail': 're:http://.*\.jpg', + 'duration': 4622, + 'timestamp': 1251622800, + 'upload_date': '20090830', }, }, { 'info_dict': { 'id': '9737_part2', + 'display_id': 'mlss09uk_bishop_ibi_part2', 'ext': 'wmv', - 'title': 'Introduction To Bayesian Inference', + 'title': 'Introduction To Bayesian Inference (Part 2)', + 'thumbnail': 're:http://.*\.jpg', + 'duration': 5641, + 'timestamp': 1251622800, + 'upload_date': '20090830', }, }], 'playlist_count': 2, }] def _real_extract(self, url): - lecture_slug, part = re.match(self._VALID_URL, url).groups() + lecture_slug, explicit_part_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, lecture_slug) - cfg = self._parse_json(self._search_regex([r'cfg\s*:\s*({.+?}),[\da-zA-Z_]:\(?function', r'cfg\s*:\s*({[^}]+})'], webpage, 'cfg'), lecture_slug, js_to_json) + cfg = self._parse_json(self._search_regex( + [r'cfg\s*:\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*:\s*\(?\s*function', + r'cfg\s*:\s*({[^}]+})'], + webpage, 'cfg'), lecture_slug, js_to_json) lecture_id = compat_str(cfg['obj_id']) base_url = self._proto_relative_url(cfg['livepipe'], 'http:') - lecture_data = self._download_json('%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), lecture_id)['lecture'][0] + lecture_data = self._download_json( + '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), + lecture_id)['lecture'][0] lecture_info = { 'id': lecture_id, @@ -113,31 +137,52 @@ class ViideaIE(InfoExtractor): 'thumbnail': lecture_data.get('thumb'), } - entries = [] - parts = cfg.get('videos') + playlist_entries = [] + lecture_type = lecture_data.get('type') + parts = [compat_str(video) for video in cfg.get('videos', [])] if parts: - if len(parts) == 1: - part = compat_str(parts[0]) - if part: - smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part) + multipart = len(parts) > 1 + + def extract_part(part_id): + smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part_id) smil = self._download_smil(smil_url, lecture_id) info = self._parse_smil(smil, smil_url, lecture_id) - info['id'] = '%s_part%s' % (lecture_id, part) + info['id'] = lecture_id if not multipart else '%s_part%s' % (lecture_id, part_id) + info['display_id'] = lecture_slug if not multipart else '%s_part%s' % (lecture_slug, part_id) + if multipart: + info['title'] += ' (Part %s)' % part_id switch = smil.find('.//switch') if switch is not None: info['duration'] = parse_duration(switch.attrib.get('dur')) - return info + item_info = lecture_info.copy() + item_info.update(info) + return item_info + + if explicit_part_id or not multipart: + result = extract_part(explicit_part_id or parts[0]) else: - for part in parts: - entries.append(self.url_result('%s/%s/video/%s' % (base_url, lecture_slug, part), 'Viidea')) - lecture_info['_type'] = 'multi_video' - if not parts or lecture_data.get('type') == 'evt': - # Probably a playlist - playlist_webpage = self._download_webpage('%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id) + result = { + '_type': 'multi_video', + 'entries': [extract_part(part) for part in parts], + } + result.update(lecture_info) + + # Immediately return explicitly requested part or non event item + if explicit_part_id or lecture_type != 'evt': + return result + + playlist_entries.append(result) + + # It's probably a playlist + if not parts or lecture_type == 'evt': + playlist_webpage = self._download_webpage( + '%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id) entries = [ self.url_result(compat_urlparse.urljoin(url, video_url), 'Viidea') - for _, video_url in re.findall(r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', playlist_webpage)] - lecture_info['_type'] = 'playlist' + for _, video_url in re.findall( + r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', playlist_webpage)] + playlist_entries.extend(entries) - lecture_info['entries'] = entries - return lecture_info + playlist = self.playlist_result(playlist_entries, lecture_id) + playlist.update(lecture_info) + return playlist -- cgit v1.2.3 From d5c181a14e08198e400932d591b47683a630c8c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 8 Nov 2015 11:49:51 +0100 Subject: [movieclips] Fix extraction (fixes #7404) They use theplatform now. Changed the test, because the old one seems to be georestricted. --- youtube_dl/extractor/movieclips.py | 77 ++++++++++---------------------------- 1 file changed, 19 insertions(+), 58 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/movieclips.py b/youtube_dl/extractor/movieclips.py index 04e17d055..e06828b55 100644 --- a/youtube_dl/extractor/movieclips.py +++ b/youtube_dl/extractor/movieclips.py @@ -1,80 +1,41 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import ( - compat_str, -) -from ..utils import ( - ExtractorError, - clean_html, + compat_urllib_request, ) class MovieClipsIE(InfoExtractor): - _VALID_URL = r'https?://movieclips\.com/(?P<id>[\da-zA-Z]+)(?:-(?P<display_id>[\da-z-]+))?' + _VALID_URL = r'https?://(?:www.)?movieclips\.com/videos/(?P<id>[^/?#]+)' _TEST = { - 'url': 'http://movieclips.com/Wy7ZU-my-week-with-marilyn-movie-do-you-love-me/', + 'url': 'http://www.movieclips.com/videos/warcraft-trailer-1-561180739597?autoPlay=true&playlistId=5', 'info_dict': { - 'id': 'Wy7ZU', - 'display_id': 'my-week-with-marilyn-movie-do-you-love-me', + 'id': 'pKIGmG83AqD9', + 'display_id': 'warcraft-trailer-1-561180739597', 'ext': 'mp4', - 'title': 'My Week with Marilyn - Do You Love Me?', - 'description': 'md5:e86795bd332fe3cff461e7c8dc542acb', + 'title': 'Warcraft Trailer 1', + 'description': 'Watch Trailer 1 from Warcraft (2016). Legendary’s WARCRAFT is a 3D epic adventure of world-colliding conflict based.', 'thumbnail': 're:^https?://.*\.jpg$', }, - 'params': { - # rtmp download - 'skip_download': True, - } + 'add_ie': ['ThePlatform'], } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - show_id = display_id or video_id - - config = self._download_xml( - 'http://config.movieclips.com/player/config/%s' % video_id, - show_id, 'Downloading player config') - - if config.find('./country-region').text == 'false': - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, config.find('./region_alert').text), expected=True) - - properties = config.find('./video/properties') - smil_file = properties.attrib['smil_file'] + display_id = self._match_id(url) - smil = self._download_xml(smil_file, show_id, 'Downloading SMIL') - base_url = smil.find('./head/meta').attrib['base'] - - formats = [] - for video in smil.findall('./body/switch/video'): - vbr = int(video.attrib['system-bitrate']) / 1000 - src = video.attrib['src'] - formats.append({ - 'url': base_url, - 'play_path': src, - 'ext': src.split(':')[0], - 'vbr': vbr, - 'format_id': '%dk' % vbr, - }) - - self._sort_formats(formats) - - title = '%s - %s' % (properties.attrib['clip_movie_title'], properties.attrib['clip_title']) - description = clean_html(compat_str(properties.attrib['clip_description'])) - thumbnail = properties.attrib['image'] - categories = properties.attrib['clip_categories'].split(',') + req = compat_urllib_request.Request(url) + # it doesn't work if it thinks the browser it's too old + req.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/43.0 (Chrome)') + webpage = self._download_webpage(req, display_id) + theplatform_link = self._html_search_regex(r'src="(http://player.theplatform.com/p/.*?)"', webpage, 'theplatform link') + title = self._html_search_regex(r'<title[^>]*>([^>]+)-\s*\d+\s*|\s*Movieclips.com', webpage, 'title') + description = self._html_search_meta('description', webpage) return { - 'id': video_id, - 'display_id': display_id, + '_type': 'url_transparent', + 'url': theplatform_link, 'title': title, + 'display_id': display_id, 'description': description, - 'thumbnail': thumbnail, - 'categories': categories, - 'formats': formats, } -- cgit v1.2.3 From 937511dfc01c3d00c35a00f78c2b6f989b4d46e3 Mon Sep 17 00:00:00 2001 From: Frans de Jonge Date: Sat, 7 Nov 2015 22:55:02 +0100 Subject: Added support for the RTBF OUFtivi subpage --- youtube_dl/extractor/rtbf.py | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index 04a66df90..e75b45112 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -9,17 +9,36 @@ from ..utils import ( class RTBFIE(InfoExtractor): - _VALID_URL = r'https?://www.rtbf.be/video/[^\?]+\?id=(?P\d+)' - _TEST = { - 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', - 'md5': '799f334ddf2c0a582ba80c44655be570', - 'info_dict': { - 'id': '1921274', - 'ext': 'mp4', - 'title': 'Les Diables au coeur (épisode 2)', - 'duration': 3099, - } - } + _VALID_URL = r'''(?x) + https?://www\.rtbf\.be/ + (?: + video/[^\?]+\?id=| + ouftivi/heros/[^&]+&videoId= + ) + (?P\d+) + ''' + _TESTS = [ + { + 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', + 'md5': '799f334ddf2c0a582ba80c44655be570', + 'info_dict': { + 'id': '1921274', + 'ext': 'mp4', + 'title': 'Les Diables au coeur (épisode 2)', + 'duration': 3099, + } + }, + { + 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442', + 'md5': '25aea17e949e1e0c7c41270d60d25f22', + 'info_dict': { + 'id': '2057442', + 'ext': 'mp4', + 'title': 'Scooby-Doo, myst\xe8res associ\xe9s', + 'duration': 1279, + } + }, + ] _QUALITIES = [ ('mobile', 'mobile'), -- cgit v1.2.3 From fda2717ef9d429358d5816582590d15d18f9109f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 Nov 2015 16:56:20 +0600 Subject: [movieclips] Add coding cookie --- youtube_dl/extractor/movieclips.py | 1 + 1 file changed, 1 insertion(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/movieclips.py b/youtube_dl/extractor/movieclips.py index e06828b55..b8c43a163 100644 --- a/youtube_dl/extractor/movieclips.py +++ b/youtube_dl/extractor/movieclips.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor -- cgit v1.2.3 From 114e6025b09e12bd01b5ce22bd2c43a3ef0ba460 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 Nov 2015 17:01:45 +0600 Subject: [rtbf] Expand _VALID_URL (Closes #7402) --- youtube_dl/extractor/rtbf.py | 48 +++++++++++++++++--------------------------- 1 file changed, 18 insertions(+), 30 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index e75b45112..acf10e253 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -9,36 +9,24 @@ from ..utils import ( class RTBFIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?://www\.rtbf\.be/ - (?: - video/[^\?]+\?id=| - ouftivi/heros/[^&]+&videoId= - ) - (?P\d+) - ''' - _TESTS = [ - { - 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', - 'md5': '799f334ddf2c0a582ba80c44655be570', - 'info_dict': { - 'id': '1921274', - 'ext': 'mp4', - 'title': 'Les Diables au coeur (épisode 2)', - 'duration': 3099, - } - }, - { - 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442', - 'md5': '25aea17e949e1e0c7c41270d60d25f22', - 'info_dict': { - 'id': '2057442', - 'ext': 'mp4', - 'title': 'Scooby-Doo, myst\xe8res associ\xe9s', - 'duration': 1279, - } - }, - ] + _VALID_URL = r'https?://www\.rtbf\.be/(?:video/[^?]+\?.*\bid=|ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=)(?P\d+)' + _TESTS = [{ + 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', + 'md5': '799f334ddf2c0a582ba80c44655be570', + 'info_dict': { + 'id': '1921274', + 'ext': 'mp4', + 'title': 'Les Diables au coeur (épisode 2)', + 'duration': 3099, + } + }, { + # geo restricted + 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442', + 'only_matching': True, + }, { + 'url': 'http://www.rtbf.be/ouftivi/niouzz?videoId=2055858', + 'only_matching': True, + }] _QUALITIES = [ ('mobile', 'mobile'), -- cgit v1.2.3 From aa8d2d5be6a99542b85a85af3310fab1bf641e86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 Nov 2015 17:03:21 +0600 Subject: [rtbf] Make www optional in _VALID_URL --- youtube_dl/extractor/rtbf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index acf10e253..e42b319a3 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -9,7 +9,7 @@ from ..utils import ( class RTBFIE(InfoExtractor): - _VALID_URL = r'https?://www\.rtbf\.be/(?:video/[^?]+\?.*\bid=|ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=)(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?rtbf\.be/(?:video/[^?]+\?.*\bid=|ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=)(?P\d+)' _TESTS = [{ 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', 'md5': '799f334ddf2c0a582ba80c44655be570', -- cgit v1.2.3 From 50506cb60798fe4d2ebb9603798b3fb1cb81e55f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 Nov 2015 19:01:37 +0600 Subject: [extremetube] Fix extraction (Closes #7163) --- youtube_dl/extractor/extremetube.py | 45 ++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 16 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index c826a5404..3e11e3299 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -3,12 +3,9 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_request, -) +from ..compat import compat_urllib_request from ..utils import ( - qualities, + int_or_none, str_to_int, ) @@ -49,20 +46,36 @@ class ExtremeTubeIE(InfoExtractor): r'Views:\s*\s*([\d,\.]+)', webpage, 'view count', fatal=False)) - flash_vars = compat_parse_qs(self._search_regex( - r']+?name="flashvars"[^>]+?value="([^"]+)"', webpage, 'flash vars')) + flash_vars = self._parse_json( + self._search_regex( + r'var\s+flashvars\s*=\s*({.+?});', webpage, 'flash vars'), + video_id) formats = [] - quality = qualities(['180p', '240p', '360p', '480p', '720p', '1080p']) - for k, vals in flash_vars.items(): - m = re.match(r'quality_(?P[0-9]+p)$', k) - if m is not None: - formats.append({ - 'format_id': m.group('quality'), - 'quality': quality(m.group('quality')), - 'url': vals[0], + for quality_key, video_url in flash_vars.items(): + height = int_or_none(self._search_regex( + r'quality_(\d+)[pP]$', quality_key, 'height', default=None)) + if not height: + continue + f = { + 'url': video_url, + } + mobj = re.search( + r'/(?P\d{3,4})[pP]_(?P\d+)[kK]_\d+', video_url) + if mobj: + height = int(mobj.group('height')) + bitrate = int(mobj.group('bitrate')) + f.update({ + 'format_id': '%dp-%dk' % (height, bitrate), + 'height': height, + 'tbr': bitrate, }) - + else: + f.update({ + 'format_id': '%dp' % height, + 'height': height, + }) + formats.append(f) self._sort_formats(formats) return { -- cgit v1.2.3 From cc8034cc4c52fcbfaf9f8edf34d562c481860193 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 Nov 2015 19:14:39 +0600 Subject: [extremetube] Modernize --- youtube_dl/extractor/extremetube.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index 3e11e3299..c5677c82b 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -11,12 +11,12 @@ from ..utils import ( class ExtremeTubeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?Pextremetube\.com/.*?video/.+?(?P[0-9]+))(?:[/?&]|$)' + _VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?P[^/#?&]+)' _TESTS = [{ 'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', 'md5': '344d0c6d50e2f16b06e49ca011d8ac69', 'info_dict': { - 'id': '652431', + 'id': 'music-video-14-british-euro-brit-european-cumshots-swallow-652431', 'ext': 'mp4', 'title': 'Music Video 14 british euro brit european cumshots swallow', 'uploader': 'unknown', @@ -26,12 +26,16 @@ class ExtremeTubeIE(InfoExtractor): }, { 'url': 'http://www.extremetube.com/gay/video/abcde-1234', 'only_matching': True, + }, { + 'url': 'http://www.extremetube.com/video/latina-slut-fucked-by-fat-black-dick', + 'only_matching': True, + }, { + 'url': 'http://www.extremetube.com/video/652431', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - url = 'http://www.' + mobj.group('url') + video_id = self._match_id(url) req = compat_urllib_request.Request(url) req.add_header('Cookie', 'age_verified=1') -- cgit v1.2.3 From f09a767d3198823e5c0ac187a91284c8d2736eb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 Nov 2015 19:19:13 +0600 Subject: [mit] Allow external embeds (Closes #7406) --- youtube_dl/extractor/mit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index f088ab9e2..29ca45778 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -86,7 +86,7 @@ class MITIE(TechTVMITIE): webpage = self._download_webpage(url, page_title) embed_url = self._search_regex( r'