diff options
Diffstat (limited to 'youtube_dl')
68 files changed, 2175 insertions, 1376 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index c642a1fbf..50425b8d7 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -47,7 +47,9 @@ from .utils import ( DEFAULT_OUTTMPL, determine_ext, DownloadError, + encode_compat_str, encodeFilename, + error_to_compat_str, ExtractorError, format_bytes, formatSeconds, @@ -495,7 +497,7 @@ class YoutubeDL(object): tb = '' if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]: tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info)) - tb += compat_str(traceback.format_exc()) + tb += encode_compat_str(traceback.format_exc()) else: tb_data = traceback.format_list(traceback.extract_stack()) tb = ''.join(tb_data) @@ -674,14 +676,14 @@ class YoutubeDL(object): return self.process_ie_result(ie_result, download, extra_info) else: return ie_result - except ExtractorError as de: # An error we somewhat expected - self.report_error(compat_str(de), de.format_traceback()) + except ExtractorError as e: # An error we somewhat expected + self.report_error(compat_str(e), e.format_traceback()) break except MaxDownloadsReached: raise except Exception as e: if self.params.get('ignoreerrors', False): - self.report_error(compat_str(e), tb=compat_str(traceback.format_exc())) + self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc())) break else: raise @@ -1459,7 +1461,7 @@ class YoutubeDL(object): if dn and not os.path.exists(dn): os.makedirs(dn) except (OSError, IOError) as err: - self.report_error('unable to create directory ' + compat_str(err)) + self.report_error('unable to create directory ' + error_to_compat_str(err)) return if self.params.get('writedescription', False): @@ -1510,7 +1512,7 @@ class YoutubeDL(object): sub_info['url'], info_dict['id'], note=False) except ExtractorError as err: self.report_warning('Unable to download subtitle for "%s": %s' % - (sub_lang, compat_str(err.cause))) + (sub_lang, error_to_compat_str(err.cause))) continue try: sub_filename = subtitles_filename(filename, sub_lang, sub_format) @@ -2039,4 +2041,4 @@ class YoutubeDL(object): (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename)) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self.report_warning('Unable to download thumbnail "%s": %s' % - (t['url'], compat_str(err))) + (t['url'], error_to_compat_str(err))) diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index b8bf8daf8..beae8c4d0 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -5,9 +5,9 @@ import re import sys import time -from ..compat import compat_str from ..utils import ( encodeFilename, + error_to_compat_str, decodeArgument, format_bytes, timeconvert, @@ -186,7 +186,7 @@ class FileDownloader(object): return os.rename(encodeFilename(old_filename), encodeFilename(new_filename)) except (IOError, OSError) as err: - self.report_error('unable to rename file: %s' % compat_str(err)) + self.report_error('unable to rename file: %s' % error_to_compat_str(err)) def try_utime(self, filename, last_modified_hdr): """Try to set the last-modified time of the given file.""" diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3cd95ba01..165835f63 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -15,7 +15,6 @@ from .adobetv import ( AdobeTVVideoIE, ) from .adultswim import AdultSwimIE -from .aftenposten import AftenpostenIE from .aftonbladet import AftonbladetIE from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE @@ -26,7 +25,10 @@ from .aol import AolIE from .allocine import AllocineIE from .aparat import AparatIE from .appleconnect import AppleConnectIE -from .appletrailers import AppleTrailersIE +from .appletrailers import ( + AppleTrailersIE, + AppleTrailersSectionIE, +) from .archiveorg import ArchiveOrgIE from .ard import ( ARDIE, @@ -61,8 +63,11 @@ from .beatportpro import BeatportProIE from .bet import BetIE from .bild import BildIE from .bilibili import BiliBiliIE +from .bleacherreport import ( + BleacherReportIE, + BleacherReportCMSIE, +) from .blinkx import BlinkxIE -from .bliptv import BlipTVIE, BlipTVUserIE from .bloomberg import BloombergIE from .bpb import BpbIE from .br import BRIE @@ -78,7 +83,6 @@ from .camdemy import ( CamdemyIE, CamdemyFolderIE ) -from .canal13cl import Canal13clIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .cbs import CBSIE @@ -232,9 +236,11 @@ from .globo import ( from .godtube import GodTubeIE from .goldenmoustache import GoldenMoustacheIE from .golem import GolemIE +from .googledrive import GoogleDriveIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .goshgay import GoshgayIE +from .gputechconf import GPUTechConfIE from .groupon import GrouponIE from .hark import HarkIE from .hearthisat import HearThisAtIE @@ -247,12 +253,17 @@ from .history import HistoryIE from .hitbox import HitboxIE, HitboxLiveIE from .hornbunny import HornBunnyIE from .hotnewhiphop import HotNewHipHopIE +from .hotstar import HotStarIE from .howcast import HowcastIE from .howstuffworks import HowStuffWorksIE from .huffpost import HuffPostIE from .hypem import HypemIE from .iconosquare import IconosquareIE -from .ign import IGNIE, OneUPIE +from .ign import ( + IGNIE, + OneUPIE, + PCMagIE, +) from .imdb import ( ImdbIE, ImdbListIE @@ -281,6 +292,7 @@ from .jadorecettepub import JadoreCettePubIE from .jeuxvideo import JeuxVideoIE from .jove import JoveIE from .jukebox import JukeboxIE +from .jwplatform import JWPlatformIE from .jpopsukitv import JpopsukiIE from .kaltura import KalturaIE from .kanalplay import KanalPlayIE @@ -335,6 +347,7 @@ from .lynda import ( from .m6 import M6IE from .macgamestore import MacGameStoreIE from .mailru import MailRuIE +from .makertv import MakerTVIE from .malemotion import MalemotionIE from .mdr import MDRIE from .metacafe import MetacafeIE @@ -519,7 +532,10 @@ from .radiode import RadioDeIE from .radiojavan import RadioJavanIE from .radiobremen import RadioBremenIE from .radiofrance import RadioFranceIE -from .rai import RaiIE +from .rai import ( + RaiTVIE, + RaiIE, +) from .rbmaradio import RBMARadioIE from .rds import RDSIE from .redtube import RedTubeIE @@ -585,10 +601,6 @@ from .snagfilms import ( ) from .snotr import SnotrIE from .sohu import SohuIE -from .soompi import ( - SoompiIE, - SoompiShowIE, -) from .soundcloud import ( SoundcloudIE, SoundcloudSetIE, @@ -647,6 +659,7 @@ from .teachingchannel import TeachingChannelIE from .teamcoco import TeamcocoIE from .techtalks import TechTalksIE from .ted import TEDIE +from .tele13 import Tele13IE from .telebruxelles import TeleBruxellesIE from .telecinco import TelecincoIE from .telegraaf import TelegraafIE @@ -656,6 +669,7 @@ from .tenplay import TenPlayIE from .testurl import TestURLIE from .testtube import TestTubeIE from .tf1 import TF1IE +from .theintercept import TheInterceptIE from .theonion import TheOnionIE from .theplatform import ( ThePlatformIE, @@ -675,6 +689,7 @@ from .tnaflix import ( EMPFlixIE, MovieFapIE, ) +from .toggle import ToggleIE from .thvideo import ( THVideoIE, THVideoPlaylistIE @@ -850,7 +865,7 @@ from .youtube import ( YoutubeTruncatedIDIE, YoutubeTruncatedURLIE, YoutubeUserIE, - YoutubeUserPlaylistsIE, + YoutubePlaylistsIE, YoutubeWatchLaterIE, ) from .zapiks import ZapiksIE diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index c0e5d1abf..6a29e587f 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -23,6 +23,7 @@ class ABCIE(InfoExtractor): 'title': 'Australia to help staff Ebola treatment centre in Sierra Leone', 'description': 'md5:809ad29c67a05f54eb41f2a105693a67', }, + 'skip': 'this video has expired', }, { 'url': 'http://www.abc.net.au/news/2015-08-17/warren-entsch-introduces-same-sex-marriage-bill/6702326', 'md5': 'db2a5369238b51f9811ad815b69dc086', @@ -36,6 +37,7 @@ class ABCIE(InfoExtractor): 'title': 'Marriage Equality: Warren Entsch introduces same sex marriage bill', }, 'add_ie': ['Youtube'], + 'skip': 'Not accessible from Travis CI server', }, { 'url': 'http://www.abc.net.au/news/2015-10-23/nab-lifts-interest-rates-following-westpac-and-cba/6880080', 'md5': 'b96eee7c9edf4fc5a358a0252881cc1f', @@ -58,6 +60,9 @@ class ABCIE(InfoExtractor): r'inline(?P<type>Video|Audio|YouTube)Data\.push\((?P<json_data>[^)]+)\);', webpage) if mobj is None: + expired = self._html_search_regex(r'(?s)class="expired-(?:video|audio)".+?<span>(.+?)</span>', webpage, 'expired', None) + if expired: + raise ExtractorError('%s said: %s' % (self.IE_NAME, expired), expected=True) raise ExtractorError('Unable to extract video urls') urls_info = self._parse_json( diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 3ae618e71..bf21a6887 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -68,7 +68,7 @@ class AdultSwimIE(InfoExtractor): 'md5': '3e346a2ab0087d687a05e1e7f3b3e529', 'info_dict': { 'id': 'sY3cMUR_TbuE4YmdjzbIcQ-0', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', }, @@ -79,6 +79,10 @@ class AdultSwimIE(InfoExtractor): 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', }, + 'params': { + # m3u8 download + 'skip_download': True, + } }] @staticmethod diff --git a/youtube_dl/extractor/aftenposten.py b/youtube_dl/extractor/aftenposten.py deleted file mode 100644 index 0c00acfb5..000000000 --- a/youtube_dl/extractor/aftenposten.py +++ /dev/null @@ -1,23 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class AftenpostenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?aftenposten\.no/webtv/(?:#!/)?video/(?P<id>\d+)' - _TEST = { - 'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', - 'md5': 'fd828cd29774a729bf4d4425fe192972', - 'info_dict': { - 'id': '21039', - 'ext': 'mov', - 'title': 'TRAILER: "Sweatshop" - I can´t take any more', - 'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238', - 'timestamp': 1416927969, - 'upload_date': '20141125', - } - } - - def _real_extract(self, url): - return self.url_result('xstream:ap:%s' % self._match_id(url), 'Xstream') diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py new file mode 100644 index 000000000..dcc3c97f1 --- /dev/null +++ b/youtube_dl/extractor/amp.py @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) + + +class AMPIE(InfoExtractor): + # parse Akamai Adaptive Media Player feed + def _extract_feed_info(self, url): + item = self._download_json( + url, None, 'Downloading Akamai AMP feed', + 'Unable to download Akamai AMP feed')['channel']['item'] + + video_id = item['guid'] + + def get_media_node(name, default=None): + media_name = 'media-%s' % name + media_group = item.get('media-group') or item + return media_group.get(media_name) or item.get(media_name) or item.get(name, default) + + thumbnails = [] + media_thumbnail = get_media_node('thumbnail') + if media_thumbnail: + if isinstance(media_thumbnail, dict): + media_thumbnail = [media_thumbnail] + for thumbnail_data in media_thumbnail: + thumbnail = thumbnail_data['@attributes'] + thumbnails.append({ + 'url': self._proto_relative_url(thumbnail['url'], 'http:'), + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + subtitles = {} + media_subtitle = get_media_node('subTitle') + if media_subtitle: + if isinstance(media_subtitle, dict): + media_subtitle = [media_subtitle] + for subtitle_data in media_subtitle: + subtitle = subtitle_data['@attributes'] + lang = subtitle.get('lang') or 'en' + subtitles[lang] = [{'url': subtitle['href']}] + + formats = [] + media_content = get_media_node('content') + if isinstance(media_content, dict): + media_content = [media_content] + for media_data in media_content: + media = media_data['@attributes'] + media_type = media['type'] + if media_type == 'video/f4m': + f4m_formats = self._extract_f4m_formats( + media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', + video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + elif media_type == 'application/x-mpegURL': + m3u8_formats = self._extract_m3u8_formats( + media['url'], video_id, 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + else: + formats.append({ + 'format_id': media_data['media-category']['@attributes']['label'], + 'url': media['url'], + 'tbr': int_or_none(media.get('bitrate')), + 'filesize': int_or_none(media.get('fileSize')), + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': get_media_node('title'), + 'description': get_media_node('description'), + 'thumbnails': thumbnails, + 'timestamp': parse_iso8601(item.get('pubDate'), ' '), + 'duration': int_or_none(media_content[0].get('@attributes', {}).get('duration')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index f68dc3236..62ed0c918 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -11,6 +11,7 @@ from ..utils import ( class AppleTrailersIE(InfoExtractor): + IE_NAME = 'appletrailers' _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)' _TESTS = [{ 'url': 'http://trailers.apple.com/trailers/wb/manofsteel/', @@ -64,6 +65,12 @@ class AppleTrailersIE(InfoExtractor): }, ] }, { + 'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/', + 'info_dict': { + 'id': 'blackthorn', + }, + 'playlist_mincount': 2, + }, { 'url': 'http://trailers.apple.com/ca/metropole/autrui/', 'only_matching': True, }] @@ -79,7 +86,7 @@ class AppleTrailersIE(InfoExtractor): def fix_html(s): s = re.sub(r'(?s)<script[^<]*?>.*?</script>', '', s) - s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s) + s = re.sub(r'<img ([^<]*?)/?>', r'<img \1/>', s) # The ' in the onClick attributes are not escaped, it couldn't be parsed # like: http://trailers.apple.com/trailers/wb/gravity/ @@ -96,6 +103,9 @@ class AppleTrailersIE(InfoExtractor): trailer_info_json = self._search_regex(self._JSON_RE, on_click, 'trailer info') trailer_info = json.loads(trailer_info_json) + first_url = trailer_info.get('url') + if not first_url: + continue title = trailer_info['title'] video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() thumbnail = li.find('.//img').attrib['src'] @@ -107,7 +117,6 @@ class AppleTrailersIE(InfoExtractor): if m: duration = 60 * int(m.group('minutes')) + int(m.group('seconds')) - first_url = trailer_info['url'] trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower() settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id) settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json') @@ -144,3 +153,76 @@ class AppleTrailersIE(InfoExtractor): 'id': movie, 'entries': playlist, } + + +class AppleTrailersSectionIE(InfoExtractor): + IE_NAME = 'appletrailers:section' + _SECTIONS = { + 'justadded': { + 'feed_path': 'just_added', + 'title': 'Just Added', + }, + 'exclusive': { + 'feed_path': 'exclusive', + 'title': 'Exclusive', + }, + 'justhd': { + 'feed_path': 'just_hd', + 'title': 'Just HD', + }, + 'mostpopular': { + 'feed_path': 'most_pop', + 'title': 'Most Popular', + }, + 'moviestudios': { + 'feed_path': 'studios', + 'title': 'Movie Studios', + }, + } + _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/#section=(?P<id>%s)' % '|'.join(_SECTIONS) + _TESTS = [{ + 'url': 'http://trailers.apple.com/#section=justadded', + 'info_dict': { + 'title': 'Just Added', + 'id': 'justadded', + }, + 'playlist_mincount': 80, + }, { + 'url': 'http://trailers.apple.com/#section=exclusive', + 'info_dict': { + 'title': 'Exclusive', + 'id': 'exclusive', + }, + 'playlist_mincount': 80, + }, { + 'url': 'http://trailers.apple.com/#section=justhd', + 'info_dict': { + 'title': 'Just HD', + 'id': 'justhd', + }, + 'playlist_mincount': 80, + }, { + 'url': 'http://trailers.apple.com/#section=mostpopular', + 'info_dict': { + 'title': 'Most Popular', + 'id': 'mostpopular', + }, + 'playlist_mincount': 80, + }, { + 'url': 'http://trailers.apple.com/#section=moviestudios', + 'info_dict': { + 'title': 'Movie Studios', + 'id': 'moviestudios', + }, + 'playlist_mincount': 80, + }] + + def _real_extract(self, url): + section = self._match_id(url) + section_data = self._download_json( + 'http://trailers.apple.com/trailers/home/feeds/%s.json' % self._SECTIONS[section]['feed_path'], + section) + entries = [ + self.url_result('http://trailers.apple.com' + e['location']) + for e in section_data] + return self.playlist_result(entries, section, self._SECTIONS[section]['title']) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 73be6d204..687eb9f82 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -110,13 +110,19 @@ class ARDMediathekIE(InfoExtractor): server = stream.get('_server') for stream_url in stream_urls: ext = determine_ext(stream_url) + if quality != 'auto' and ext in ('f4m', 'm3u8'): + continue if ext == 'f4m': - formats.extend(self._extract_f4m_formats( + f4m_formats = self._extract_f4m_formats( stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', - video_id, preference=-1, f4m_id='hds')) + video_id, preference=-1, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - stream_url, video_id, 'mp4', preference=1, m3u8_id='hls')) + m3u8_formats = self._extract_m3u8_formats( + stream_url, video_id, 'mp4', preference=1, m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) else: if server and server.startswith('rtmp'): f = { diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 2a00da3ee..10301a8ea 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -68,9 +68,13 @@ class ArteTVPlus7IE(InfoExtractor): def _extract_url_info(cls, url): mobj = re.match(cls._VALID_URL, url) lang = mobj.group('lang') - # This is not a real id, it can be for example AJT for the news - # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal - video_id = mobj.group('id') + query = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + if 'vid' in query: + video_id = query['vid'][0] + else: + # This is not a real id, it can be for example AJT for the news + # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal + video_id = mobj.group('id') return video_id, lang def _real_extract(self, url): @@ -79,9 +83,15 @@ class ArteTVPlus7IE(InfoExtractor): return self._extract_from_webpage(webpage, video_id, lang) def _extract_from_webpage(self, webpage, video_id, lang): + patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']') + ids = (video_id, '') + # some pages contain multiple videos (like + # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D), + # so we first try to look for json URLs that contain the video id from + # the 'vid' parameter. + patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates] json_url = self._html_search_regex( - [r'arte_vp_url=["\'](.*?)["\']', r'data-url=["\']([^"]+)["\']'], - webpage, 'json vp url', default=None) + patterns, webpage, 'json vp url', default=None) if not json_url: iframe_url = self._html_search_regex( r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1', diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index 50e47ba0a..7ac3044c7 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -2,6 +2,8 @@ from __future__ import unicode_literals import time import hmac +import hashlib +import re from .common import InfoExtractor from ..compat import ( @@ -32,6 +34,19 @@ class AtresPlayerIE(InfoExtractor): 'duration': 5527.6, 'thumbnail': 're:^https?://.*\.jpg$', }, + 'skip': 'This video is only available for registered users' + }, + { + 'url': 'http://www.atresplayer.com/television/especial/videoencuentros/temporada-1/capitulo-112-david-bustamante_2014121600375.html', + 'md5': '0d0e918533bbd4b263f2de4d197d4aac', + 'info_dict': { + 'id': 'capitulo-112-david-bustamante', + 'ext': 'flv', + 'title': 'David Bustamante', + 'description': 'md5:f33f1c0a05be57f6708d4dd83a3b81c6', + 'duration': 1439.0, + 'thumbnail': 're:^https?://.*\.jpg$', + }, }, { 'url': 'http://www.atresplayer.com/television/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_2014122400174.html', @@ -50,6 +65,13 @@ class AtresPlayerIE(InfoExtractor): _LOGIN_URL = 'https://servicios.atresplayer.com/j_spring_security_check' + _ERRORS = { + 'UNPUBLISHED': 'We\'re sorry, but this video is not yet available.', + 'DELETED': 'This video has expired and is no longer available for online streaming.', + 'GEOUNPUBLISHED': 'We\'re sorry, but this video is not available in your region due to right restrictions.', + # 'PREMIUM': 'PREMIUM', + } + def _real_initialize(self): self._login() @@ -83,58 +105,81 @@ class AtresPlayerIE(InfoExtractor): episode_id = self._search_regex( r'episode="([^"]+)"', webpage, 'episode id') + request = sanitized_Request( + self._PLAYER_URL_TEMPLATE % episode_id, + headers={'User-Agent': self._USER_AGENT}) + player = self._download_json(request, episode_id, 'Downloading player JSON') + + episode_type = player.get('typeOfEpisode') + error_message = self._ERRORS.get(episode_type) + if error_message: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error_message), expected=True) + + formats = [] + video_url = player.get('urlVideo') + if video_url: + format_info = { + 'url': video_url, + 'format_id': 'http', + } + mobj = re.search(r'(?P<bitrate>\d+)K_(?P<width>\d+)x(?P<height>\d+)', video_url) + if mobj: + format_info.update({ + 'width': int_or_none(mobj.group('width')), + 'height': int_or_none(mobj.group('height')), + 'tbr': int_or_none(mobj.group('bitrate')), + }) + formats.append(format_info) + + m3u8_url = player.get('urlVideoHls') + if m3u8_url: + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, episode_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + timestamp = int_or_none(self._download_webpage( self._TIME_API_URL, video_id, 'Downloading timestamp', fatal=False), 1000, time.time()) timestamp_shifted = compat_str(timestamp + self._TIMESTAMP_SHIFT) token = hmac.new( self._MAGIC.encode('ascii'), - (episode_id + timestamp_shifted).encode('utf-8') + (episode_id + timestamp_shifted).encode('utf-8'), hashlib.md5 ).hexdigest() - formats = [] - for fmt in ['windows', 'android_tablet']: - request = sanitized_Request( - self._URL_VIDEO_TEMPLATE.format(fmt, episode_id, timestamp_shifted, token)) - request.add_header('User-Agent', self._USER_AGENT) - - fmt_json = self._download_json( - request, video_id, 'Downloading %s video JSON' % fmt) - - result = fmt_json.get('resultDes') - if result.lower() != 'ok': - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, result), expected=True) - - for format_id, video_url in fmt_json['resultObject'].items(): - if format_id == 'token' or not video_url.startswith('http'): - continue - if video_url.endswith('/Manifest'): - if 'geodeswowsmpra3player' in video_url: - f4m_path = video_url.split('smil:', 1)[-1].split('free_', 1)[0] - f4m_url = 'http://drg.antena3.com/{0}hds/es/sd.f4m'.format(f4m_path) - # this videos are protected by DRM, the f4m downloader doesn't support them - continue - else: - f4m_url = video_url[:-9] + '/manifest.f4m' - formats.extend(self._extract_f4m_formats(f4m_url, video_id)) - else: - formats.append({ - 'url': video_url, - 'format_id': 'android-%s' % format_id, - 'preference': 1, - }) - self._sort_formats(formats) + request = sanitized_Request( + self._URL_VIDEO_TEMPLATE.format('windows', episode_id, timestamp_shifted, token), + headers={'User-Agent': self._USER_AGENT}) - player = self._download_json( - self._PLAYER_URL_TEMPLATE % episode_id, - episode_id) + fmt_json = self._download_json( + request, video_id, 'Downloading windows video JSON') + + result = fmt_json.get('resultDes') + if result.lower() != 'ok': + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, result), expected=True) + + for format_id, video_url in fmt_json['resultObject'].items(): + if format_id == 'token' or not video_url.startswith('http'): + continue + if 'geodeswowsmpra3player' in video_url: + f4m_path = video_url.split('smil:', 1)[-1].split('free_', 1)[0] + f4m_url = 'http://drg.antena3.com/{0}hds/es/sd.f4m'.format(f4m_path) + # this videos are protected by DRM, the f4m downloader doesn't support them + continue + else: + f4m_url = video_url[:-9] + '/manifest.f4m' + f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + self._sort_formats(formats) path_data = player.get('pathData') episode = self._download_xml( - self._EPISODE_URL_TEMPLATE % path_data, - video_id, 'Downloading episode XML') + self._EPISODE_URL_TEMPLATE % path_data, video_id, + 'Downloading episode XML') duration = float_or_none(xpath_text( episode, './media/asset/info/technical/contentDuration', 'duration')) diff --git a/youtube_dl/extractor/audimedia.py b/youtube_dl/extractor/audimedia.py index b0b089dee..4382a302b 100644 --- a/youtube_dl/extractor/audimedia.py +++ b/youtube_dl/extractor/audimedia.py @@ -15,7 +15,7 @@ class AudiMediaIE(InfoExtractor): 'url': 'https://audimedia.tv/en/vid/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test', 'md5': '79a8b71c46d49042609795ab59779b66', 'info_dict': { - 'id': '1564', + 'id': '1565', 'ext': 'mp4', 'title': '60 Seconds of Audi Sport 104/2015 - WEC Bahrain, Rookie Test', 'description': 'md5:60e5d30a78ced725f7b8d34370762941', diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py index 693ba22c6..3eed91279 100644 --- a/youtube_dl/extractor/audiomack.py +++ b/youtube_dl/extractor/audiomack.py @@ -56,7 +56,7 @@ class AudiomackIE(InfoExtractor): # API is inconsistent with errors if 'url' not in api_response or not api_response['url'] or 'error' in api_response: - raise ExtractorError('Invalid url %s', url) + raise ExtractorError('Invalid url %s' % url) # Audiomack wraps a lot of soundcloud tracks in their branded wrapper # if so, pass the work off to the soundcloud extractor diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py new file mode 100644 index 000000000..38bda3af5 --- /dev/null +++ b/youtube_dl/extractor/bleacherreport.py @@ -0,0 +1,106 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .amp import AMPIE +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, +) + + +class BleacherReportIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/articles/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://bleacherreport.com/articles/2496438-fsu-stat-projections-is-jalen-ramsey-best-defensive-player-in-college-football', + 'md5': 'a3ffc3dc73afdbc2010f02d98f990f20', + 'info_dict': { + 'id': '2496438', + 'ext': 'mp4', + 'title': 'FSU Stat Projections: Is Jalen Ramsey Best Defensive Player in College Football?', + 'uploader_id': 3992341, + 'description': 'CFB, ACC, Florida State', + 'timestamp': 1434380212, + 'upload_date': '20150615', + 'uploader': 'Team Stream Now ', + }, + 'add_ie': ['Ooyala'], + }, { + 'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo', + 'md5': 'af5f90dc9c7ba1c19d0a3eac806bbf50', + 'info_dict': { + 'id': '2586817', + 'ext': 'mp4', + 'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo', + 'timestamp': 1446839961, + 'uploader': 'Sean Fay', + 'description': 'md5:825e94e0f3521df52fa83b2ed198fa20', + 'uploader_id': 6466954, + 'upload_date': '20151011', + }, + 'add_ie': ['Youtube'], + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + + article_data = self._download_json('http://api.bleacherreport.com/api/v1/articles/%s' % article_id, article_id)['article'] + + thumbnails = [] + primary_photo = article_data.get('primaryPhoto') + if primary_photo: + thumbnails = [{ + 'url': primary_photo['url'], + 'width': primary_photo.get('width'), + 'height': primary_photo.get('height'), + }] + + info = { + '_type': 'url_transparent', + 'id': article_id, + 'title': article_data['title'], + 'uploader': article_data.get('author', {}).get('name'), + 'uploader_id': article_data.get('authorId'), + 'timestamp': parse_iso8601(article_data.get('createdAt')), + 'thumbnails': thumbnails, + 'comment_count': int_or_none(article_data.get('commentsCount')), + 'view_count': int_or_none(article_data.get('hitCount')), + } + + video = article_data.get('video') + if video: + video_type = video['type'] + if video_type == 'cms.bleacherreport.com': + info['url'] = 'http://bleacherreport.com/video_embed?id=%s' % video['id'] + elif video_type == 'ooyala.com': + info['url'] = 'ooyala:%s' % video['id'] + elif video_type == 'youtube.com': + info['url'] = video['id'] + elif video_type == 'vine.co': + info['url'] = 'https://vine.co/v/%s' % video['id'] + else: + info['url'] = video_type + video['id'] + return info + else: + raise ExtractorError('no video in the article', expected=True) + + +class BleacherReportCMSIE(AMPIE): + _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36})' + _TESTS = [{ + 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1', + 'md5': '8c2c12e3af7805152675446c905d159b', + 'info_dict': { + 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', + 'ext': 'flv', + 'title': 'Cena vs. Rollins Would Expose the Heavyweight Division', + 'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + info = self._extract_feed_info('http://cms.bleacherreport.com/media/items/%s/akamai.json' % video_id) + info['id'] = video_id + return info diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py deleted file mode 100644 index 35375f7b1..000000000 --- a/youtube_dl/extractor/bliptv.py +++ /dev/null @@ -1,290 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - -from ..compat import compat_urlparse -from ..utils import ( - clean_html, - int_or_none, - parse_iso8601, - sanitized_Request, - unescapeHTML, - xpath_text, - xpath_with_ns, -) - - -class BlipTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P<id>\d+)|((?:play/|api\.swf#)(?P<lookup_id>[\da-zA-Z+_]+)))' - - _TESTS = [ - { - 'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352', - 'md5': '80baf1ec5c3d2019037c1c707d676b9f', - 'info_dict': { - 'id': '5779306', - 'ext': 'm4v', - 'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3', - 'description': 'md5:9bc31f227219cde65e47eeec8d2dc596', - 'timestamp': 1323138843, - 'upload_date': '20111206', - 'uploader': 'cbr', - 'uploader_id': '679425', - 'duration': 81, - } - }, - { - # https://github.com/rg3/youtube-dl/pull/2274 - 'note': 'Video with subtitles', - 'url': 'http://blip.tv/play/h6Uag5OEVgI.html', - 'md5': '309f9d25b820b086ca163ffac8031806', - 'info_dict': { - 'id': '6586561', - 'ext': 'mp4', - 'title': 'Red vs. Blue Season 11 Episode 1', - 'description': 'One-Zero-One', - 'timestamp': 1371261608, - 'upload_date': '20130615', - 'uploader': 'redvsblue', - 'uploader_id': '792887', - 'duration': 279, - } - }, - { - # https://bugzilla.redhat.com/show_bug.cgi?id=967465 - 'url': 'http://a.blip.tv/api.swf#h6Uag5KbVwI', - 'md5': '314e87b1ebe7a48fcbfdd51b791ce5a6', - 'info_dict': { - 'id': '6573122', - 'ext': 'mov', - 'upload_date': '20130520', - 'description': 'Two hapless space marines argue over what to do when they realize they have an astronomically huge problem on their hands.', - 'title': 'Red vs. Blue Season 11 Trailer', - 'timestamp': 1369029609, - 'uploader': 'redvsblue', - 'uploader_id': '792887', - } - }, - { - 'url': 'http://blip.tv/play/gbk766dkj4Yn', - 'md5': 'fe0a33f022d49399a241e84a8ea8b8e3', - 'info_dict': { - 'id': '1749452', - 'ext': 'mp4', - 'upload_date': '20090208', - 'description': 'Witness the first appearance of the Nostalgia Critic character, as Doug reviews the movie Transformers.', - 'title': 'Nostalgia Critic: Transformers', - 'timestamp': 1234068723, - 'uploader': 'NostalgiaCritic', - 'uploader_id': '246467', - } - }, - { - # https://github.com/rg3/youtube-dl/pull/4404 - 'note': 'Audio only', - 'url': 'http://blip.tv/hilarios-productions/weekly-manga-recap-kingdom-7119982', - 'md5': '76c0a56f24e769ceaab21fbb6416a351', - 'info_dict': { - 'id': '7103299', - 'ext': 'flv', - 'title': 'Weekly Manga Recap: Kingdom', - 'description': 'And then Shin breaks the enemy line, and he's all like HWAH! And then he slices a guy and it's all like FWASHING! And... it's really hard to describe the best parts of this series without breaking down into sound effects, okay?', - 'timestamp': 1417660321, - 'upload_date': '20141204', - 'uploader': 'The Rollo T', - 'uploader_id': '407429', - 'duration': 7251, - 'vcodec': 'none', - } - }, - { - # missing duration - 'url': 'http://blip.tv/rss/flash/6700880', - 'info_dict': { - 'id': '6684191', - 'ext': 'm4v', - 'title': 'Cowboy Bebop: Gateway Shuffle Review', - 'description': 'md5:3acc480c0f9ae157f5fe88547ecaf3f8', - 'timestamp': 1386639757, - 'upload_date': '20131210', - 'uploader': 'sfdebris', - 'uploader_id': '706520', - } - } - ] - - @staticmethod - def _extract_url(webpage): - mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage) - if mobj: - return 'http://blip.tv/a/a-' + mobj.group(1) - mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage) - if mobj: - return mobj.group(1) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - lookup_id = mobj.group('lookup_id') - - # See https://github.com/rg3/youtube-dl/issues/857 and - # https://github.com/rg3/youtube-dl/issues/4197 - if lookup_id: - urlh = self._request_webpage( - 'http://blip.tv/play/%s' % lookup_id, lookup_id, 'Resolving lookup id') - url = compat_urlparse.urlparse(urlh.geturl()) - qs = compat_urlparse.parse_qs(url.query) - mobj = re.match(self._VALID_URL, qs['file'][0]) - - video_id = mobj.group('id') - - rss = self._download_xml('http://blip.tv/rss/flash/%s' % video_id, video_id, 'Downloading video RSS') - - def _x(p): - return xpath_with_ns(p, { - 'blip': 'http://blip.tv/dtd/blip/1.0', - 'media': 'http://search.yahoo.com/mrss/', - 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', - }) - - item = rss.find('channel/item') - - video_id = xpath_text(item, _x('blip:item_id'), 'video id') or lookup_id - title = xpath_text(item, 'title', 'title', fatal=True) - description = clean_html(xpath_text(item, _x('blip:puredescription'), 'description')) - timestamp = parse_iso8601(xpath_text(item, _x('blip:datestamp'), 'timestamp')) - uploader = xpath_text(item, _x('blip:user'), 'uploader') - uploader_id = xpath_text(item, _x('blip:userid'), 'uploader id') - duration = int_or_none(xpath_text(item, _x('blip:runtime'), 'duration')) - media_thumbnail = item.find(_x('media:thumbnail')) - thumbnail = (media_thumbnail.get('url') if media_thumbnail is not None - else xpath_text(item, 'image', 'thumbnail')) - categories = [category.text for category in item.findall('category') if category is not None] - - formats = [] - subtitles_urls = {} - - media_group = item.find(_x('media:group')) - for media_content in media_group.findall(_x('media:content')): - url = media_content.get('url') - role = media_content.get(_x('blip:role')) - msg = self._download_webpage( - url + '?showplayer=20140425131715&referrer=http://blip.tv&mask=7&skin=flashvars&view=url', - video_id, 'Resolving URL for %s' % role) - real_url = compat_urlparse.parse_qs(msg.strip())['message'][0] - - media_type = media_content.get('type') - if media_type == 'text/srt' or url.endswith('.srt'): - LANGS = { - 'english': 'en', - } - lang = role.rpartition('-')[-1].strip().lower() - langcode = LANGS.get(lang, lang) - subtitles_urls[langcode] = url - elif media_type.startswith('video/'): - formats.append({ - 'url': real_url, - 'format_id': role, - 'format_note': media_type, - 'vcodec': media_content.get(_x('blip:vcodec')) or 'none', - 'acodec': media_content.get(_x('blip:acodec')), - 'filesize': media_content.get('filesize'), - 'width': int_or_none(media_content.get('width')), - 'height': int_or_none(media_content.get('height')), - }) - self._check_formats(formats, video_id) - self._sort_formats(formats) - - subtitles = self.extract_subtitles(video_id, subtitles_urls) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'timestamp': timestamp, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'duration': duration, - 'thumbnail': thumbnail, - 'categories': categories, - 'formats': formats, - 'subtitles': subtitles, - } - - def _get_subtitles(self, video_id, subtitles_urls): - subtitles = {} - for lang, url in subtitles_urls.items(): - # For some weird reason, blip.tv serves a video instead of subtitles - # when we request with a common UA - req = sanitized_Request(url) - req.add_header('User-Agent', 'youtube-dl') - subtitles[lang] = [{ - # The extension is 'srt' but it's actually an 'ass' file - 'ext': 'ass', - 'data': self._download_webpage(req, None, note=False), - }] - return subtitles - - -class BlipTVUserIE(InfoExtractor): - _VALID_URL = r'(?:(?:https?://(?:\w+\.)?blip\.tv/)|bliptvuser:)(?!api\.swf)([^/]+)/*$' - _PAGE_SIZE = 12 - IE_NAME = 'blip.tv:user' - _TEST = { - 'url': 'http://blip.tv/actone', - 'info_dict': { - 'id': 'actone', - 'title': 'Act One: The Series', - }, - 'playlist_count': 5, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - username = mobj.group(1) - - page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1' - - page = self._download_webpage(url, username, 'Downloading user page') - mobj = re.search(r'data-users-id="([^"]+)"', page) - page_base = page_base % mobj.group(1) - title = self._og_search_title(page) - - # Download video ids using BlipTV Ajax calls. Result size per - # query is limited (currently to 12 videos) so we need to query - # page by page until there are no video ids - it means we got - # all of them. - - video_ids = [] - pagenum = 1 - - while True: - url = page_base + "&page=" + str(pagenum) - page = self._download_webpage( - url, username, 'Downloading video ids from page %d' % pagenum) - - # Extract video identifiers - ids_in_page = [] - - for mobj in re.finditer(r'href="/([^"]+)"', page): - if mobj.group(1) not in ids_in_page: - ids_in_page.append(unescapeHTML(mobj.group(1))) - - video_ids.extend(ids_in_page) - - # A little optimization - if current page is not - # "full", ie. does not contain PAGE_SIZE video ids then - # we can assume that this page is the last one - there - # are no more ids on further pages - no need to query - # again. - - if len(ids_in_page) < self._PAGE_SIZE: - break - - pagenum += 1 - - urls = ['http://blip.tv/%s' % video_id for video_id in video_ids] - url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls] - return self.playlist_result( - url_entries, playlist_title=title, playlist_id=username) diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 66e394e10..e66854538 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -1,18 +1,21 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, parse_duration, + xpath_element, + xpath_text, ) class BRIE(InfoExtractor): IE_DESC = 'Bayerischer Rundfunk Mediathek' - _VALID_URL = r'https?://(?:www\.)?br\.de/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html' - _BASE_URL = 'http://www.br.de' + _VALID_URL = r'(?P<base_url>https?://(?:www\.)?br(?:-klassik)?\.de)/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html' _TESTS = [ { @@ -22,7 +25,7 @@ class BRIE(InfoExtractor): 'id': '48f656ef-287e-486f-be86-459122db22cc', 'ext': 'mp4', 'title': 'Die böse Überraschung', - 'description': 'Betriebliche Altersvorsorge: Die böse Überraschung', + 'description': 'md5:ce9ac81b466ce775b8018f6801b48ac9', 'duration': 180, 'uploader': 'Reinhard Weber', 'upload_date': '20150422', @@ -30,23 +33,23 @@ class BRIE(InfoExtractor): }, { 'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html', - 'md5': 'a44396d73ab6a68a69a568fae10705bb', + 'md5': 'af3a3a4aa43ff0ce6a89504c67f427ef', 'info_dict': { 'id': 'a4b83e34-123d-4b81-9f4e-c0d3121a4e05', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Manfred Schreiber ist tot', - 'description': 'Abendschau kompakt: Manfred Schreiber ist tot', + 'description': 'md5:b454d867f2a9fc524ebe88c3f5092d97', 'duration': 26, } }, { - 'url': 'http://www.br.de/radio/br-klassik/sendungen/allegro/premiere-urauffuehrung-the-land-2015-dance-festival-muenchen-100.html', + 'url': 'https://www.br-klassik.de/audio/peeping-tom-premierenkritik-dance-festival-muenchen-100.html', 'md5': '8b5b27c0b090f3b35eac4ab3f7a73d3d', 'info_dict': { 'id': '74c603c9-26d3-48bb-b85b-079aeed66e0b', 'ext': 'aac', 'title': 'Kurzweilig und sehr bewegend', - 'description': '"The Land" von Peeping Tom: Kurzweilig und sehr bewegend', + 'description': 'md5:0351996e3283d64adeb38ede91fac54e', 'duration': 296, } }, @@ -57,7 +60,7 @@ class BRIE(InfoExtractor): 'id': '6ba73750-d405-45d3-861d-1ce8c524e059', 'ext': 'mp4', 'title': 'Umweltbewusster Häuslebauer', - 'description': 'Uwe Erdelt: Umweltbewusster Häuslebauer', + 'description': 'md5:d52dae9792d00226348c1dbb13c9bae2', 'duration': 116, } }, @@ -68,7 +71,7 @@ class BRIE(InfoExtractor): 'id': 'd982c9ce-8648-4753-b358-98abb8aec43d', 'ext': 'mp4', 'title': 'Folge 1 - Metaphysik', - 'description': 'Kant für Anfänger: Folge 1 - Metaphysik', + 'description': 'md5:bb659990e9e59905c3d41e369db1fbe3', 'duration': 893, 'uploader': 'Eva Maria Steimle', 'upload_date': '20140117', @@ -77,28 +80,31 @@ class BRIE(InfoExtractor): ] def _real_extract(self, url): - display_id = self._match_id(url) + base_url, display_id = re.search(self._VALID_URL, url).groups() page = self._download_webpage(url, display_id) xml_url = self._search_regex( r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL') - xml = self._download_xml(self._BASE_URL + xml_url, None) + xml = self._download_xml(base_url + xml_url, display_id) medias = [] for xml_media in xml.findall('video') + xml.findall('audio'): + media_id = xml_media.get('externalId') media = { - 'id': xml_media.get('externalId'), - 'title': xml_media.find('title').text, - 'duration': parse_duration(xml_media.find('duration').text), - 'formats': self._extract_formats(xml_media.find('assets')), - 'thumbnails': self._extract_thumbnails(xml_media.find('teaserImage/variants')), - 'description': ' '.join(xml_media.find('shareTitle').text.splitlines()), - 'webpage_url': xml_media.find('permalink').text + 'id': media_id, + 'title': xpath_text(xml_media, 'title', 'title', True), + 'duration': parse_duration(xpath_text(xml_media, 'duration')), + 'formats': self._extract_formats(xpath_element( + xml_media, 'assets'), media_id), + 'thumbnails': self._extract_thumbnails(xpath_element( + xml_media, 'teaserImage/variants'), base_url), + 'description': xpath_text(xml_media, 'desc'), + 'webpage_url': xpath_text(xml_media, 'permalink'), + 'uploader': xpath_text(xml_media, 'author'), } - if xml_media.find('author').text: - media['uploader'] = xml_media.find('author').text - if xml_media.find('broadcastDate').text: - media['upload_date'] = ''.join(reversed(xml_media.find('broadcastDate').text.split('.'))) + broadcast_date = xpath_text(xml_media, 'broadcastDate') + if broadcast_date: + media['upload_date'] = ''.join(reversed(broadcast_date.split('.'))) medias.append(media) if len(medias) > 1: @@ -109,35 +115,58 @@ class BRIE(InfoExtractor): raise ExtractorError('No media entries found') return medias[0] - def _extract_formats(self, assets): - - def text_or_none(asset, tag): - elem = asset.find(tag) - return None if elem is None else elem.text - - formats = [{ - 'url': text_or_none(asset, 'downloadUrl'), - 'ext': text_or_none(asset, 'mediaType'), - 'format_id': asset.get('type'), - 'width': int_or_none(text_or_none(asset, 'frameWidth')), - 'height': int_or_none(text_or_none(asset, 'frameHeight')), - 'tbr': int_or_none(text_or_none(asset, 'bitrateVideo')), - 'abr': int_or_none(text_or_none(asset, 'bitrateAudio')), - 'vcodec': text_or_none(asset, 'codecVideo'), - 'acodec': text_or_none(asset, 'codecAudio'), - 'container': text_or_none(asset, 'mediaType'), - 'filesize': int_or_none(text_or_none(asset, 'size')), - } for asset in assets.findall('asset') - if asset.find('downloadUrl') is not None] - + def _extract_formats(self, assets, media_id): + formats = [] + for asset in assets.findall('asset'): + format_url = xpath_text(asset, ['downloadUrl', 'url']) + asset_type = asset.get('type') + if asset_type == 'HDS': + f4m_formats = self._extract_f4m_formats( + format_url + '?hdcore=3.2.0', media_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + elif asset_type == 'HLS': + m3u8_formats = self._extract_m3u8_formats( + format_url, media_id, 'mp4', 'm3u8_native', m3u8_id='hds', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + else: + format_info = { + 'ext': xpath_text(asset, 'mediaType'), + 'width': int_or_none(xpath_text(asset, 'frameWidth')), + 'height': int_or_none(xpath_text(asset, 'frameHeight')), + 'tbr': int_or_none(xpath_text(asset, 'bitrateVideo')), + 'abr': int_or_none(xpath_text(asset, 'bitrateAudio')), + 'vcodec': xpath_text(asset, 'codecVideo'), + 'acodec': xpath_text(asset, 'codecAudio'), + 'container': xpath_text(asset, 'mediaType'), + 'filesize': int_or_none(xpath_text(asset, 'size')), + } + format_url = self._proto_relative_url(format_url) + if format_url: + http_format_info = format_info.copy() + http_format_info.update({ + 'url': format_url, + 'format_id': 'http-%s' % asset_type, + }) + formats.append(http_format_info) + server_prefix = xpath_text(asset, 'serverPrefix') + if server_prefix: + rtmp_format_info = format_info.copy() + rtmp_format_info.update({ + 'url': server_prefix, + 'play_path': xpath_text(asset, 'fileName'), + 'format_id': 'rtmp-%s' % asset_type, + }) + formats.append(rtmp_format_info) self._sort_formats(formats) return formats - def _extract_thumbnails(self, variants): + def _extract_thumbnails(self, variants, base_url): thumbnails = [{ - 'url': self._BASE_URL + variant.find('url').text, - 'width': int_or_none(variant.find('width').text), - 'height': int_or_none(variant.find('height').text), - } for variant in variants.findall('variant')] + 'url': base_url + xpath_text(variant, 'url'), + 'width': int_or_none(xpath_text(variant, 'width')), + 'height': int_or_none(xpath_text(variant, 'height')), + } for variant in variants.findall('variant') if xpath_text(variant, 'url')] thumbnails.sort(key=lambda x: x['width'] * x['height'], reverse=True) return thumbnails diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index f5ebae1e6..03a4f446e 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -355,7 +355,7 @@ class BrightcoveLegacyIE(InfoExtractor): class BrightcoveNewIE(InfoExtractor): IE_NAME = 'brightcove:new' - _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>\d+)' + _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>(?:ref:)?\d+)' _TESTS = [{ 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', 'md5': 'c8100925723840d4b0d243f7025703be', @@ -387,14 +387,24 @@ class BrightcoveNewIE(InfoExtractor): 'params': { 'skip_download': True, } + }, { + # ref: prefixed video id + 'url': 'http://players.brightcove.net/3910869709001/21519b5c-4b3b-4363-accb-bdc8f358f823_default/index.html?videoId=ref:7069442', + 'only_matching': True, }] @staticmethod + def _extract_url(webpage): + urls = BrightcoveNewIE._extract_urls(webpage) + return urls[0] if urls else None + + @staticmethod def _extract_urls(webpage): # Reference: # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe - # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript) + # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/embed-in-page.html + # 4. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player entries = [] @@ -407,9 +417,10 @@ class BrightcoveNewIE(InfoExtractor): for video_id, account_id, player_id, embed in re.findall( # According to examples from [3] it's unclear whether video id # may be optional and what to do when it is + # According to [4] data-video-id may be prefixed with ref: r'''(?sx) <video[^>]+ - data-video-id=["\'](\d+)["\'][^>]*>.*? + data-video-id=["\']((?:ref:)?\d+)["\'][^>]*>.*? </video>.*? <script[^>]+ src=["\'](?:https?:)?//players\.brightcove\.net/ diff --git a/youtube_dl/extractor/canal13cl.py b/youtube_dl/extractor/canal13cl.py deleted file mode 100644 index 93241fefe..000000000 --- a/youtube_dl/extractor/canal13cl.py +++ /dev/null @@ -1,48 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class Canal13clIE(InfoExtractor): - _VALID_URL = r'^http://(?:www\.)?13\.cl/(?:[^/?#]+/)*(?P<id>[^/?#]+)' - _TEST = { - 'url': 'http://www.13.cl/t13/nacional/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', - 'md5': '4cb1fa38adcad8fea88487a078831755', - 'info_dict': { - 'id': '1403022125', - 'display_id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', - 'ext': 'mp4', - 'title': 'El "círculo de hierro" de Michelle Bachelet en su regreso a La Moneda', - 'description': '(Foto: Agencia Uno) En nueve días más, Michelle Bachelet va a asumir por segunda vez como presidenta de la República. Entre aquellos que la acompañarán hay caras que se repiten y otras que se consolidan en su entorno de colaboradores más cercanos.', - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('id') - - webpage = self._download_webpage(url, display_id) - - title = self._html_search_meta( - 'twitter:title', webpage, 'title', fatal=True) - description = self._html_search_meta( - 'twitter:description', webpage, 'description') - url = self._html_search_regex( - r'articuloVideo = \"(.*?)\"', webpage, 'url') - real_id = self._search_regex( - r'[^0-9]([0-9]{7,})[^0-9]', url, 'id', default=display_id) - thumbnail = self._html_search_regex( - r'articuloImagen = \"(.*?)\"', webpage, 'thumbnail') - - return { - 'id': real_id, - 'display_id': display_id, - 'url': url, - 'title': title, - 'description': description, - 'ext': 'mp4', - 'thumbnail': thumbnail, - } diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py index 0b67ba67d..242fba311 100644 --- a/youtube_dl/extractor/chaturbate.py +++ b/youtube_dl/extractor/chaturbate.py @@ -23,6 +23,8 @@ class ChaturbateIE(InfoExtractor): 'only_matching': True, }] + _ROOM_OFFLINE = 'Room is currently offline' + def _real_extract(self, url): video_id = self._match_id(url) @@ -34,9 +36,16 @@ class ChaturbateIE(InfoExtractor): if not m3u8_url: error = self._search_regex( - r'<span[^>]+class=(["\'])desc_span\1[^>]*>(?P<error>[^<]+)</span>', - webpage, 'error', group='error') - raise ExtractorError(error, expected=True) + [r'<span[^>]+class=(["\'])desc_span\1[^>]*>(?P<error>[^<]+)</span>', + r'<div[^>]+id=(["\'])defchat\1[^>]*>\s*<p><strong>(?P<error>[^<]+)<'], + webpage, 'error', group='error', default=None) + if not error: + if any(p not in webpage for p in ( + self._ROOM_OFFLINE, 'offline_tipping', 'tip_offline')): + error = self._ROOM_OFFLINE + if error: + raise ExtractorError(error, expected=True) + raise ExtractorError('Unable to find stream URL') formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index fd1770dac..6d9cd8abd 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..utils import ExtractorError -from .bliptv import BlipTVIE from .screenwavemedia import ScreenwaveMediaIE @@ -34,18 +33,17 @@ class CinemassacreIE(InfoExtractor): }, }, { - # blip.tv embedded video + # Youtube embedded video 'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/', - 'md5': 'ca9b3c8dd5a66f9375daeb5135f5a3de', + 'md5': 'df4cf8a1dcedaec79a73d96d83b99023', 'info_dict': { - 'id': '4065369', - 'ext': 'flv', + 'id': 'OEVzPCY2T-g', + 'ext': 'mp4', 'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles', 'upload_date': '20061207', - 'uploader': 'cinemassacre', - 'uploader_id': '250778', - 'timestamp': 1283233867, - 'description': 'md5:0a108c78d130676b207d0f6d029ecffd', + 'uploader': 'Cinemassacre', + 'uploader_id': 'JamesNintendoNerd', + 'description': 'md5:784734696c2b8b7f4b8625cc799e07f6', } }, { @@ -89,8 +87,6 @@ class CinemassacreIE(InfoExtractor): ], webpage, 'player data URL', default=None, group='url') if not playerdata_url: - playerdata_url = BlipTVIE._extract_url(webpage) - if not playerdata_url: raise ExtractorError('Unable to find player data') video_title = self._html_search_regex( diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index 5dd69bff7..5c3908f72 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cnet.py @@ -1,15 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -import json +from .theplatform import ThePlatformIE +from ..utils import int_or_none -from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) - -class CNETIE(InfoExtractor): +class CNETIE(ThePlatformIE): _VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/' _TESTS = [{ 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', @@ -18,25 +14,20 @@ class CNETIE(InfoExtractor): 'ext': 'flv', 'title': 'Hands-on with Microsoft Windows 8.1 Update', 'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.', - 'thumbnail': 're:^http://.*/flmswindows8.jpg$', 'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861', 'uploader': 'Sarah Mitroff', + 'duration': 70, }, - 'params': { - 'skip_download': 'requires rtmpdump', - } }, { 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', 'info_dict': { 'id': '56527b93-d25d-44e3-b738-f989ce2e49ba', 'ext': 'flv', + 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', 'description': 'Khail and Ashley wonder what other civic woes can be solved by self-tweeting objects, investigate a new kind of VR camera and watch an origami robot self-assemble, walk, climb, dig and dissolve. #TDPothole', 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', 'uploader': 'Ashley Esqueda', - 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', - }, - 'params': { - 'skip_download': True, # requires rtmpdump + 'duration': 1482, }, }] @@ -45,26 +36,13 @@ class CNETIE(InfoExtractor): webpage = self._download_webpage(url, display_id) data_json = self._html_search_regex( - r"<div class=\"cnetVideoPlayer\"\s+.*?data-cnet-video-options='([^']+)'", + r"data-cnet-video(?:-uvp)?-options='([^']+)'", webpage, 'data json') - data = json.loads(data_json) - vdata = data['video'] - if not vdata: - vdata = data['videos'][0] - if not vdata: - raise ExtractorError('Cannot find video data') - - mpx_account = data['config']['players']['default']['mpx_account'] - vid = vdata['files'].get('rtmp', vdata['files']['hds']) - tp_link = 'http://link.theplatform.com/s/%s/%s' % (mpx_account, vid) + data = self._parse_json(data_json, display_id) + vdata = data.get('video') or data['videos'][0] video_id = vdata['id'] - title = vdata.get('headline') - if title is None: - title = vdata.get('title') - if title is None: - raise ExtractorError('Cannot find title!') - thumbnail = vdata.get('image', {}).get('path') + title = vdata['title'] author = vdata.get('author') if author: uploader = '%s %s' % (author['firstName'], author['lastName']) @@ -73,13 +51,34 @@ class CNETIE(InfoExtractor): uploader = None uploader_id = None + mpx_account = data['config']['uvpConfig']['default']['mpx_account'] + + metadata = self.get_metadata('%s/%s' % (mpx_account, list(vdata['files'].values())[0]), video_id) + description = vdata.get('description') or metadata.get('description') + duration = int_or_none(vdata.get('duration')) or metadata.get('duration') + + formats = [] + subtitles = {} + for (fkey, vid) in vdata['files'].items(): + if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']: + continue + release_url = 'http://link.theplatform.com/s/%s/%s?format=SMIL&mbr=true' % (mpx_account, vid) + if fkey == 'hds': + release_url += '&manifest=f4m' + tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % fkey) + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + self._sort_formats(formats) + return { - '_type': 'url_transparent', - 'url': tp_link, 'id': video_id, 'display_id': display_id, 'title': title, + 'description': description, + 'thumbnail': metadata.get('thumbnail'), + 'duration': duration, 'uploader': uploader, 'uploader_id': uploader_id, - 'thumbnail': thumbnail, + 'subtitles': subtitles, + 'formats': formats, } diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py index 81f3d7697..2efa200b5 100644 --- a/youtube_dl/extractor/comcarcoff.py +++ b/youtube_dl/extractor/comcarcoff.py @@ -1,10 +1,12 @@ # encoding: utf-8 from __future__ import unicode_literals -import json - from .common import InfoExtractor -from ..utils import parse_iso8601 +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601, +) class ComCarCoffIE(InfoExtractor): @@ -16,6 +18,7 @@ class ComCarCoffIE(InfoExtractor): 'ext': 'mp4', 'upload_date': '20141127', 'timestamp': 1417107600, + 'duration': 1232, 'title': 'Happy Thanksgiving Miranda', 'description': 'Jerry Seinfeld and his special guest Miranda Sings cruise around town in search of coffee, complaining and apologizing along the way.', 'thumbnail': 'http://ccc.crackle.com/images/s5e4_thumb.jpg', @@ -31,9 +34,10 @@ class ComCarCoffIE(InfoExtractor): display_id = 'comediansincarsgettingcoffee.com' webpage = self._download_webpage(url, display_id) - full_data = json.loads(self._search_regex( - r'<script type="application/json" id="videoData">(?P<json>.+?)</script>', - webpage, 'full data json')) + full_data = self._parse_json( + self._search_regex( + r'window\.app\s*=\s*({.+?});\n', webpage, 'full data json'), + display_id)['videoData'] video_id = full_data['activeVideo']['video'] video_data = full_data.get('videos', {}).get(video_id) or full_data['singleshots'][video_id] @@ -45,12 +49,18 @@ class ComCarCoffIE(InfoExtractor): formats = self._extract_m3u8_formats( video_data['mediaUrl'], video_id, ext='mp4') + timestamp = int_or_none(video_data.get('pubDateTime')) or parse_iso8601( + video_data.get('pubDate')) + duration = int_or_none(video_data.get('durationSeconds')) or parse_duration( + video_data.get('duration')) + return { 'id': video_id, 'display_id': display_id, 'title': video_data['title'], 'description': video_data.get('description'), - 'timestamp': parse_iso8601(video_data.get('pubDate')), + 'timestamp': timestamp, + 'duration': duration, 'thumbnails': thumbnails, 'formats': formats, 'webpage_url': 'http://comediansincarsgettingcoffee.com/%s' % (video_data.get('urlSlug', video_data.get('slug'))), diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 6ab2d68d6..828f58f12 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -30,6 +30,7 @@ from ..utils import ( clean_html, compiled_regex_type, determine_ext, + error_to_compat_str, ExtractorError, fix_xml_ampersands, float_or_none, @@ -332,7 +333,8 @@ class InfoExtractor(object): return False if errnote is None: errnote = 'Unable to download webpage' - errmsg = '%s: %s' % (errnote, compat_str(err)) + + errmsg = '%s: %s' % (errnote, error_to_compat_str(err)) if fatal: raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) else: @@ -622,7 +624,7 @@ class InfoExtractor(object): else: raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError) as err: - self._downloader.report_warning('parsing .netrc: %s' % compat_str(err)) + self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err)) return (username, password) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 428556213..0c5b6617f 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -7,10 +7,10 @@ import itertools from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( - ExtractorError, determine_ext, + error_to_compat_str, + ExtractorError, int_or_none, parse_iso8601, sanitized_Request, @@ -278,7 +278,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id, video_id, note=False) except ExtractorError as err: - self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err)) + self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err)) return {} info = json.loads(sub_list) if (info['total'] > 0): diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index 934da765e..9a94cf361 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -25,6 +25,18 @@ class DaumIE(InfoExtractor): 'duration': 3868, }, }, { + # Test for https://github.com/rg3/youtube-dl/issues/7949 + 'url': 'http://tvpot.daum.net/mypot/View.do?ownerid=M1O35s8HPOo0&clipid=73147290', + 'md5': 'c92d78bcee4424451f1667f275c1dc97', + 'info_dict': { + 'id': '73147290', + 'ext': 'mp4', + 'title': '싸이 - 나팔바지 [유희열의 스케치북] 299회 20151218', + 'description': '싸이 - 나팔바지', + 'upload_date': '20151219', + 'duration': 232, + }, + }, { 'url': 'http://tvpot.daum.net/v/vab4dyeDBysyBssyukBUjBz', 'only_matching': True, }, { @@ -37,9 +49,11 @@ class DaumIE(InfoExtractor): video_id = mobj.group('id') canonical_url = 'http://tvpot.daum.net/v/%s' % video_id webpage = self._download_webpage(canonical_url, video_id) + og_url = self._og_search_url(webpage, default=None) or self._search_regex( + r'<link[^>]+rel=(["\'])canonical\1[^>]+href=(["\'])(?P<url>.+?)\2', + webpage, 'canonical url', group='url') full_id = self._search_regex( - r'src=["\']http://videofarm\.daum\.net/controller/video/viewer/Video\.html\?.*?vid=(.+?)[&"\']', - webpage, 'full id') + r'tvpot\.daum\.net/v/([^/]+)', og_url, 'full id') query = compat_urllib_parse.urlencode({'vid': full_id}) info = self._download_xml( 'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id, diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index d836c1a6c..60ed438f8 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import itertools -from .common import InfoExtractor +from .amp import AMPIE from ..compat import ( compat_HTTPError, compat_urllib_parse, @@ -12,14 +12,11 @@ from ..compat import ( from ..utils import ( ExtractorError, clean_html, - determine_ext, - int_or_none, - parse_iso8601, sanitized_Request, ) -class DramaFeverBaseIE(InfoExtractor): +class DramaFeverBaseIE(AMPIE): _LOGIN_URL = 'https://www.dramafever.com/accounts/login/' _NETRC_MACHINE = 'dramafever' @@ -80,60 +77,25 @@ class DramaFeverIE(DramaFeverBaseIE): 'timestamp': 1404336058, 'upload_date': '20140702', 'duration': 343, - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): video_id = self._match_id(url).replace('/', '.') try: - feed = self._download_json( - 'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id, - video_id, 'Downloading episode JSON')['channel']['item'] + info = self._extract_feed_info( + 'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError): raise ExtractorError( 'Currently unavailable in your country.', expected=True) raise - media_group = feed.get('media-group', {}) - - formats = [] - for media_content in media_group['media-content']: - src = media_content.get('@attributes', {}).get('url') - if not src: - continue - ext = determine_ext(src) - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - src, video_id, f4m_id='hds')) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', m3u8_id='hls')) - else: - formats.append({ - 'url': src, - }) - self._sort_formats(formats) - - title = media_group.get('media-title') - description = media_group.get('media-description') - duration = int_or_none(media_group['media-content'][0].get('@attributes', {}).get('duration')) - thumbnail = self._proto_relative_url( - media_group.get('media-thumbnail', {}).get('@attributes', {}).get('url')) - timestamp = parse_iso8601(feed.get('pubDate'), ' ') - - subtitles = {} - for media_subtitle in media_group.get('media-subTitle', []): - lang = media_subtitle.get('@attributes', {}).get('lang') - href = media_subtitle.get('@attributes', {}).get('href') - if not lang or not href: - continue - subtitles[lang] = [{ - 'ext': 'ttml', - 'url': href, - }] - series_id, episode_number = video_id.split('.') episode_info = self._download_json( # We only need a single episode info, so restricting page size to one episode @@ -146,21 +108,12 @@ class DramaFeverIE(DramaFeverBaseIE): if value: subfile = value[0].get('subfile') or value[0].get('new_subfile') if subfile and subfile != 'http://www.dramafever.com/st/': - subtitles.setdefault('English', []).append({ + info['subtitiles'].setdefault('English', []).append({ 'ext': 'srt', 'url': subfile, }) - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } + return info class DramaFeverSeriesIE(DramaFeverBaseIE): diff --git a/youtube_dl/extractor/ellentv.py b/youtube_dl/extractor/ellentv.py index 02c6a4615..476cce2d0 100644 --- a/youtube_dl/extractor/ellentv.py +++ b/youtube_dl/extractor/ellentv.py @@ -13,12 +13,12 @@ class EllenTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?:ellentv|ellentube)\.com/videos/(?P<id>[a-z0-9_-]+)' _TEST = { 'url': 'http://www.ellentv.com/videos/0-ipq1gsai/', - 'md5': '8e3c576bf2e9bfff4d76565f56f94c9c', + 'md5': '4294cf98bc165f218aaa0b89e0fd8042', 'info_dict': { 'id': '0_ipq1gsai', - 'ext': 'mp4', + 'ext': 'mov', 'title': 'Fast Fingers of Fate', - 'description': 'md5:587e79fbbd0d73b148bc596d99ce48e6', + 'description': 'md5:3539013ddcbfa64b2a6d1b38d910868a', 'timestamp': 1428035648, 'upload_date': '20150403', 'uploader_id': 'batchUser', diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 321eec59e..39c481068 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -7,11 +7,11 @@ import socket from .common import InfoExtractor from ..compat import ( compat_http_client, - compat_str, compat_urllib_error, compat_urllib_parse_unquote, ) from ..utils import ( + error_to_compat_str, ExtractorError, limit_length, sanitized_Request, @@ -116,7 +116,7 @@ class FacebookIE(InfoExtractor): if re.search(r'id="checkpointSubmitButton"', check_response) is not None: self._downloader.report_warning('Unable to confirm login, you have to login in your brower and authorize the login.') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning('unable to log in: %s' % compat_str(err)) + self._downloader.report_warning('unable to log in: %s' % error_to_compat_str(err)) return def _real_initialize(self): diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py index cebdd0193..6f9b003c2 100644 --- a/youtube_dl/extractor/faz.py +++ b/youtube_dl/extractor/faz.py @@ -2,6 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ( + xpath_element, + xpath_text, + int_or_none, +) class FazIE(InfoExtractor): @@ -37,31 +42,32 @@ class FazIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + description = self._og_search_description(webpage) config_xml_url = self._search_regex( - r'writeFLV\(\'(.+?)\',', webpage, 'config xml url') + r'videoXMLURL\s*=\s*"([^"]+)', webpage, 'config xml url') config = self._download_xml( config_xml_url, video_id, 'Downloading config xml') - encodings = config.find('ENCODINGS') + encodings = xpath_element(config, 'ENCODINGS', 'encodings', True) formats = [] for pref, code in enumerate(['LOW', 'HIGH', 'HQ']): - encoding = encodings.find(code) - if encoding is None: - continue - encoding_url = encoding.find('FILENAME').text - formats.append({ - 'url': encoding_url, - 'format_id': code.lower(), - 'quality': pref, - }) + encoding = xpath_element(encodings, code) + if encoding: + encoding_url = xpath_text(encoding, 'FILENAME') + if encoding_url: + formats.append({ + 'url': encoding_url, + 'format_id': code.lower(), + 'quality': pref, + 'tbr': int_or_none(xpath_text(encoding, 'AVERAGEBITRATE')), + }) self._sort_formats(formats) - descr = self._html_search_regex( - r'<p class="Content Copy">(.*?)</p>', webpage, 'description', fatal=False) return { 'id': video_id, 'title': self._og_search_title(webpage), 'formats': formats, - 'description': descr, - 'thumbnail': config.find('STILL/STILL_BIG').text, + 'description': description.strip() if description else None, + 'thumbnail': xpath_text(config, 'STILL/STILL_BIG'), + 'duration': int_or_none(xpath_text(config, 'DURATION')), } diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py index 40ea27895..5f6e65dae 100644 --- a/youtube_dl/extractor/fktv.py +++ b/youtube_dl/extractor/fktv.py @@ -1,12 +1,10 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( clean_html, determine_ext, - ExtractorError, + js_to_json, ) @@ -32,24 +30,22 @@ class FKTVIE(InfoExtractor): 'http://fernsehkritik.tv/folge-%s/play' % episode, episode) title = clean_html(self._html_search_regex( '<h3>([^<]+)</h3>', webpage, 'title')) - matches = re.search( - r'(?s)<video(?:(?!poster)[^>])+(?:poster="([^"]+)")?[^>]*>(.*)</video>', - webpage) - if matches is None: - raise ExtractorError('Unable to extract the video') - - poster, sources = matches.groups() - if poster is None: - self.report_warning('unable to extract thumbnail') - - urls = re.findall(r'<source[^>]+src="([^"]+)"', sources) - formats = [{ - 'url': furl, - 'format_id': determine_ext(furl), - } for furl in urls] + thumbnail = self._search_regex(r'POSTER\s*=\s*"([^"]+)', webpage, 'thumbnail', fatal=False) + sources = self._parse_json(self._search_regex(r'(?s)MEDIA\s*=\s*(\[.+?\]);', webpage, 'media'), episode, js_to_json) + + formats = [] + for source in sources: + furl = source.get('src') + if furl: + formats.append({ + 'url': furl, + 'format_id': determine_ext(furl), + }) + self._sort_formats(formats) + return { 'id': episode, 'title': title, 'formats': formats, - 'thumbnail': poster, + 'thumbnail': thumbnail, } diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 91cd46e76..18f439df9 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -1,67 +1,93 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, - find_xpath_attr, - sanitized_Request, + int_or_none, + qualities, ) class FlickrIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.|secure\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*' + _VALID_URL = r'https?://(?:www\.|secure\.)?flickr\.com/photos/[\w\-_@]+/(?P<id>\d+)' _TEST = { 'url': 'http://www.flickr.com/photos/forestwander-nature-pictures/5645318632/in/photostream/', - 'md5': '6fdc01adbc89d72fc9c4f15b4a4ba87b', + 'md5': '164fe3fa6c22e18d448d4d5af2330f31', 'info_dict': { 'id': '5645318632', - 'ext': 'mp4', - "description": "Waterfalls in the Springtime at Dark Hollow Waterfalls. These are located just off of Skyline Drive in Virginia. They are only about 6/10 of a mile hike but it is a pretty steep hill and a good climb back up.", - "uploader_id": "forestwander-nature-pictures", - "title": "Dark Hollow Waterfalls" + 'ext': 'mpg', + 'description': 'Waterfalls in the Springtime at Dark Hollow Waterfalls. These are located just off of Skyline Drive in Virginia. They are only about 6/10 of a mile hike but it is a pretty steep hill and a good climb back up.', + 'title': 'Dark Hollow Waterfalls', + 'duration': 19, + 'timestamp': 1303528740, + 'upload_date': '20110423', + 'uploader_id': '10922353@N03', + 'uploader': 'Forest Wander', + 'comment_count': int, + 'view_count': int, + 'tags': list, } } - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + _API_BASE_URL = 'https://api.flickr.com/services/rest?' - video_id = mobj.group('id') - video_uploader_id = mobj.group('uploader_id') - webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id - req = sanitized_Request(webpage_url) - req.add_header( - 'User-Agent', - # it needs a more recent version - 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20150101 Firefox/38.0 (Chrome)') - webpage = self._download_webpage(req, video_id) + def _call_api(self, method, video_id, api_key, note, secret=None): + query = { + 'photo_id': video_id, + 'method': 'flickr.%s' % method, + 'api_key': api_key, + 'format': 'json', + 'nojsoncallback': 1, + } + if secret: + query['secret'] = secret + data = self._download_json(self._API_BASE_URL + compat_urllib_parse.urlencode(query), video_id, note) + if data['stat'] != 'ok': + raise ExtractorError(data['message']) + return data - secret = self._search_regex(r'secret"\s*:\s*"(\w+)"', webpage, 'secret') + def _real_extract(self, url): + video_id = self._match_id(url) - first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self' - first_xml = self._download_xml(first_url, video_id, 'Downloading first data webpage') + api_key = self._download_json( + 'https://www.flickr.com/hermes_error_beacon.gne', video_id, + 'Downloading api key')['site_key'] - node_id = find_xpath_attr( - first_xml, './/{http://video.yahoo.com/YEP/1.0/}Item', 'id', - 'id').text + video_info = self._call_api( + 'photos.getInfo', video_id, api_key, 'Downloading video info')['photo'] + if video_info['media'] == 'video': + streams = self._call_api( + 'video.getStreamInfo', video_id, api_key, + 'Downloading streams info', video_info['secret'])['streams'] - second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1' - second_xml = self._download_xml(second_url, video_id, 'Downloading second data webpage') + preference = qualities( + ['288p', 'iphone_wifi', '100', '300', '700', '360p', 'appletv', '720p', '1080p', 'orig']) - self.report_extraction(video_id) + formats = [] + for stream in streams['stream']: + stream_type = str(stream.get('type')) + formats.append({ + 'format_id': stream_type, + 'url': stream['_content'], + 'preference': preference(stream_type), + }) + self._sort_formats(formats) - stream = second_xml.find('.//STREAM') - if stream is None: - raise ExtractorError('Unable to extract video url') - video_url = stream.attrib['APP'] + stream.attrib['FULLPATH'] + owner = video_info.get('owner', {}) - return { - 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'uploader_id': video_uploader_id, - } + return { + 'id': video_id, + 'title': video_info['title']['_content'], + 'description': video_info.get('description', {}).get('_content'), + 'formats': formats, + 'timestamp': int_or_none(video_info.get('dateuploaded')), + 'duration': int_or_none(video_info.get('video', {}).get('duration')), + 'uploader_id': owner.get('nsid'), + 'uploader': owner.get('realname'), + 'comment_count': int_or_none(video_info.get('comments', {}).get('_content')), + 'view_count': int_or_none(video_info.get('views')), + 'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])] + } + else: + raise ExtractorError('not a video', expected=True) diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 3a4a59135..318ac013d 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -2,14 +2,10 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor -from ..utils import ( - parse_iso8601, - int_or_none, -) +from .amp import AMPIE -class FoxNewsIE(InfoExtractor): +class FoxNewsIE(AMPIE): IE_DESC = 'Fox News and Fox Business Video' _VALID_URL = r'https?://(?P<host>video\.fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)' _TESTS = [ @@ -20,10 +16,10 @@ class FoxNewsIE(InfoExtractor): 'id': '3937480', 'ext': 'flv', 'title': 'Frozen in Time', - 'description': 'Doctors baffled by 16-year-old girl that is the size of a toddler', + 'description': '16-year-old girl is size of toddler', 'duration': 265, - 'timestamp': 1304411491, - 'upload_date': '20110503', + # 'timestamp': 1304411491, + # 'upload_date': '20110503', 'thumbnail': 're:^https?://.*\.jpg$', }, }, @@ -34,10 +30,10 @@ class FoxNewsIE(InfoExtractor): 'id': '3922535568001', 'ext': 'mp4', 'title': "Rep. Luis Gutierrez on if Obama's immigration plan is legal", - 'description': "Congressman discusses the president's executive action", + 'description': "Congressman discusses president's plan", 'duration': 292, - 'timestamp': 1417662047, - 'upload_date': '20141204', + # 'timestamp': 1417662047, + # 'upload_date': '20141204', 'thumbnail': 're:^https?://.*\.jpg$', }, }, @@ -52,52 +48,9 @@ class FoxNewsIE(InfoExtractor): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - host = mobj.group('host') + host, video_id = re.match(self._VALID_URL, url).groups() - video = self._download_json( - 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id), video_id) - - item = video['channel']['item'] - title = item['title'] - description = item['description'] - timestamp = parse_iso8601(item['dc-date']) - - media_group = item['media-group'] - duration = None - formats = [] - for media in media_group['media-content']: - attributes = media['@attributes'] - video_url = attributes['url'] - if video_url.endswith('.f4m'): - formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id)) - elif video_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(video_url, video_id, 'flv')) - elif not video_url.endswith('.smil'): - duration = int_or_none(attributes.get('duration')) - formats.append({ - 'url': video_url, - 'format_id': media['media-category']['@attributes']['label'], - 'preference': 1, - 'vbr': int_or_none(attributes.get('bitrate')), - 'filesize': int_or_none(attributes.get('fileSize')) - }) - self._sort_formats(formats) - - media_thumbnail = media_group['media-thumbnail']['@attributes'] - thumbnails = [{ - 'url': media_thumbnail['url'], - 'width': int_or_none(media_thumbnail.get('width')), - 'height': int_or_none(media_thumbnail.get('height')), - }] if media_thumbnail else [] - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - 'thumbnails': thumbnails, - } + info = self._extract_feed_info( + 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id)) + info['id'] = video_id + return info diff --git a/youtube_dl/extractor/franceinter.py b/youtube_dl/extractor/franceinter.py index 6613ee17a..fdc51f44f 100644 --- a/youtube_dl/extractor/franceinter.py +++ b/youtube_dl/extractor/franceinter.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import int_or_none @@ -23,8 +21,7 @@ class FranceInterIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -33,7 +30,7 @@ class FranceInterIE(InfoExtractor): video_url = 'http://www.franceinter.fr/' + path title = self._html_search_regex( - r'<span class="title">(.+?)</span>', webpage, 'title') + r'<span class="title-diffusion">(.+?)</span>', webpage, 'title') description = self._html_search_regex( r'<span class="description">(.*?)</span>', webpage, 'description', fatal=False) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c2e8f9b62..3c3066e38 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -44,7 +44,6 @@ from .myvi import MyviIE from .condenast import CondeNastIE from .udn import UDNEmbedIE from .senateisvp import SenateISVPIE -from .bliptv import BlipTVIE from .svt import SVTIE from .pornhub import PornHubIE from .xhamster import XHamsterEmbedIE @@ -55,6 +54,8 @@ from .snagfilms import SnagFilmsEmbedIE from .screenwavemedia import ScreenwaveMediaIE from .mtv import MTVServicesEmbeddedIE from .pladform import PladformIE +from .googledrive import GoogleDriveIE +from .jwplatform import JWPlatformIE class GenericIE(InfoExtractor): @@ -1440,11 +1441,6 @@ class GenericIE(InfoExtractor): 'id': match.group('id') } - # Look for embedded blip.tv player - bliptv_url = BlipTVIE._extract_url(webpage) - if bliptv_url: - return self.url_result(bliptv_url, 'BlipTV') - # Look for SVT player svt_url = SVTIE._extract_url(webpage) if svt_url: @@ -1769,6 +1765,11 @@ class GenericIE(InfoExtractor): if nbc_sports_url: return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') + # Look for Google Drive embeds + google_drive_url = GoogleDriveIE._extract_url(webpage) + if google_drive_url: + return self.url_result(google_drive_url, 'GoogleDrive') + # Look for UDN embeds mobj = re.search( r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage) @@ -1796,6 +1797,11 @@ class GenericIE(InfoExtractor): if snagfilms_url: return self.url_result(snagfilms_url) + # Look for JWPlatform embeds + jwplatform_url = JWPlatformIE._extract_url(webpage) + if jwplatform_url: + return self.url_result(jwplatform_url, 'JWPlatform') + # Look for ScreenwaveMedia embeds mobj = re.search(ScreenwaveMediaIE.EMBED_PATTERN, webpage) if mobj is not None: diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py new file mode 100644 index 000000000..f354c9c7a --- /dev/null +++ b/youtube_dl/extractor/googledrive.py @@ -0,0 +1,88 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, +) + + +class GoogleDriveIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28})' + _TEST = { + 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', + 'md5': '881f7700aec4f538571fa1e0eed4a7b6', + 'info_dict': { + 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ', + 'ext': 'mp4', + 'title': 'Big Buck Bunny.mp4', + 'duration': 46, + } + } + _FORMATS_EXT = { + '5': 'flv', + '6': 'flv', + '13': '3gp', + '17': '3gp', + '18': 'mp4', + '22': 'mp4', + '34': 'flv', + '35': 'flv', + '36': '3gp', + '37': 'mp4', + '38': 'mp4', + '43': 'webm', + '44': 'webm', + '45': 'webm', + '46': 'webm', + '59': 'mp4', + } + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})', + webpage) + if mobj: + return 'https://drive.google.com/file/d/%s' % mobj.group('id') + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://docs.google.com/file/d/%s' % video_id, video_id, encoding='unicode_escape') + + reason = self._search_regex(r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) + if reason: + raise ExtractorError(reason) + + title = self._search_regex(r'"title"\s*,\s*"([^"]+)', webpage, 'title') + duration = int_or_none(self._search_regex( + r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', default=None)) + fmt_stream_map = self._search_regex( + r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, 'fmt stream map').split(',') + fmt_list = self._search_regex(r'"fmt_list"\s*,\s*"([^"]+)', webpage, 'fmt_list').split(',') + + formats = [] + for fmt, fmt_stream in zip(fmt_list, fmt_stream_map): + fmt_id, fmt_url = fmt_stream.split('|') + resolution = fmt.split('/')[1] + width, height = resolution.split('x') + formats.append({ + 'url': fmt_url, + 'format_id': fmt_id, + 'resolution': resolution, + 'width': int_or_none(width), + 'height': int_or_none(height), + 'ext': self._FORMATS_EXT[fmt_id], + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': self._og_search_thumbnail(webpage), + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/gputechconf.py b/youtube_dl/extractor/gputechconf.py new file mode 100644 index 000000000..145b55bf3 --- /dev/null +++ b/youtube_dl/extractor/gputechconf.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + xpath_element, + xpath_text, + int_or_none, + parse_duration, +) + + +class GPUTechConfIE(InfoExtractor): + _VALID_URL = r'https?://on-demand\.gputechconf\.com/gtc/2015/video/S(?P<id>\d+)\.html' + _TEST = { + 'url': 'http://on-demand.gputechconf.com/gtc/2015/video/S5156.html', + 'md5': 'a8862a00a0fd65b8b43acc5b8e33f798', + 'info_dict': { + 'id': '5156', + 'ext': 'mp4', + 'title': 'Coordinating More Than 3 Million CUDA Threads for Social Network Analysis', + 'duration': 1219, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + root_path = self._search_regex(r'var\s+rootPath\s*=\s*"([^"]+)', webpage, 'root path', 'http://evt.dispeak.com/nvidia/events/gtc15/') + xml_file_id = self._search_regex(r'var\s+xmlFileId\s*=\s*"([^"]+)', webpage, 'xml file id') + + doc = self._download_xml('%sxml/%s.xml' % (root_path, xml_file_id), video_id) + + metadata = xpath_element(doc, 'metadata') + http_host = xpath_text(metadata, 'httpHost', 'http host', True) + mbr_videos = xpath_element(metadata, 'MBRVideos') + + formats = [] + for mbr_video in mbr_videos.findall('MBRVideo'): + stream_name = xpath_text(mbr_video, 'streamName') + if stream_name: + formats.append({ + 'url': 'http://%s/%s' % (http_host, stream_name.replace('mp4:', '')), + 'tbr': int_or_none(xpath_text(mbr_video, 'bitrate')), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': xpath_text(metadata, 'title'), + 'duration': parse_duration(xpath_text(metadata, 'endTime')), + 'creator': xpath_text(metadata, 'speaker'), + 'formats': formats, + } diff --git a/youtube_dl/extractor/hotstar.py b/youtube_dl/extractor/hotstar.py new file mode 100644 index 000000000..05d27e75d --- /dev/null +++ b/youtube_dl/extractor/hotstar.py @@ -0,0 +1,79 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + determine_ext, + int_or_none, +) + + +class HotStarIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hotstar\.com/.*?[/-](?P<id>\d{10})' + _TEST = { + 'url': 'http://www.hotstar.com/on-air-with-aib--english-1000076273', + 'info_dict': { + 'id': '1000076273', + 'ext': 'mp4', + 'title': 'On Air With AIB - English', + 'description': 'md5:c957d8868e9bc793ccb813691cc4c434', + 'timestamp': 1447227000, + 'upload_date': '20151111', + 'duration': 381, + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + } + + _GET_CONTENT_TEMPLATE = 'http://account.hotstar.com/AVS/besc?action=GetAggregatedContentDetails&channel=PCTV&contentId=%s' + _GET_CDN_TEMPLATE = 'http://getcdn.hotstar.com/AVS/besc?action=GetCDN&asJson=Y&channel=%s&id=%s&type=%s' + + def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', fatal=True): + json_data = super(HotStarIE, self)._download_json(url_or_request, video_id, note, fatal=fatal) + if json_data['resultCode'] != 'OK': + if fatal: + raise ExtractorError(json_data['errorDescription']) + return None + return json_data['resultObj'] + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json( + self._GET_CONTENT_TEMPLATE % video_id, + video_id)['contentInfo'][0] + + formats = [] + # PCTV for extracting f4m manifest + for f in ('TABLET',): + format_data = self._download_json( + self._GET_CDN_TEMPLATE % (f, video_id, 'VOD'), + video_id, 'Downloading %s JSON metadata' % f, fatal=False) + if format_data: + format_url = format_data['src'] + ext = determine_ext(format_url) + if ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats(format_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + elif ext == 'f4m': + # produce broken files + continue + else: + formats.append({ + 'url': format_url, + 'width': int_or_none(format_data.get('width')), + 'height': int_or_none(format_data.get('height')), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_data['episodeTitle'], + 'description': video_data.get('description'), + 'duration': int_or_none(video_data.get('duration')), + 'timestamp': int_or_none(video_data.get('broadcastDate')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index bf2d2041b..a2e18c8a7 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -3,6 +3,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) class IGNIE(InfoExtractor): @@ -11,25 +15,24 @@ class IGNIE(InfoExtractor): Some videos of it.ign.com are also supported """ - _VALID_URL = r'https?://.+?\.ign\.com/(?P<type>videos|show_videos|articles|(?:[^/]*/feature))(/.+)?/(?P<name_or_id>.+)' + _VALID_URL = r'https?://.+?\.ign\.com/(?:[^/]+/)?(?P<type>videos|show_videos|articles|feature|(?:[^/]+/\d+/video))(/.+)?/(?P<name_or_id>.+)' IE_NAME = 'ign.com' - _CONFIG_URL_TEMPLATE = 'http://www.ign.com/videos/configs/id/%s.config' - _DESCRIPTION_RE = [ - r'<span class="page-object-description">(.+?)</span>', - r'id="my_show_video">.*?<p>(.*?)</p>', - r'<meta name="description" content="(.*?)"', - ] + _API_URL_TEMPLATE = 'http://apis.ign.com/video/v3/videos/%s' + _EMBED_RE = r'<iframe[^>]+?["\']((?:https?:)?//.+?\.ign\.com.+?/embed.+?)["\']' _TESTS = [ { 'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', - 'md5': 'eac8bdc1890980122c3b66f14bdd02e9', + 'md5': 'febda82c4bafecd2d44b6e1a18a595f8', 'info_dict': { 'id': '8f862beef863986b2785559b9e1aa599', 'ext': 'mp4', 'title': 'The Last of Us Review', 'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c', + 'timestamp': 1370440800, + 'upload_date': '20130605', + 'uploader_id': 'cberidon@ign.com', } }, { @@ -44,6 +47,9 @@ class IGNIE(InfoExtractor): 'ext': 'mp4', 'title': 'GTA 5 Video Review', 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', + 'timestamp': 1379339880, + 'upload_date': '20130916', + 'uploader_id': 'danieljkrupa@gmail.com', }, }, { @@ -52,6 +58,9 @@ class IGNIE(InfoExtractor): 'ext': 'mp4', 'title': '26 Twisted Moments from GTA 5 in Slow Motion', 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', + 'timestamp': 1386878820, + 'upload_date': '20131212', + 'uploader_id': 'togilvie@ign.com', }, }, ], @@ -66,12 +75,20 @@ class IGNIE(InfoExtractor): 'id': '078fdd005f6d3c02f63d795faa1b984f', 'ext': 'mp4', 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', - 'description': ( - 'Giant skeletons, bloody hunts, and captivating' - ' natural beauty take our breath away.' - ), + 'description': 'Brian and Jared explore Michel Ancel\'s captivating new preview.', + 'timestamp': 1408047180, + 'upload_date': '20140814', + 'uploader_id': 'jamesduggan1990@gmail.com', }, }, + { + 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', + 'only_matching': True, + }, + { + 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', + 'only_matching': True, + }, ] def _find_video_id(self, webpage): @@ -82,7 +99,7 @@ class IGNIE(InfoExtractor): r'<object id="vid_(.+?)"', r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"', ] - return self._search_regex(res_id, webpage, 'video id') + return self._search_regex(res_id, webpage, 'video id', default=None) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -91,7 +108,7 @@ class IGNIE(InfoExtractor): webpage = self._download_webpage(url, name_or_id) if page_type != 'video': multiple_urls = re.findall( - '<param name="flashvars"[^>]*value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]', + r'<param name="flashvars"[^>]*value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]', webpage) if multiple_urls: entries = [self.url_result(u, ie='IGN') for u in multiple_urls] @@ -102,22 +119,50 @@ class IGNIE(InfoExtractor): } video_id = self._find_video_id(webpage) - result = self._get_video_info(video_id) - description = self._html_search_regex(self._DESCRIPTION_RE, - webpage, 'video description', flags=re.DOTALL) - result['description'] = description - return result + if not video_id: + return self.url_result(self._search_regex(self._EMBED_RE, webpage, 'embed url')) + return self._get_video_info(video_id) def _get_video_info(self, video_id): - config_url = self._CONFIG_URL_TEMPLATE % video_id - config = self._download_json(config_url, video_id) - media = config['playlist']['media'] + api_data = self._download_json(self._API_URL_TEMPLATE % video_id, video_id) + + formats = [] + m3u8_url = api_data['refs'].get('m3uUrl') + if m3u8_url: + m3u8_formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + f4m_url = api_data['refs'].get('f4mUrl') + if f4m_url: + f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + for asset in api_data['assets']: + formats.append({ + 'url': asset['url'], + 'tbr': asset.get('actual_bitrate_kbps'), + 'fps': asset.get('frame_rate'), + 'height': int_or_none(asset.get('height')), + 'width': int_or_none(asset.get('width')), + }) + self._sort_formats(formats) + + thumbnails = [{ + 'url': thumbnail['url'] + } for thumbnail in api_data.get('thumbnails', [])] + + metadata = api_data['metadata'] return { - 'id': media['metadata']['videoId'], - 'url': media['url'], - 'title': media['metadata']['title'], - 'thumbnail': media['poster'][0]['url'].replace('{size}', 'grande'), + 'id': api_data.get('videoId') or video_id, + 'title': metadata.get('longTitle') or metadata.get('name') or metadata.get['title'], + 'description': metadata.get('description'), + 'timestamp': parse_iso8601(metadata.get('publishDate')), + 'duration': int_or_none(metadata.get('duration')), + 'display_id': metadata.get('slug') or video_id, + 'uploader_id': metadata.get('creator'), + 'thumbnails': thumbnails, + 'formats': formats, } @@ -125,16 +170,17 @@ class OneUPIE(IGNIE): _VALID_URL = r'https?://gamevideos\.1up\.com/(?P<type>video)/id/(?P<name_or_id>.+)\.html' IE_NAME = '1up.com' - _DESCRIPTION_RE = r'<div id="vid_summary">(.+?)</div>' - _TESTS = [{ 'url': 'http://gamevideos.1up.com/video/id/34976.html', - 'md5': '68a54ce4ebc772e4b71e3123d413163d', + 'md5': 'c9cc69e07acb675c31a16719f909e347', 'info_dict': { 'id': '34976', 'ext': 'mp4', 'title': 'Sniper Elite V2 - Trailer', - 'description': 'md5:5d289b722f5a6d940ca3136e9dae89cf', + 'description': 'md5:bf0516c5ee32a3217aa703e9b1bc7826', + 'timestamp': 1313099220, + 'upload_date': '20110811', + 'uploader_id': 'IGN', } }] @@ -143,3 +189,36 @@ class OneUPIE(IGNIE): result = super(OneUPIE, self)._real_extract(url) result['id'] = mobj.group('name_or_id') return result + + +class PCMagIE(IGNIE): + _VALID_URL = r'https?://(?:www\.)?pcmag\.com/(?P<type>videos|article2)(/.+)?/(?P<name_or_id>.+)' + IE_NAME = 'pcmag' + + _EMBED_RE = r'iframe.setAttribute\("src",\s*__util.objToUrlString\("http://widgets\.ign\.com/video/embed/content.html?[^"]*url=([^"]+)["&]' + + _TESTS = [{ + 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data', + 'md5': '212d6154fd0361a2781075f1febbe9ad', + 'info_dict': { + 'id': 'ee10d774b508c9b8ec07e763b9125b91', + 'ext': 'mp4', + 'title': '010615_What\'s New Now: Is GoGo Snooping on Your Data?', + 'description': 'md5:a7071ae64d2f68cc821c729d4ded6bb3', + 'timestamp': 1420571160, + 'upload_date': '20150106', + 'uploader_id': 'cozzipix@gmail.com', + } + }, { + 'url': 'http://www.pcmag.com/article2/0,2817,2470156,00.asp', + 'md5': '94130c1ca07ba0adb6088350681f16c1', + 'info_dict': { + 'id': '042e560ba94823d43afcb12ddf7142ca', + 'ext': 'mp4', + 'title': 'HTC\'s Weird New Re Camera - What\'s New Now', + 'description': 'md5:53433c45df96d2ea5d0fda18be2ca908', + 'timestamp': 1412953920, + 'upload_date': '20141010', + 'uploader_id': 'chris_snyder@pcmag.com', + } + }] diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index 70c8ca64e..85e9344aa 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -13,7 +13,7 @@ from ..utils import ( class ImgurIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!gallery)(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|topic/[^/]+)/)?(?P<id>[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z]+)?$' _TESTS = [{ 'url': 'https://i.imgur.com/A61SaA1.gifv', @@ -21,7 +21,7 @@ class ImgurIE(InfoExtractor): 'id': 'A61SaA1', 'ext': 'mp4', 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 're:The origin of the Internet\'s most viral images$|The Internet\'s visual storytelling community\. Explore, share, and discuss the best visual stories the Internet has to offer\.$', + 'description': 'Imgur: The most awesome images on the Internet.', }, }, { 'url': 'https://imgur.com/A61SaA1', @@ -29,8 +29,20 @@ class ImgurIE(InfoExtractor): 'id': 'A61SaA1', 'ext': 'mp4', 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 're:The origin of the Internet\'s most viral images$|The Internet\'s visual storytelling community\. Explore, share, and discuss the best visual stories the Internet has to offer\.$', + 'description': 'Imgur: The most awesome images on the Internet.', }, + }, { + 'url': 'https://imgur.com/gallery/YcAQlkx', + 'info_dict': { + 'id': 'YcAQlkx', + 'ext': 'mp4', + 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', + 'description': 'Imgur: The most awesome images on the Internet.' + + } + }, { + 'url': 'http://imgur.com/topic/Funny/N8rOudd', + 'only_matching': True, }] def _real_extract(self, url): @@ -100,25 +112,38 @@ class ImgurIE(InfoExtractor): class ImgurAlbumIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/gallery/(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:a|gallery|topic/[^/]+)/)?(?P<id>[a-zA-Z0-9]{5})(?:[/?#&]+)?$' - _TEST = { + _TESTS = [{ 'url': 'http://imgur.com/gallery/Q95ko', 'info_dict': { 'id': 'Q95ko', }, 'playlist_count': 25, - } + }, { + 'url': 'http://imgur.com/a/j6Orj', + 'only_matching': True, + }, { + 'url': 'http://imgur.com/topic/Aww/ll5Vk', + 'only_matching': True, + }] def _real_extract(self, url): album_id = self._match_id(url) album_images = self._download_json( 'http://imgur.com/gallery/%s/album_images/hit.json?all=true' % album_id, - album_id)['data']['images'] - - entries = [ - self.url_result('http://imgur.com/%s' % image['hash']) - for image in album_images if image.get('hash')] - - return self.playlist_result(entries, album_id) + album_id, fatal=False) + + if album_images: + data = album_images.get('data') + if data and isinstance(data, dict): + images = data.get('images') + if images and isinstance(images, list): + entries = [ + self.url_result('http://imgur.com/%s' % image['hash']) + for image in images if image.get('hash')] + return self.playlist_result(entries, album_id) + + # Fallback to single video + return self.url_result('http://imgur.com/%s' % album_id, ImgurIE.ie_key()) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index c158f2064..e5e16ca3b 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -47,7 +47,7 @@ class InstagramIE(InfoExtractor): class InstagramUserIE(InfoExtractor): - _VALID_URL = r'https://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])' IE_DESC = 'Instagram user profile' IE_NAME = 'instagram:user' _TEST = { diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index f96e12e69..c3731a110 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -205,8 +205,8 @@ class IqiyiIE(InfoExtractor): def get_enc_key(self, swf_url, video_id): # TODO: automatic key extraction - # last update at 2015-12-06 for Zombie::bite - enc_key = '3719f6a1da83ee0aee3488d8802d7696'[::-1] + # last update at 2015-12-18 for Zombie::bite + enc_key = '8b6b683780897eb8d9a48a02ccc4817d'[::-1] return enc_key def _real_extract(self, url): diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py new file mode 100644 index 000000000..cdc095a79 --- /dev/null +++ b/youtube_dl/extractor/jwplatform.py @@ -0,0 +1,71 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class JWPlatformIE(InfoExtractor): + _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' + _TEST = { + 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', + 'md5': 'fa8899fa601eb7c83a64e9d568bdf325', + 'info_dict': { + 'id': 'nPripu9l', + 'ext': 'mov', + 'title': 'Big Buck Bunny Trailer', + 'description': 'Big Buck Bunny is a short animated film by the Blender Institute. It is made using free and open source software.', + 'upload_date': '20081127', + 'timestamp': 1227796140, + } + } + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<script[^>]+?src=["\'](?P<url>(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8})', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id) + video_data = json_data['playlist'][0] + subtitles = {} + for track in video_data['tracks']: + if track['kind'] == 'captions': + subtitles[track['label']] = [{'url': self._proto_relative_url(track['file'])}] + + formats = [] + for source in video_data['sources']: + source_url = self._proto_relative_url(source['file']) + source_type = source.get('type') or '' + if source_type == 'application/vnd.apple.mpegurl': + m3u8_formats = self._extract_m3u8_formats(source_url, video_id, 'mp4', 'm3u8_native', fatal=None) + if m3u8_formats: + formats.extend(m3u8_formats) + elif source_type.startswith('audio'): + formats.append({ + 'url': source_url, + 'vcodec': 'none', + }) + else: + formats.append({ + 'url': source_url, + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_data['title'], + 'description': video_data.get('description'), + 'thumbnail': self._proto_relative_url(video_data.get('image')), + 'timestamp': int_or_none(video_data.get('pubdate')), + 'subtitles': subtitles, + 'formats': formats, + } diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 583b1a5ad..4807c8110 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -45,7 +45,7 @@ class KalturaIE(InfoExtractor): 'info_dict': { 'id': '1_1jc2y3e4', 'ext': 'mp4', - 'title': 'Track 4', + 'title': 'Straight from the Heart', 'upload_date': '20131219', 'uploader_id': 'mlundberg@wolfgangsvault.com', 'description': 'The Allman Brothers Band, 12/16/1981', @@ -115,12 +115,9 @@ class KalturaIE(InfoExtractor): 'version': '-1', }, { - 'action': 'getContextData', - 'contextDataParams:objectType': 'KalturaEntryContextDataParams', - 'contextDataParams:referrer': 'http://www.kaltura.com/', - 'contextDataParams:streamerType': 'http', + 'action': 'getbyentryid', 'entryId': video_id, - 'service': 'baseentry', + 'service': 'flavorAsset', }, ] return self._kaltura_api_call( @@ -133,7 +130,7 @@ class KalturaIE(InfoExtractor): partner_id = mobj.group('partner_id_s') or mobj.group('partner_id') or mobj.group('partner_id_html5') entry_id = mobj.group('id_s') or mobj.group('id') or mobj.group('id_html5') - info, source_data = self._get_video_info(entry_id, partner_id) + info, flavor_assets = self._get_video_info(entry_id, partner_id) source_url = smuggled_data.get('source_url') if source_url: @@ -144,7 +141,10 @@ class KalturaIE(InfoExtractor): referrer = None formats = [] - for f in source_data['flavorAssets']: + for f in flavor_assets: + # Continue if asset is not ready + if f['status'] != 2: + continue video_url = '%s/flavorId/%s' % (info['dataUrl'], f['id']) if referrer: video_url += '?referrer=%s' % referrer @@ -160,6 +160,14 @@ class KalturaIE(InfoExtractor): 'width': int_or_none(f.get('width')), 'url': video_url, }) + m3u8_url = info['dataUrl'].replace('format/url', 'format/applehttp') + if referrer: + m3u8_url += '?referrer=%s' % referrer + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, entry_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + self._check_formats(formats, entry_id) self._sort_formats(formats) diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py index e3236f7b5..863efd896 100644 --- a/youtube_dl/extractor/lrt.py +++ b/youtube_dl/extractor/lrt.py @@ -1,12 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( - determine_ext, - js_to_json, + int_or_none, parse_duration, remove_end, ) @@ -23,9 +20,11 @@ class LRTIE(InfoExtractor): 'title': 'Septynios Kauno dienos', 'description': 'md5:24d84534c7dc76581e59f5689462411a', 'duration': 1783, + 'view_count': int, + 'like_count': int, }, 'params': { - 'skip_download': True, # HLS download + 'skip_download': True, # m3u8 download }, } @@ -34,29 +33,23 @@ class LRTIE(InfoExtractor): webpage = self._download_webpage(url, video_id) title = remove_end(self._og_search_title(webpage), ' - LRT') + m3u8_url = self._search_regex( + r'file\s*:\s*(["\'])(?P<url>.+?)\1\s*\+\s*location\.hash\.substring\(1\)', + webpage, 'm3u8 url', group='url') + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') + thumbnail = self._og_search_thumbnail(webpage) description = self._og_search_description(webpage) duration = parse_duration(self._search_regex( - r"'duration':\s*'([^']+)',", webpage, - 'duration', fatal=False, default=None)) + r'var\s+record_len\s*=\s*(["\'])(?P<duration>[0-9]+:[0-9]+:[0-9]+)\1', + webpage, 'duration', default=None, group='duration')) - formats = [] - for js in re.findall(r'(?s)config:\s*(\{.*?\})', webpage): - data = self._parse_json(js, video_id, transform_source=js_to_json) - if 'provider' not in data: - continue - if data['provider'] == 'rtmp': - formats.append({ - 'format_id': 'rtmp', - 'ext': determine_ext(data['file']), - 'url': data['streamer'], - 'play_path': 'mp4:%s' % data['file'], - 'preference': -1, - 'rtmp_real_time': True, - }) - else: - formats.extend( - self._extract_m3u8_formats(data['file'], video_id, 'mp4')) + view_count = int_or_none(self._html_search_regex( + r'<div[^>]+class=(["\']).*?record-desc-seen.*?\1[^>]*>(?P<count>.+?)</div>', + webpage, 'view count', fatal=False, group='count')) + like_count = int_or_none(self._search_regex( + r'<span[^>]+id=(["\'])flikesCount.*?\1>(?P<count>\d+)<', + webpage, 'like count', fatal=False, group='count')) return { 'id': video_id, @@ -65,4 +58,6 @@ class LRTIE(InfoExtractor): 'thumbnail': thumbnail, 'description': description, 'duration': duration, + 'view_count': view_count, + 'like_count': like_count, } diff --git a/youtube_dl/extractor/makertv.py b/youtube_dl/extractor/makertv.py new file mode 100644 index 000000000..3c34d4604 --- /dev/null +++ b/youtube_dl/extractor/makertv.py @@ -0,0 +1,32 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class MakerTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www\.)?maker\.tv/(?:[^/]+/)*video|makerplayer.com/embed/maker)/(?P<id>[a-zA-Z0-9]{12})' + _TEST = { + 'url': 'http://www.maker.tv/video/Fh3QgymL9gsc', + 'md5': 'ca237a53a8eb20b6dc5bd60564d4ab3e', + 'info_dict': { + 'id': 'Fh3QgymL9gsc', + 'ext': 'mp4', + 'title': 'Maze Runner: The Scorch Trials Official Movie Review', + 'description': 'md5:11ff3362d7ef1d679fdb649f6413975a', + 'upload_date': '20150918', + 'timestamp': 1442549540, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + jwplatform_id = self._search_regex(r'jw_?id="([^"]+)"', webpage, 'jwplatform id') + + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': 'jwplatform:%s' % jwplatform_id, + 'ie_key': 'JWPlatform', + } diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 4c1eca96f..340c922bd 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -3,10 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_HTTPError, -) +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, find_xpath_attr, @@ -189,7 +186,7 @@ class NBCNewsIE(InfoExtractor): 'title': info.find('headline').text, 'ext': 'flv', 'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text, - 'description': compat_str(info.find('caption').text), + 'description': info.find('caption').text, 'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text, } else: diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 16213eed9..894c51399 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -88,10 +88,10 @@ class NDRIE(NDRBaseIE): 'embedURL', webpage, 'embed URL', fatal=True) description = self._search_regex( r'<p[^>]+itemprop="description">([^<]+)</p>', - webpage, 'description', fatal=False) + webpage, 'description', default=None) or self._og_search_description(webpage) timestamp = parse_iso8601( self._search_regex( - r'<span itemprop="datePublished" content="([^"]+)">', + r'<span[^>]+itemprop="(?:datePublished|uploadDate)"[^>]+content="([^"]+)"', webpage, 'upload date', fatal=False)) return { '_type': 'url_transparent', diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index 76bd21e6d..d440313d5 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -9,6 +9,7 @@ from .common import InfoExtractor from ..compat import ( compat_str, compat_urllib_parse, + compat_urlparse, ) from ..utils import ( clean_html, @@ -82,14 +83,21 @@ class NocoIE(InfoExtractor): if 'erreur' in login: raise ExtractorError('Unable to login: %s' % clean_html(login['erreur']), expected=True) + @staticmethod + def _ts(): + return int(time.time() * 1000) + def _call_api(self, path, video_id, note, sub_lang=None): - ts = compat_str(int(time.time() * 1000)) + ts = compat_str(self._ts() + self._ts_offset) tk = hashlib.md5((hashlib.md5(ts.encode('ascii')).hexdigest() + '#8S?uCraTedap6a').encode('ascii')).hexdigest() url = self._API_URL_TEMPLATE % (path, ts, tk) if sub_lang: url += self._SUB_LANG_TEMPLATE % sub_lang - resp = self._download_json(url, video_id, note) + request = sanitized_Request(url) + request.add_header('Referer', self._referer) + + resp = self._download_json(request, video_id, note) if isinstance(resp, dict) and resp.get('error'): self._raise_error(resp['error'], resp['description']) @@ -102,8 +110,22 @@ class NocoIE(InfoExtractor): expected=True) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) + + # Timestamp adjustment offset between server time and local time + # must be calculated in order to use timestamps closest to server's + # in all API requests (see https://github.com/rg3/youtube-dl/issues/7864) + webpage = self._download_webpage(url, video_id) + + player_url = self._search_regex( + r'(["\'])(?P<player>https?://noco\.tv/(?:[^/]+/)+NocoPlayer.+?\.swf.*?)\1', + webpage, 'noco player', group='player', + default='http://noco.tv/cdata/js/player/NocoPlayer-v1.2.40.swf') + + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(player_url).query) + ts = int_or_none(qs.get('ts', [None])[0]) + self._ts_offset = ts - self._ts() if ts else 0 + self._referer = player_url medias = self._call_api( 'shows/%s/medias' % video_id, @@ -155,8 +177,8 @@ class NocoIE(InfoExtractor): 'format_id': format_id_extended, 'width': int_or_none(fmt.get('res_width')), 'height': int_or_none(fmt.get('res_lines')), - 'abr': int_or_none(fmt.get('audiobitrate')), - 'vbr': int_or_none(fmt.get('videobitrate')), + 'abr': int_or_none(fmt.get('audiobitrate'), 1000), + 'vbr': int_or_none(fmt.get('videobitrate'), 1000), 'filesize': int_or_none(fmt.get('filesize')), 'format_note': qualities[format_id].get('quality_name'), 'quality': qualities[format_id].get('priority'), diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py index d480fb58c..446f5901c 100644 --- a/youtube_dl/extractor/nowness.py +++ b/youtube_dl/extractor/nowness.py @@ -1,7 +1,10 @@ # encoding: utf-8 from __future__ import unicode_literals -from .brightcove import BrightcoveLegacyIE +from .brightcove import ( + BrightcoveLegacyIE, + BrightcoveNewIE, +) from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -23,9 +26,12 @@ class NownessBaseIE(InfoExtractor): note='Downloading player JavaScript', errnote='Unable to download player JavaScript') bc_url = BrightcoveLegacyIE._extract_brightcove_url(player_code) - if bc_url is None: - raise ExtractorError('Could not find player definition') - return self.url_result(bc_url, 'BrightcoveLegacy') + if bc_url: + return self.url_result(bc_url, BrightcoveLegacyIE.ie_key()) + bc_url = BrightcoveNewIE._extract_url(player_code) + if bc_url: + return self.url_result(bc_url, BrightcoveNewIE.ie_key()) + raise ExtractorError('Could not find player definition') elif source == 'vimeo': return self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo') elif source == 'youtube': diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 604a0dd22..97e8ffc97 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -16,165 +16,165 @@ from ..utils import ( class PBSIE(InfoExtractor): _STATIONS = ( - ('video.pbs.org', 'PBS: Public Broadcasting Service'), # http://www.pbs.org/ - ('video.aptv.org', 'APT - Alabama Public Television (WBIQ)'), # http://aptv.org/ - ('video.gpb.org', 'GPB/Georgia Public Broadcasting (WGTV)'), # http://www.gpb.org/ - ('video.mpbonline.org', 'Mississippi Public Broadcasting (WMPN)'), # http://www.mpbonline.org - ('video.wnpt.org', 'Nashville Public Television (WNPT)'), # http://www.wnpt.org - ('video.wfsu.org', 'WFSU-TV (WFSU)'), # http://wfsu.org/ - ('video.wsre.org', 'WSRE (WSRE)'), # http://www.wsre.org - ('video.wtcitv.org', 'WTCI (WTCI)'), # http://www.wtcitv.org - ('video.pba.org', 'WPBA/Channel 30 (WPBA)'), # http://pba.org/ - ('video.alaskapublic.org', 'Alaska Public Media (KAKM)'), # http://alaskapublic.org/kakm - # ('kuac.org', 'KUAC (KUAC)'), # http://kuac.org/kuac-tv/ - # ('ktoo.org', '360 North (KTOO)'), # http://www.ktoo.org/ - # ('azpm.org', 'KUAT 6 (KUAT)'), # http://www.azpm.org/ - ('video.azpbs.org', 'Arizona PBS (KAET)'), # http://www.azpbs.org - ('portal.knme.org', 'KNME-TV/Channel 5 (KNME)'), # http://www.newmexicopbs.org/ - ('video.vegaspbs.org', 'Vegas PBS (KLVX)'), # http://vegaspbs.org/ - ('watch.aetn.org', 'AETN/ARKANSAS ETV NETWORK (KETS)'), # http://www.aetn.org/ - ('video.ket.org', 'KET (WKLE)'), # http://www.ket.org/ - ('video.wkno.org', 'WKNO/Channel 10 (WKNO)'), # http://www.wkno.org/ - ('video.lpb.org', 'LPB/LOUISIANA PUBLIC BROADCASTING (WLPB)'), # http://www.lpb.org/ - ('videos.oeta.tv', 'OETA (KETA)'), # http://www.oeta.tv - ('video.optv.org', 'Ozarks Public Television (KOZK)'), # http://www.optv.org/ - ('watch.wsiu.org', 'WSIU Public Broadcasting (WSIU)'), # http://www.wsiu.org/ - ('video.keet.org', 'KEET TV (KEET)'), # http://www.keet.org - ('pbs.kixe.org', 'KIXE/Channel 9 (KIXE)'), # http://kixe.org/ - ('video.kpbs.org', 'KPBS San Diego (KPBS)'), # http://www.kpbs.org/ - ('video.kqed.org', 'KQED (KQED)'), # http://www.kqed.org - ('vids.kvie.org', 'KVIE Public Television (KVIE)'), # http://www.kvie.org - ('video.pbssocal.org', 'PBS SoCal/KOCE (KOCE)'), # http://www.pbssocal.org/ - ('video.valleypbs.org', 'ValleyPBS (KVPT)'), # http://www.valleypbs.org/ - ('video.cptv.org', 'CONNECTICUT PUBLIC TELEVISION (WEDH)'), # http://cptv.org - ('watch.knpb.org', 'KNPB Channel 5 (KNPB)'), # http://www.knpb.org/ - ('video.soptv.org', 'SOPTV (KSYS)'), # http://www.soptv.org - # ('klcs.org', 'KLCS/Channel 58 (KLCS)'), # http://www.klcs.org - # ('krcb.org', 'KRCB Television & Radio (KRCB)'), # http://www.krcb.org - # ('kvcr.org', 'KVCR TV/DT/FM :: Vision for the Future (KVCR)'), # http://kvcr.org - ('video.rmpbs.org', 'Rocky Mountain PBS (KRMA)'), # http://www.rmpbs.org - ('video.kenw.org', 'KENW-TV3 (KENW)'), # http://www.kenw.org - ('video.kued.org', 'KUED Channel 7 (KUED)'), # http://www.kued.org - ('video.wyomingpbs.org', 'Wyoming PBS (KCWC)'), # http://www.wyomingpbs.org - ('video.cpt12.org', 'Colorado Public Television / KBDI 12 (KBDI)'), # http://www.cpt12.org/ - ('video.kbyueleven.org', 'KBYU-TV (KBYU)'), # http://www.kbyutv.org/ - ('video.thirteen.org', 'Thirteen/WNET New York (WNET)'), # http://www.thirteen.org - ('video.wgbh.org', 'WGBH/Channel 2 (WGBH)'), # http://wgbh.org - ('video.wgby.org', 'WGBY (WGBY)'), # http://www.wgby.org - ('watch.njtvonline.org', 'NJTV Public Media NJ (WNJT)'), # http://www.njtvonline.org/ - # ('ripbs.org', 'Rhode Island PBS (WSBE)'), # http://www.ripbs.org/home/ - ('watch.wliw.org', 'WLIW21 (WLIW)'), # http://www.wliw.org/ - ('video.mpt.tv', 'mpt/Maryland Public Television (WMPB)'), # http://www.mpt.org - ('watch.weta.org', 'WETA Television and Radio (WETA)'), # http://www.weta.org - ('video.whyy.org', 'WHYY (WHYY)'), # http://www.whyy.org - ('video.wlvt.org', 'PBS 39 (WLVT)'), # http://www.wlvt.org/ - ('video.wvpt.net', 'WVPT - Your Source for PBS and More! (WVPT)'), # http://www.wvpt.net - ('video.whut.org', 'Howard University Television (WHUT)'), # http://www.whut.org - ('video.wedu.org', 'WEDU PBS (WEDU)'), # http://www.wedu.org - ('video.wgcu.org', 'WGCU Public Media (WGCU)'), # http://www.wgcu.org/ - # ('wjct.org', 'WJCT Public Broadcasting (WJCT)'), # http://www.wjct.org - ('video.wpbt2.org', 'WPBT2 (WPBT)'), # http://www.wpbt2.org - ('video.wucftv.org', 'WUCF TV (WUCF)'), # http://wucftv.org - ('video.wuft.org', 'WUFT/Channel 5 (WUFT)'), # http://www.wuft.org - ('watch.wxel.org', 'WXEL/Channel 42 (WXEL)'), # http://www.wxel.org/home/ - ('video.wlrn.org', 'WLRN/Channel 17 (WLRN)'), # http://www.wlrn.org/ - ('video.wusf.usf.edu', 'WUSF Public Broadcasting (WUSF)'), # http://wusf.org/ - ('video.scetv.org', 'ETV (WRLK)'), # http://www.scetv.org - ('video.unctv.org', 'UNC-TV (WUNC)'), # http://www.unctv.org/ - # ('pbsguam.org', 'PBS Guam (KGTF)'), # http://www.pbsguam.org/ - ('video.pbshawaii.org', 'PBS Hawaii - Oceanic Cable Channel 10 (KHET)'), # http://www.pbshawaii.org/ - ('video.idahoptv.org', 'Idaho Public Television (KAID)'), # http://idahoptv.org - ('video.ksps.org', 'KSPS (KSPS)'), # http://www.ksps.org/home/ - ('watch.opb.org', 'OPB (KOPB)'), # http://www.opb.org - ('watch.nwptv.org', 'KWSU/Channel 10 & KTNW/Channel 31 (KWSU)'), # http://www.kwsu.org - ('video.will.illinois.edu', 'WILL-TV (WILL)'), # http://will.illinois.edu/ - ('video.networkknowledge.tv', 'Network Knowledge - WSEC/Springfield (WSEC)'), # http://www.wsec.tv - ('video.wttw.com', 'WTTW11 (WTTW)'), # http://www.wttw.com/ - # ('wtvp.org', 'WTVP & WTVP.org, Public Media for Central Illinois (WTVP)'), # http://www.wtvp.org/ - ('video.iptv.org', 'Iowa Public Television/IPTV (KDIN)'), # http://www.iptv.org/ - ('video.ninenet.org', 'Nine Network (KETC)'), # http://www.ninenet.org - ('video.wfwa.org', 'PBS39 Fort Wayne (WFWA)'), # http://wfwa.org/ - ('video.wfyi.org', 'WFYI Indianapolis (WFYI)'), # http://www.wfyi.org - ('video.mptv.org', 'Milwaukee Public Television (WMVS)'), # http://www.mptv.org - ('video.wnin.org', 'WNIN (WNIN)'), # http://www.wnin.org/ - ('video.wnit.org', 'WNIT Public Television (WNIT)'), # http://www.wnit.org/ - ('video.wpt.org', 'WPT (WPNE)'), # http://www.wpt.org/ - ('video.wvut.org', 'WVUT/Channel 22 (WVUT)'), # http://wvut.org/ - ('video.weiu.net', 'WEIU/Channel 51 (WEIU)'), # http://www.weiu.net - ('video.wqpt.org', 'WQPT-TV (WQPT)'), # http://www.wqpt.org - ('video.wycc.org', 'WYCC PBS Chicago (WYCC)'), # http://www.wycc.org - # ('lakeshorepublicmedia.org', 'Lakeshore Public Television (WYIN)'), # http://lakeshorepublicmedia.org/ - ('video.wipb.org', 'WIPB-TV (WIPB)'), # http://wipb.org - ('video.indianapublicmedia.org', 'WTIU (WTIU)'), # http://indianapublicmedia.org/tv/ - ('watch.cetconnect.org', 'CET (WCET)'), # http://www.cetconnect.org - ('video.thinktv.org', 'ThinkTVNetwork (WPTD)'), # http://www.thinktv.org - ('video.wbgu.org', 'WBGU-TV (WBGU)'), # http://wbgu.org - ('video.wgvu.org', 'WGVU TV (WGVU)'), # http://www.wgvu.org/ - ('video.netnebraska.org', 'NET1 (KUON)'), # http://netnebraska.org - ('video.pioneer.org', 'Pioneer Public Television (KWCM)'), # http://www.pioneer.org - ('watch.sdpb.org', 'SDPB Television (KUSD)'), # http://www.sdpb.org - ('video.tpt.org', 'TPT (KTCA)'), # http://www.tpt.org - ('watch.ksmq.org', 'KSMQ (KSMQ)'), # http://www.ksmq.org/ - ('watch.kpts.org', 'KPTS/Channel 8 (KPTS)'), # http://www.kpts.org/ - ('watch.ktwu.org', 'KTWU/Channel 11 (KTWU)'), # http://ktwu.org - # ('shptv.org', 'Smoky Hills Public Television (KOOD)'), # http://www.shptv.org - # ('kcpt.org', 'KCPT Kansas City Public Television (KCPT)'), # http://kcpt.org/ - # ('blueridgepbs.org', 'Blue Ridge PBS (WBRA)'), # http://www.blueridgepbs.org/ - ('watch.easttennesseepbs.org', 'East Tennessee PBS (WSJK)'), # http://easttennesseepbs.org - ('video.wcte.tv', 'WCTE-TV (WCTE)'), # http://www.wcte.org - ('video.wljt.org', 'WLJT, Channel 11 (WLJT)'), # http://wljt.org/ - ('video.wosu.org', 'WOSU TV (WOSU)'), # http://wosu.org/ - ('video.woub.org', 'WOUB/WOUC (WOUB)'), # http://woub.org/tv/index.php?section=5 - ('video.wvpublic.org', 'WVPB (WVPB)'), # http://wvpublic.org/ - ('video.wkyupbs.org', 'WKYU-PBS (WKYU)'), # http://www.wkyupbs.org - # ('wyes.org', 'WYES-TV/New Orleans (WYES)'), # http://www.wyes.org - ('video.kera.org', 'KERA 13 (KERA)'), # http://www.kera.org/ - ('video.mpbn.net', 'MPBN (WCBB)'), # http://www.mpbn.net/ - ('video.mountainlake.org', 'Mountain Lake PBS (WCFE)'), # http://www.mountainlake.org/ - ('video.nhptv.org', 'NHPTV (WENH)'), # http://nhptv.org/ - ('video.vpt.org', 'Vermont PBS (WETK)'), # http://www.vpt.org - ('video.witf.org', 'witf (WITF)'), # http://www.witf.org - ('watch.wqed.org', 'WQED Multimedia (WQED)'), # http://www.wqed.org/ - ('video.wmht.org', 'WMHT Educational Telecommunications (WMHT)'), # http://www.wmht.org/home/ - ('video.deltabroadcasting.org', 'Q-TV (WDCQ)'), # http://www.deltabroadcasting.org - ('video.dptv.org', 'WTVS Detroit Public TV (WTVS)'), # http://www.dptv.org/ - ('video.wcmu.org', 'CMU Public Television (WCMU)'), # http://www.wcmu.org - ('video.wkar.org', 'WKAR-TV (WKAR)'), # http://wkar.org/ - ('wnmuvideo.nmu.edu', 'WNMU-TV Public TV 13 (WNMU)'), # http://wnmutv.nmu.edu - ('video.wdse.org', 'WDSE - WRPT (WDSE)'), # http://www.wdse.org/ - ('video.wgte.org', 'WGTE TV (WGTE)'), # http://www.wgte.org - ('video.lptv.org', 'Lakeland Public Television (KAWE)'), # http://www.lakelandptv.org - # ('prairiepublic.org', 'PRAIRIE PUBLIC (KFME)'), # http://www.prairiepublic.org/ - ('video.kmos.org', 'KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS)'), # http://www.kmos.org/ - ('watch.montanapbs.org', 'MontanaPBS (KUSM)'), # http://montanapbs.org - ('video.krwg.org', 'KRWG/Channel 22 (KRWG)'), # http://www.krwg.org - ('video.kacvtv.org', 'KACV (KACV)'), # http://www.panhandlepbs.org/home/ - ('video.kcostv.org', 'KCOS/Channel 13 (KCOS)'), # www.kcostv.org - ('video.wcny.org', 'WCNY/Channel 24 (WCNY)'), # http://www.wcny.org - ('video.wned.org', 'WNED (WNED)'), # http://www.wned.org/ - ('watch.wpbstv.org', 'WPBS (WPBS)'), # http://www.wpbstv.org - ('video.wskg.org', 'WSKG Public TV (WSKG)'), # http://wskg.org - ('video.wxxi.org', 'WXXI (WXXI)'), # http://wxxi.org - ('video.wpsu.org', 'WPSU (WPSU)'), # http://www.wpsu.org - # ('wqln.org', 'WQLN/Channel 54 (WQLN)'), # http://www.wqln.org - ('on-demand.wvia.org', 'WVIA Public Media Studios (WVIA)'), # http://www.wvia.org/ - ('video.wtvi.org', 'WTVI (WTVI)'), # http://www.wtvi.org/ - # ('whro.org', 'WHRO (WHRO)'), # http://whro.org - ('video.westernreservepublicmedia.org', 'Western Reserve PBS (WNEO)'), # http://www.WesternReservePublicMedia.org/ - ('video.ideastream.org', 'WVIZ/PBS ideastream (WVIZ)'), # http://www.wviz.org/ - ('video.kcts9.org', 'KCTS 9 (KCTS)'), # http://kcts9.org/ - ('video.basinpbs.org', 'Basin PBS (KPBT)'), # http://www.basinpbs.org - ('video.houstonpbs.org', 'KUHT / Channel 8 (KUHT)'), # http://www.houstonpublicmedia.org/ - # ('tamu.edu', 'KAMU - TV (KAMU)'), # http://KAMU.tamu.edu - # ('kedt.org', 'KEDT/Channel 16 (KEDT)'), # http://www.kedt.org - ('video.klrn.org', 'KLRN (KLRN)'), # http://www.klrn.org - ('video.klru.tv', 'KLRU (KLRU)'), # http://www.klru.org - # ('kmbh.org', 'KMBH-TV (KMBH)'), # http://www.kmbh.org - # ('knct.org', 'KNCT (KNCT)'), # http://www.knct.org - # ('ktxt.org', 'KTTZ-TV (KTXT)'), # http://www.ktxt.org - ('video.wtjx.org', 'WTJX Channel 12 (WTJX)'), # http://www.wtjx.org/ - ('video.ideastations.org', 'WCVE PBS (WCVE)'), # http://ideastations.org/ - ('video.kbtc.org', 'KBTC Public Television (KBTC)'), # http://kbtc.org + (r'(?:video|www|player)\.pbs\.org', 'PBS: Public Broadcasting Service'), # http://www.pbs.org/ + (r'video\.aptv\.org', 'APT - Alabama Public Television (WBIQ)'), # http://aptv.org/ + (r'video\.gpb\.org', 'GPB/Georgia Public Broadcasting (WGTV)'), # http://www.gpb.org/ + (r'video\.mpbonline\.org', 'Mississippi Public Broadcasting (WMPN)'), # http://www.mpbonline.org + (r'video\.wnpt\.org', 'Nashville Public Television (WNPT)'), # http://www.wnpt.org + (r'video\.wfsu\.org', 'WFSU-TV (WFSU)'), # http://wfsu.org/ + (r'video\.wsre\.org', 'WSRE (WSRE)'), # http://www.wsre.org + (r'video\.wtcitv\.org', 'WTCI (WTCI)'), # http://www.wtcitv.org + (r'video\.pba\.org', 'WPBA/Channel 30 (WPBA)'), # http://pba.org/ + (r'video\.alaskapublic\.org', 'Alaska Public Media (KAKM)'), # http://alaskapublic.org/kakm + # (r'kuac\.org', 'KUAC (KUAC)'), # http://kuac.org/kuac-tv/ + # (r'ktoo\.org', '360 North (KTOO)'), # http://www.ktoo.org/ + # (r'azpm\.org', 'KUAT 6 (KUAT)'), # http://www.azpm.org/ + (r'video\.azpbs\.org', 'Arizona PBS (KAET)'), # http://www.azpbs.org + (r'portal\.knme\.org', 'KNME-TV/Channel 5 (KNME)'), # http://www.newmexicopbs.org/ + (r'video\.vegaspbs\.org', 'Vegas PBS (KLVX)'), # http://vegaspbs.org/ + (r'watch\.aetn\.org', 'AETN/ARKANSAS ETV NETWORK (KETS)'), # http://www.aetn.org/ + (r'video\.ket\.org', 'KET (WKLE)'), # http://www.ket.org/ + (r'video\.wkno\.org', 'WKNO/Channel 10 (WKNO)'), # http://www.wkno.org/ + (r'video\.lpb\.org', 'LPB/LOUISIANA PUBLIC BROADCASTING (WLPB)'), # http://www.lpb.org/ + (r'videos\.oeta\.tv', 'OETA (KETA)'), # http://www.oeta.tv + (r'video\.optv\.org', 'Ozarks Public Television (KOZK)'), # http://www.optv.org/ + (r'watch\.wsiu\.org', 'WSIU Public Broadcasting (WSIU)'), # http://www.wsiu.org/ + (r'video\.keet\.org', 'KEET TV (KEET)'), # http://www.keet.org + (r'pbs\.kixe\.org', 'KIXE/Channel 9 (KIXE)'), # http://kixe.org/ + (r'video\.kpbs\.org', 'KPBS San Diego (KPBS)'), # http://www.kpbs.org/ + (r'video\.kqed\.org', 'KQED (KQED)'), # http://www.kqed.org + (r'vids\.kvie\.org', 'KVIE Public Television (KVIE)'), # http://www.kvie.org + (r'video\.pbssocal\.org', 'PBS SoCal/KOCE (KOCE)'), # http://www.pbssocal.org/ + (r'video\.valleypbs\.org', 'ValleyPBS (KVPT)'), # http://www.valleypbs.org/ + (r'video\.cptv\.org', 'CONNECTICUT PUBLIC TELEVISION (WEDH)'), # http://cptv.org + (r'watch\.knpb\.org', 'KNPB Channel 5 (KNPB)'), # http://www.knpb.org/ + (r'video\.soptv\.org', 'SOPTV (KSYS)'), # http://www.soptv.org + # (r'klcs\.org', 'KLCS/Channel 58 (KLCS)'), # http://www.klcs.org + # (r'krcb\.org', 'KRCB Television & Radio (KRCB)'), # http://www.krcb.org + # (r'kvcr\.org', 'KVCR TV/DT/FM :: Vision for the Future (KVCR)'), # http://kvcr.org + (r'video\.rmpbs\.org', 'Rocky Mountain PBS (KRMA)'), # http://www.rmpbs.org + (r'video\.kenw\.org', 'KENW-TV3 (KENW)'), # http://www.kenw.org + (r'video\.kued\.org', 'KUED Channel 7 (KUED)'), # http://www.kued.org + (r'video\.wyomingpbs\.org', 'Wyoming PBS (KCWC)'), # http://www.wyomingpbs.org + (r'video\.cpt12\.org', 'Colorado Public Television / KBDI 12 (KBDI)'), # http://www.cpt12.org/ + (r'video\.kbyueleven\.org', 'KBYU-TV (KBYU)'), # http://www.kbyutv.org/ + (r'video\.thirteen\.org', 'Thirteen/WNET New York (WNET)'), # http://www.thirteen.org + (r'video\.wgbh\.org', 'WGBH/Channel 2 (WGBH)'), # http://wgbh.org + (r'video\.wgby\.org', 'WGBY (WGBY)'), # http://www.wgby.org + (r'watch\.njtvonline\.org', 'NJTV Public Media NJ (WNJT)'), # http://www.njtvonline.org/ + # (r'ripbs\.org', 'Rhode Island PBS (WSBE)'), # http://www.ripbs.org/home/ + (r'watch\.wliw\.org', 'WLIW21 (WLIW)'), # http://www.wliw.org/ + (r'video\.mpt\.tv', 'mpt/Maryland Public Television (WMPB)'), # http://www.mpt.org + (r'watch\.weta\.org', 'WETA Television and Radio (WETA)'), # http://www.weta.org + (r'video\.whyy\.org', 'WHYY (WHYY)'), # http://www.whyy.org + (r'video\.wlvt\.org', 'PBS 39 (WLVT)'), # http://www.wlvt.org/ + (r'video\.wvpt\.net', 'WVPT - Your Source for PBS and More! (WVPT)'), # http://www.wvpt.net + (r'video\.whut\.org', 'Howard University Television (WHUT)'), # http://www.whut.org + (r'video\.wedu\.org', 'WEDU PBS (WEDU)'), # http://www.wedu.org + (r'video\.wgcu\.org', 'WGCU Public Media (WGCU)'), # http://www.wgcu.org/ + # (r'wjct\.org', 'WJCT Public Broadcasting (WJCT)'), # http://www.wjct.org + (r'video\.wpbt2\.org', 'WPBT2 (WPBT)'), # http://www.wpbt2.org + (r'video\.wucftv\.org', 'WUCF TV (WUCF)'), # http://wucftv.org + (r'video\.wuft\.org', 'WUFT/Channel 5 (WUFT)'), # http://www.wuft.org + (r'watch\.wxel\.org', 'WXEL/Channel 42 (WXEL)'), # http://www.wxel.org/home/ + (r'video\.wlrn\.org', 'WLRN/Channel 17 (WLRN)'), # http://www.wlrn.org/ + (r'video\.wusf\.usf\.edu', 'WUSF Public Broadcasting (WUSF)'), # http://wusf.org/ + (r'video\.scetv\.org', 'ETV (WRLK)'), # http://www.scetv.org + (r'video\.unctv\.org', 'UNC-TV (WUNC)'), # http://www.unctv.org/ + # (r'pbsguam\.org', 'PBS Guam (KGTF)'), # http://www.pbsguam.org/ + (r'video\.pbshawaii\.org', 'PBS Hawaii - Oceanic Cable Channel 10 (KHET)'), # http://www.pbshawaii.org/ + (r'video\.idahoptv\.org', 'Idaho Public Television (KAID)'), # http://idahoptv.org + (r'video\.ksps\.org', 'KSPS (KSPS)'), # http://www.ksps.org/home/ + (r'watch\.opb\.org', 'OPB (KOPB)'), # http://www.opb.org + (r'watch\.nwptv\.org', 'KWSU/Channel 10 & KTNW/Channel 31 (KWSU)'), # http://www.kwsu.org + (r'video\.will\.illinois\.edu', 'WILL-TV (WILL)'), # http://will.illinois.edu/ + (r'video\.networkknowledge\.tv', 'Network Knowledge - WSEC/Springfield (WSEC)'), # http://www.wsec.tv + (r'video\.wttw\.com', 'WTTW11 (WTTW)'), # http://www.wttw.com/ + # (r'wtvp\.org', 'WTVP & WTVP.org, Public Media for Central Illinois (WTVP)'), # http://www.wtvp.org/ + (r'video\.iptv\.org', 'Iowa Public Television/IPTV (KDIN)'), # http://www.iptv.org/ + (r'video\.ninenet\.org', 'Nine Network (KETC)'), # http://www.ninenet.org + (r'video\.wfwa\.org', 'PBS39 Fort Wayne (WFWA)'), # http://wfwa.org/ + (r'video\.wfyi\.org', 'WFYI Indianapolis (WFYI)'), # http://www.wfyi.org + (r'video\.mptv\.org', 'Milwaukee Public Television (WMVS)'), # http://www.mptv.org + (r'video\.wnin\.org', 'WNIN (WNIN)'), # http://www.wnin.org/ + (r'video\.wnit\.org', 'WNIT Public Television (WNIT)'), # http://www.wnit.org/ + (r'video\.wpt\.org', 'WPT (WPNE)'), # http://www.wpt.org/ + (r'video\.wvut\.org', 'WVUT/Channel 22 (WVUT)'), # http://wvut.org/ + (r'video\.weiu\.net', 'WEIU/Channel 51 (WEIU)'), # http://www.weiu.net + (r'video\.wqpt\.org', 'WQPT-TV (WQPT)'), # http://www.wqpt.org + (r'video\.wycc\.org', 'WYCC PBS Chicago (WYCC)'), # http://www.wycc.org + # (r'lakeshorepublicmedia\.org', 'Lakeshore Public Television (WYIN)'), # http://lakeshorepublicmedia.org/ + (r'video\.wipb\.org', 'WIPB-TV (WIPB)'), # http://wipb.org + (r'video\.indianapublicmedia\.org', 'WTIU (WTIU)'), # http://indianapublicmedia.org/tv/ + (r'watch\.cetconnect\.org', 'CET (WCET)'), # http://www.cetconnect.org + (r'video\.thinktv\.org', 'ThinkTVNetwork (WPTD)'), # http://www.thinktv.org + (r'video\.wbgu\.org', 'WBGU-TV (WBGU)'), # http://wbgu.org + (r'video\.wgvu\.org', 'WGVU TV (WGVU)'), # http://www.wgvu.org/ + (r'video\.netnebraska\.org', 'NET1 (KUON)'), # http://netnebraska.org + (r'video\.pioneer\.org', 'Pioneer Public Television (KWCM)'), # http://www.pioneer.org + (r'watch\.sdpb\.org', 'SDPB Television (KUSD)'), # http://www.sdpb.org + (r'video\.tpt\.org', 'TPT (KTCA)'), # http://www.tpt.org + (r'watch\.ksmq\.org', 'KSMQ (KSMQ)'), # http://www.ksmq.org/ + (r'watch\.kpts\.org', 'KPTS/Channel 8 (KPTS)'), # http://www.kpts.org/ + (r'watch\.ktwu\.org', 'KTWU/Channel 11 (KTWU)'), # http://ktwu.org + # (r'shptv\.org', 'Smoky Hills Public Television (KOOD)'), # http://www.shptv.org + # (r'kcpt\.org', 'KCPT Kansas City Public Television (KCPT)'), # http://kcpt.org/ + # (r'blueridgepbs\.org', 'Blue Ridge PBS (WBRA)'), # http://www.blueridgepbs.org/ + (r'watch\.easttennesseepbs\.org', 'East Tennessee PBS (WSJK)'), # http://easttennesseepbs.org + (r'video\.wcte\.tv', 'WCTE-TV (WCTE)'), # http://www.wcte.org + (r'video\.wljt\.org', 'WLJT, Channel 11 (WLJT)'), # http://wljt.org/ + (r'video\.wosu\.org', 'WOSU TV (WOSU)'), # http://wosu.org/ + (r'video\.woub\.org', 'WOUB/WOUC (WOUB)'), # http://woub.org/tv/index.php?section=5 + (r'video\.wvpublic\.org', 'WVPB (WVPB)'), # http://wvpublic.org/ + (r'video\.wkyupbs\.org', 'WKYU-PBS (WKYU)'), # http://www.wkyupbs.org + # (r'wyes\.org', 'WYES-TV/New Orleans (WYES)'), # http://www.wyes.org + (r'video\.kera\.org', 'KERA 13 (KERA)'), # http://www.kera.org/ + (r'video\.mpbn\.net', 'MPBN (WCBB)'), # http://www.mpbn.net/ + (r'video\.mountainlake\.org', 'Mountain Lake PBS (WCFE)'), # http://www.mountainlake.org/ + (r'video\.nhptv\.org', 'NHPTV (WENH)'), # http://nhptv.org/ + (r'video\.vpt\.org', 'Vermont PBS (WETK)'), # http://www.vpt.org + (r'video\.witf\.org', 'witf (WITF)'), # http://www.witf.org + (r'watch\.wqed\.org', 'WQED Multimedia (WQED)'), # http://www.wqed.org/ + (r'video\.wmht\.org', 'WMHT Educational Telecommunications (WMHT)'), # http://www.wmht.org/home/ + (r'video\.deltabroadcasting\.org', 'Q-TV (WDCQ)'), # http://www.deltabroadcasting.org + (r'video\.dptv\.org', 'WTVS Detroit Public TV (WTVS)'), # http://www.dptv.org/ + (r'video\.wcmu\.org', 'CMU Public Television (WCMU)'), # http://www.wcmu.org + (r'video\.wkar\.org', 'WKAR-TV (WKAR)'), # http://wkar.org/ + (r'wnmuvideo\.nmu\.edu', 'WNMU-TV Public TV 13 (WNMU)'), # http://wnmutv.nmu.edu + (r'video\.wdse\.org', 'WDSE - WRPT (WDSE)'), # http://www.wdse.org/ + (r'video\.wgte\.org', 'WGTE TV (WGTE)'), # http://www.wgte.org + (r'video\.lptv\.org', 'Lakeland Public Television (KAWE)'), # http://www.lakelandptv.org + # (r'prairiepublic\.org', 'PRAIRIE PUBLIC (KFME)'), # http://www.prairiepublic.org/ + (r'video\.kmos\.org', 'KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS)'), # http://www.kmos.org/ + (r'watch\.montanapbs\.org', 'MontanaPBS (KUSM)'), # http://montanapbs.org + (r'video\.krwg\.org', 'KRWG/Channel 22 (KRWG)'), # http://www.krwg.org + (r'video\.kacvtv\.org', 'KACV (KACV)'), # http://www.panhandlepbs.org/home/ + (r'video\.kcostv\.org', 'KCOS/Channel 13 (KCOS)'), # www.kcostv.org + (r'video\.wcny\.org', 'WCNY/Channel 24 (WCNY)'), # http://www.wcny.org + (r'video\.wned\.org', 'WNED (WNED)'), # http://www.wned.org/ + (r'watch\.wpbstv\.org', 'WPBS (WPBS)'), # http://www.wpbstv.org + (r'video\.wskg\.org', 'WSKG Public TV (WSKG)'), # http://wskg.org + (r'video\.wxxi\.org', 'WXXI (WXXI)'), # http://wxxi.org + (r'video\.wpsu\.org', 'WPSU (WPSU)'), # http://www.wpsu.org + # (r'wqln\.org', 'WQLN/Channel 54 (WQLN)'), # http://www.wqln.org + (r'on-demand\.wvia\.org', 'WVIA Public Media Studios (WVIA)'), # http://www.wvia.org/ + (r'video\.wtvi\.org', 'WTVI (WTVI)'), # http://www.wtvi.org/ + # (r'whro\.org', 'WHRO (WHRO)'), # http://whro.org + (r'video\.westernreservepublicmedia\.org', 'Western Reserve PBS (WNEO)'), # http://www.WesternReservePublicMedia.org/ + (r'video\.ideastream\.org', 'WVIZ/PBS ideastream (WVIZ)'), # http://www.wviz.org/ + (r'video\.kcts9\.org', 'KCTS 9 (KCTS)'), # http://kcts9.org/ + (r'video\.basinpbs\.org', 'Basin PBS (KPBT)'), # http://www.basinpbs.org + (r'video\.houstonpbs\.org', 'KUHT / Channel 8 (KUHT)'), # http://www.houstonpublicmedia.org/ + # (r'tamu\.edu', 'KAMU - TV (KAMU)'), # http://KAMU.tamu.edu + # (r'kedt\.org', 'KEDT/Channel 16 (KEDT)'), # http://www.kedt.org + (r'video\.klrn\.org', 'KLRN (KLRN)'), # http://www.klrn.org + (r'video\.klru\.tv', 'KLRU (KLRU)'), # http://www.klru.org + # (r'kmbh\.org', 'KMBH-TV (KMBH)'), # http://www.kmbh.org + # (r'knct\.org', 'KNCT (KNCT)'), # http://www.knct.org + # (r'ktxt\.org', 'KTTZ-TV (KTXT)'), # http://www.ktxt.org + (r'video\.wtjx\.org', 'WTJX Channel 12 (WTJX)'), # http://www.wtjx.org/ + (r'video\.ideastations\.org', 'WCVE PBS (WCVE)'), # http://ideastations.org/ + (r'video\.kbtc\.org', 'KBTC Public Television (KBTC)'), # http://kbtc.org ) IE_NAME = 'pbs' @@ -189,7 +189,7 @@ class PBSIE(InfoExtractor): # Player (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/ ) - ''' % '|'.join(re.escape(p) for p in list(zip(*_STATIONS))[0]) + ''' % '|'.join(list(zip(*_STATIONS))[0]) _TESTS = [ { diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 63cc764bb..514e9b433 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -31,9 +31,8 @@ class PeriscopeIE(InfoExtractor): }] def _call_api(self, method, value): - attribute = 'token' if len(value) > 13 else 'broadcast_id' return self._download_json( - 'https://api.periscope.tv/api/v2/%s?%s=%s' % (method, attribute, value), value) + 'https://api.periscope.tv/api/v2/%s?broadcast_id=%s' % (method, value), value) def _real_extract(self, url): token = self._match_id(url) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 7ff1d06c4..14f1ccbb4 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -8,20 +8,24 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( + ExtractorError, + determine_ext, parse_duration, unified_strdate, + int_or_none, + xpath_text, ) -class RaiIE(InfoExtractor): - _VALID_URL = r'(?P<url>(?P<host>http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it))/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html)' +class RaiTVIE(InfoExtractor): + _VALID_URL = r'http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' _TESTS = [ { 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html', - 'md5': 'c064c0b2d09c278fb293116ef5d0a32d', + 'md5': '96382709b61dd64a6b88e0f791e6df4c', 'info_dict': { 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Report del 07/04/2014', 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', 'upload_date': '20140407', @@ -30,16 +34,14 @@ class RaiIE(InfoExtractor): }, { 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', - 'md5': '8bb9c151924ce241b74dd52ef29ceafa', + 'md5': 'd9751b78eac9710d62c2447b224dea39', 'info_dict': { 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'TG PRIMO TEMPO', - 'description': '', 'upload_date': '20140612', 'duration': 1758, }, - 'skip': 'Error 404', }, { 'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html', @@ -55,110 +57,106 @@ class RaiIE(InfoExtractor): }, { 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-b4a49761-e0cc-4b14-8736-2729f6f73132-tg2.html', - 'md5': '35694f062977fe6619943f08ed935730', 'info_dict': { 'id': 'b4a49761-e0cc-4b14-8736-2729f6f73132', 'ext': 'mp4', 'title': 'Alluvione in Sardegna e dissesto idrogeologico', 'description': 'Edizione delle ore 20:30 ', - } + }, + 'skip': 'invalid urls', }, { 'url': 'http://www.ilcandidato.rai.it/dl/ray/media/Il-Candidato---Primo-episodio-Le-Primarie-28e5525a-b495-45e8-a7c3-bc48ba45d2b6.html', - 'md5': '02b64456f7cc09f96ff14e7dd489017e', + 'md5': '496ab63e420574447f70d02578333437', 'info_dict': { 'id': '28e5525a-b495-45e8-a7c3-bc48ba45d2b6', 'ext': 'flv', 'title': 'Il Candidato - Primo episodio: "Le Primarie"', - 'description': 'Primo appuntamento con "Il candidato" con Filippo Timi, alias Piero Zucca presidente!', - 'uploader': 'RaiTre', + 'description': 'md5:364b604f7db50594678f483353164fb8', + 'upload_date': '20140923', + 'duration': 386, } }, - { - 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', - 'md5': '037104d2c14132887e5e4cf114569214', - 'info_dict': { - 'id': '0c7a664b-d0f4-4b2c-8835-3f82e46f433e', - 'ext': 'flv', - 'title': 'Il pacco', - 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', - 'uploader': 'RaiTre', - 'upload_date': '20141221', - }, - } ] - def _extract_relinker_url(self, webpage): - return self._proto_relative_url(self._search_regex( - [r'name="videourl" content="([^"]+)"', r'var\s+videoURL(?:_MP4)?\s*=\s*"([^"]+)"'], - webpage, 'relinker url', default=None)) - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - host = mobj.group('host') + video_id = self._match_id(url) + media = self._download_json( + 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % video_id, + video_id, 'Downloading video JSON') - webpage = self._download_webpage(url, video_id) + thumbnails = [] + for image_type in ('image', 'image_medium', 'image_300'): + thumbnail_url = media.get(image_type) + if thumbnail_url: + thumbnails.append({ + 'url': thumbnail_url, + }) - relinker_url = self._extract_relinker_url(webpage) - - if not relinker_url: - iframe_url = self._search_regex( - [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"', - r'drawMediaRaiTV\(["\'](.+?)["\']'], - webpage, 'iframe') - if not iframe_url.startswith('http'): - iframe_url = compat_urlparse.urljoin(url, iframe_url) - webpage = self._download_webpage( - iframe_url, video_id) - relinker_url = self._extract_relinker_url(webpage) - - relinker = self._download_json( - '%s&output=47' % relinker_url, video_id) - - media_url = relinker['video'][0] - ct = relinker.get('ct') - if ct == 'f4m': - formats = self._extract_f4m_formats( - media_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id) - else: - formats = [{ - 'url': media_url, - 'format_id': ct, - }] + subtitles = [] + formats = [] + media_type = media['type'] + if 'Audio' in media_type: + formats.append({ + 'format_id': media.get('formatoAudio'), + 'url': media['audioUrl'], + 'ext': media.get('formatoAudio'), + }) + elif 'Video' in media_type: + def fix_xml(xml): + return xml.replace(' tag elementi', '').replace('>/', '</') + + relinker = self._download_xml( + media['mediaUri'] + '&output=43', video_id, transform_source=fix_xml) - json_link = self._html_search_meta( - 'jsonlink', webpage, 'JSON link', default=None) - if json_link: - media = self._download_json( - host + json_link, video_id, 'Downloading video JSON') - title = media.get('name') - description = media.get('desc') - thumbnail = media.get('image_300') or media.get('image_medium') or media.get('image') - duration = parse_duration(media.get('length')) - uploader = media.get('author') - upload_date = unified_strdate(media.get('date')) + has_subtitle = False + + for element in relinker.findall('element'): + media_url = xpath_text(element, 'url') + ext = determine_ext(media_url) + content_type = xpath_text(element, 'content-type') + if ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + media_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', + fatal=None) + if m3u8_formats: + formats.extend(m3u8_formats) + elif ext == 'f4m': + f4m_formats = self._extract_f4m_formats( + media_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', + video_id, f4m_id='hds', fatal=None) + if f4m_formats: + formats.extend(f4m_formats) + elif ext == 'stl': + has_subtitle = True + elif content_type.startswith('video/'): + bitrate = int_or_none(xpath_text(element, 'bitrate')) + formats.append({ + 'url': media_url, + 'tbr': bitrate if bitrate > 0 else None, + 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', + }) + elif content_type.startswith('image/'): + thumbnails.append({ + 'url': media_url, + }) + + self._sort_formats(formats) + + if has_subtitle: + webpage = self._download_webpage(url, video_id) + subtitles = self._get_subtitles(video_id, webpage) else: - title = (self._search_regex( - r'var\s+videoTitolo\s*=\s*"(.+?)";', - webpage, 'title', default=None) or self._og_search_title(webpage)).replace('\\"', '"') - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - duration = None - uploader = self._html_search_meta('Editore', webpage, 'uploader') - upload_date = unified_strdate(self._html_search_meta( - 'item-date', webpage, 'upload date', default=None)) - - subtitles = self.extract_subtitles(video_id, webpage) + raise ExtractorError('not a media file') return { 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'upload_date': upload_date, - 'duration': duration, + 'title': media['name'], + 'description': media.get('desc'), + 'thumbnails': thumbnails, + 'uploader': media.get('author'), + 'upload_date': unified_strdate(media.get('date')), + 'duration': parse_duration(media.get('length')), 'formats': formats, 'subtitles': subtitles, } @@ -177,3 +175,36 @@ class RaiIE(InfoExtractor): 'url': 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions), }] return subtitles + + +class RaiIE(InfoExtractor): + _VALID_URL = r'http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' + _TESTS = [ + { + 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', + 'md5': 'e0e7a8a131e249d1aa0ebf270d1d8db7', + 'info_dict': { + 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', + 'ext': 'flv', + 'title': 'Il pacco', + 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', + 'upload_date': '20141221', + }, + } + ] + + @classmethod + def suitable(cls, url): + return False if RaiTVIE.suitable(url) else super(RaiIE, cls).suitable(url) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + iframe_url = self._search_regex( + [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"', + r'drawMediaRaiTV\(["\'](.+?)["\']'], + webpage, 'iframe') + if not iframe_url.startswith('http'): + iframe_url = compat_urlparse.urljoin(url, iframe_url) + return self.url_result(iframe_url) diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py index d9df06861..f7fe1fece 100644 --- a/youtube_dl/extractor/rutv.py +++ b/youtube_dl/extractor/rutv.py @@ -131,7 +131,7 @@ class RUTVIE(InfoExtractor): is_live = video_type == 'live' json_data = self._download_json( - 'http://player.rutv.ru/iframe/%splay/id/%s' % ('live-' if is_live else '', video_id), + 'http://player.rutv.ru/iframe/data%s/id/%s' % ('live' if is_live else 'video', video_id), video_id, 'Downloading JSON') if json_data['errors']: diff --git a/youtube_dl/extractor/soompi.py b/youtube_dl/extractor/soompi.py deleted file mode 100644 index 5da66ca9e..000000000 --- a/youtube_dl/extractor/soompi.py +++ /dev/null @@ -1,146 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -import re - -from .crunchyroll import CrunchyrollIE - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - ExtractorError, - int_or_none, - remove_start, - xpath_text, -) - - -class SoompiBaseIE(InfoExtractor): - def _get_episodes(self, webpage, episode_filter=None): - episodes = self._parse_json( - self._search_regex( - r'VIDEOS\s*=\s*(\[.+?\]);', webpage, 'episodes JSON'), - None) - return list(filter(episode_filter, episodes)) - - -class SoompiIE(SoompiBaseIE, CrunchyrollIE): - IE_NAME = 'soompi' - _VALID_URL = r'https?://tv\.soompi\.com/(?:en/)?watch/(?P<id>[0-9]+)' - _TESTS = [{ - 'url': 'http://tv.soompi.com/en/watch/29235', - 'info_dict': { - 'id': '29235', - 'ext': 'mp4', - 'title': 'Episode 1096', - 'description': '2015-05-20' - }, - 'params': { - 'skip_download': True, - }, - }] - - def _get_episode(self, webpage, video_id): - return self._get_episodes(webpage, lambda x: x['id'] == video_id)[0] - - def _get_subtitles(self, config, video_id): - sub_langs = {} - for subtitle in config.findall('./{default}preload/subtitles/subtitle'): - sub_langs[subtitle.attrib['id']] = subtitle.attrib['title'] - - subtitles = {} - for s in config.findall('./{default}preload/subtitle'): - lang_code = sub_langs.get(s.attrib['id']) - if not lang_code: - continue - sub_id = s.get('id') - data = xpath_text(s, './data', 'data') - iv = xpath_text(s, './iv', 'iv') - if not id or not iv or not data: - continue - subtitle = self._decrypt_subtitles(data, iv, sub_id).decode('utf-8') - subtitles[lang_code] = self._extract_subtitles(subtitle) - return subtitles - - def _real_extract(self, url): - video_id = self._match_id(url) - - try: - webpage = self._download_webpage( - url, video_id, 'Downloading episode page') - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - webpage = ee.cause.read() - block_message = self._html_search_regex( - r'(?s)<div class="block-message">(.+?)</div>', webpage, - 'block message', default=None) - if block_message: - raise ExtractorError(block_message, expected=True) - raise - - formats = [] - config = None - for format_id in re.findall(r'\?quality=([0-9a-zA-Z]+)', webpage): - config = self._download_xml( - 'http://tv.soompi.com/en/show/_/%s-config.xml?mode=hls&quality=%s' % (video_id, format_id), - video_id, 'Downloading %s XML' % format_id) - m3u8_url = xpath_text( - config, './{default}preload/stream_info/file', - '%s m3u8 URL' % format_id) - if not m3u8_url: - continue - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', m3u8_id=format_id)) - self._sort_formats(formats) - - episode = self._get_episode(webpage, video_id) - - title = episode['name'] - description = episode.get('description') - duration = int_or_none(episode.get('duration')) - - thumbnails = [{ - 'id': thumbnail_id, - 'url': thumbnail_url, - } for thumbnail_id, thumbnail_url in episode.get('img_url', {}).items()] - - subtitles = self.extract_subtitles(config, video_id) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnails': thumbnails, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles - } - - -class SoompiShowIE(SoompiBaseIE): - IE_NAME = 'soompi:show' - _VALID_URL = r'https?://tv\.soompi\.com/en/shows/(?P<id>[0-9a-zA-Z\-_]+)' - _TESTS = [{ - 'url': 'http://tv.soompi.com/en/shows/liar-game', - 'info_dict': { - 'id': 'liar-game', - 'title': 'Liar Game', - 'description': 'md5:52c02bce0c1a622a95823591d0589b66', - }, - 'playlist_count': 14, - }] - - def _real_extract(self, url): - show_id = self._match_id(url) - - webpage = self._download_webpage( - url, show_id, 'Downloading show page') - - title = remove_start(self._og_search_title(webpage), 'SoompiTV | ') - description = self._og_search_description(webpage) - - entries = [ - self.url_result('http://tv.soompi.com/en/watch/%s' % episode['id'], 'Soompi') - for episode in self._get_episodes(webpage)] - - return self.playlist_result(entries, show_id, title, description) diff --git a/youtube_dl/extractor/srmediathek.py b/youtube_dl/extractor/srmediathek.py index 5d583c720..74d01183f 100644 --- a/youtube_dl/extractor/srmediathek.py +++ b/youtube_dl/extractor/srmediathek.py @@ -1,17 +1,18 @@ # encoding: utf-8 from __future__ import unicode_literals -import json +from .ard import ARDMediathekIE +from ..utils import ( + ExtractorError, + get_element_by_attribute, +) -from .common import InfoExtractor -from ..utils import js_to_json - -class SRMediathekIE(InfoExtractor): +class SRMediathekIE(ARDMediathekIE): IE_DESC = 'Saarländischer Rundfunk' _VALID_URL = r'https?://sr-mediathek\.sr-online\.de/index\.php\?.*?&id=(?P<id>[0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=28455', 'info_dict': { 'id': '28455', @@ -20,24 +21,36 @@ class SRMediathekIE(InfoExtractor): 'description': 'Ringen: KSV Köllerbach gegen Aachen-Walheim; Frauen-Fußball: 1. FC Saarbrücken gegen Sindelfingen; Motorsport: Rallye in Losheim; dazu: Interview mit Timo Bernhard; Turnen: TG Saar; Reitsport: Deutscher Voltigier-Pokal; Badminton: Interview mit Michael Fuchs ', 'thumbnail': 're:^https?://.*\.jpg$', }, - } + 'skip': 'no longer available', + }, { + 'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=37682', + 'info_dict': { + 'id': '37682', + 'ext': 'mp4', + 'title': 'Love, Cakes and Rock\'n\'Roll', + 'description': 'md5:18bf9763631c7d326c22603681e1123d', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest'] + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - murls = json.loads(js_to_json(self._search_regex( - r'var mediaURLs\s*=\s*(.*?);\n', webpage, 'video URLs'))) - formats = [{'url': murl} for murl in murls] - self._sort_formats(formats) - - title = json.loads(js_to_json(self._search_regex( - r'var mediaTitles\s*=\s*(.*?);\n', webpage, 'title')))[0] + if '>Der gewünschte Beitrag ist leider nicht mehr verfügbar.<' in webpage: + raise ExtractorError('Video %s is no longer available' % video_id, expected=True) - return { + media_collection_url = self._search_regex( + r'data-mediacollection-ardplayer="([^"]+)"', webpage, 'media collection url') + info = self._extract_media_info(media_collection_url, webpage, video_id) + info.update({ 'id': video_id, - 'title': title, - 'formats': formats, + 'title': get_element_by_attribute('class', 'ardplayer-title', webpage), 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), - } + }) + return info diff --git a/youtube_dl/extractor/tele13.py b/youtube_dl/extractor/tele13.py new file mode 100644 index 000000000..a363b4d40 --- /dev/null +++ b/youtube_dl/extractor/tele13.py @@ -0,0 +1,81 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..utils import ( + js_to_json, + qualities, + determine_ext, +) + + +class Tele13IE(InfoExtractor): + _VALID_URL = r'^http://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P<id>[\w-]+)' + _TESTS = [ + { + 'url': 'http://www.t13.cl/videos/actualidad/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', + 'md5': '4cb1fa38adcad8fea88487a078831755', + 'info_dict': { + 'id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', + 'ext': 'mp4', + 'title': 'El círculo de hierro de Michelle Bachelet en su regreso a La Moneda', + }, + 'params': { + # HTTP Error 404: Not Found + 'skip_download': True, + }, + }, + { + 'url': 'http://www.t13.cl/videos/mundo/tendencias/video-captan-misteriosa-bola-fuego-cielos-bangkok', + 'md5': '867adf6a3b3fef932c68a71d70b70946', + 'info_dict': { + 'id': 'rOoKv2OMpOw', + 'ext': 'mp4', + 'title': 'Shooting star seen on 7-Sep-2015', + 'description': 'md5:7292ff2a34b2f673da77da222ae77e1e', + 'uploader': 'Porjai Jaturongkhakun', + 'upload_date': '20150906', + 'uploader_id': 'UCnLY_3ezwNcDSC_Wc6suZxw', + }, + 'add_ie': ['Youtube'], + } + ] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + setup_js = self._search_regex(r"(?s)jwplayer\('player-vivo'\).setup\((\{.*?\})\)", webpage, 'setup code') + sources = self._parse_json(self._search_regex(r'sources\s*:\s*(\[[^\]]+\])', setup_js, 'sources'), display_id, js_to_json) + + preference = qualities(['Móvil', 'SD', 'HD']) + formats = [] + urls = [] + for f in sources: + format_url = f['file'] + if format_url and format_url not in urls: + ext = determine_ext(format_url) + if ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats(format_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + elif YoutubeIE.suitable(format_url): + return self.url_result(format_url, 'Youtube') + else: + formats.append({ + 'url': format_url, + 'format_id': f.get('label'), + 'preference': preference(f.get('label')), + 'ext': ext, + }) + urls.append(format_url) + self._sort_formats(formats) + + return { + 'id': display_id, + 'title': self._search_regex(r'title\s*:\s*"([^"]+)"', setup_js, 'title'), + 'description': self._html_search_meta('description', webpage, 'description'), + 'thumbnail': self._search_regex(r'image\s*:\s*"([^"]+)"', setup_js, 'thumbnail', default=None), + 'formats': formats, + } diff --git a/youtube_dl/extractor/theintercept.py b/youtube_dl/extractor/theintercept.py new file mode 100644 index 000000000..8cb3c3669 --- /dev/null +++ b/youtube_dl/extractor/theintercept.py @@ -0,0 +1,49 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + parse_iso8601, + int_or_none, + ExtractorError, +) + + +class TheInterceptIE(InfoExtractor): + _VALID_URL = r'https://theintercept.com/fieldofvision/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://theintercept.com/fieldofvision/thisisacoup-episode-four-surrender-or-die/', + 'md5': '145f28b41d44aab2f87c0a4ac8ec95bd', + 'info_dict': { + 'id': '46214', + 'ext': 'mp4', + 'title': '#ThisIsACoup – Episode Four: Surrender or Die', + 'description': 'md5:74dd27f0e2fbd50817829f97eaa33140', + 'timestamp': 1450429239, + 'upload_date': '20151218', + 'comment_count': int, + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + json_data = self._parse_json(self._search_regex( + r'initialStoreTree\s*=\s*(?P<json_data>{.+})', webpage, + 'initialStoreTree'), display_id) + + for post in json_data['resources']['posts'].values(): + if post['slug'] == display_id: + return { + '_type': 'url_transparent', + 'url': 'jwplatform:%s' % post['fov_videoid'], + 'id': compat_str(post['ID']), + 'display_id': display_id, + 'title': post['title'], + 'description': post.get('excerpt'), + 'timestamp': parse_iso8601(post.get('date')), + 'comment_count': int_or_none(post.get('comments_number')), + } + raise ExtractorError('Unable to find the current post') diff --git a/youtube_dl/extractor/toggle.py b/youtube_dl/extractor/toggle.py new file mode 100644 index 000000000..a47239952 --- /dev/null +++ b/youtube_dl/extractor/toggle.py @@ -0,0 +1,194 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + float_or_none, + int_or_none, + parse_iso8601, + sanitized_Request, +) + + +class ToggleIE(InfoExtractor): + IE_NAME = 'toggle' + _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:series|clips|movies)/(?:[^/]+/)+(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://video.toggle.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115', + 'info_dict': { + 'id': '343115', + 'ext': 'mp4', + 'title': 'Lion Moms Premiere', + 'description': 'md5:aea1149404bff4d7f7b6da11fafd8e6b', + 'upload_date': '20150910', + 'timestamp': 1441858274, + }, + 'params': { + 'skip_download': 'm3u8 download', + } + }, { + 'note': 'DRM-protected video', + 'url': 'http://video.toggle.sg/en/movies/dug-s-special-mission/341413', + 'info_dict': { + 'id': '341413', + 'ext': 'wvm', + 'title': 'Dug\'s Special Mission', + 'description': 'md5:e86c6f4458214905c1772398fabc93e0', + 'upload_date': '20150827', + 'timestamp': 1440644006, + }, + 'params': { + 'skip_download': 'DRM-protected wvm download', + } + }, { + # this also tests correct video id extraction + 'note': 'm3u8 links are geo-restricted, but Android/mp4 is okay', + 'url': 'http://video.toggle.sg/en/series/28th-sea-games-5-show/28th-sea-games-5-show-ep11/332861', + 'info_dict': { + 'id': '332861', + 'ext': 'mp4', + 'title': '28th SEA Games (5 Show) - Episode 11', + 'description': 'md5:3cd4f5f56c7c3b1340c50a863f896faa', + 'upload_date': '20150605', + 'timestamp': 1433480166, + }, + 'params': { + 'skip_download': 'DRM-protected wvm download', + }, + 'skip': 'm3u8 links are geo-restricted' + }, { + 'url': 'http://video.toggle.sg/en/clips/seraph-sun-aloysius-will-suddenly-sing-some-old-songs-in-high-pitch-on-set/343331', + 'only_matching': True, + }, { + 'url': 'http://video.toggle.sg/zh/series/zero-calling-s2-hd/ep13/336367', + 'only_matching': True, + }, { + 'url': 'http://video.toggle.sg/en/series/vetri-s2/webisodes/jeeva-is-an-orphan-vetri-s2-webisode-7/342302', + 'only_matching': True, + }, { + 'url': 'http://video.toggle.sg/en/movies/seven-days/321936', + 'only_matching': True, + }] + + _FORMAT_PREFERENCES = { + 'wvm-STBMain': -10, + 'wvm-iPadMain': -20, + 'wvm-iPhoneMain': -30, + 'wvm-Android': -40, + } + _API_USER = 'tvpapi_147' + _API_PASS = '11111' + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + url, video_id, note='Downloading video page') + + api_user = self._search_regex( + r'apiUser\s*:\s*(["\'])(?P<user>.+?)\1', webpage, 'apiUser', + default=self._API_USER, group='user') + api_pass = self._search_regex( + r'apiPass\s*:\s*(["\'])(?P<pass>.+?)\1', webpage, 'apiPass', + default=self._API_PASS, group='pass') + + params = { + 'initObj': { + 'Locale': { + 'LocaleLanguage': '', + 'LocaleCountry': '', + 'LocaleDevice': '', + 'LocaleUserState': 0 + }, + 'Platform': 0, + 'SiteGuid': 0, + 'DomainID': '0', + 'UDID': '', + 'ApiUser': api_user, + 'ApiPass': api_pass + }, + 'MediaID': video_id, + 'mediaType': 0, + } + + req = sanitized_Request( + 'http://tvpapi.as.tvinci.com/v2_9/gateways/jsonpostgw.aspx?m=GetMediaInfo', + json.dumps(params).encode('utf-8')) + info = self._download_json(req, video_id, 'Downloading video info json') + + title = info['MediaName'] + + formats = [] + for video_file in info.get('Files', []): + video_url, vid_format = video_file.get('URL'), video_file.get('Format') + if not video_url or not vid_format: + continue + ext = determine_ext(video_url) + vid_format = vid_format.replace(' ', '') + # if geo-restricted, m3u8 is inaccessible, but mp4 is okay + if ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + video_url, video_id, ext='mp4', m3u8_id=vid_format, + note='Downloading %s m3u8 information' % vid_format, + errnote='Failed to download %s m3u8 information' % vid_format, + fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + elif ext in ('mp4', 'wvm'): + # wvm are drm-protected files + formats.append({ + 'ext': ext, + 'url': video_url, + 'format_id': vid_format, + 'preference': self._FORMAT_PREFERENCES.get(ext + '-' + vid_format) or -1, + 'format_note': 'DRM-protected video' if ext == 'wvm' else None + }) + if not formats: + # Most likely because geo-blocked + raise ExtractorError('No downloadable videos found', expected=True) + self._sort_formats(formats) + + duration = int_or_none(info.get('Duration')) + description = info.get('Description') + created_at = parse_iso8601(info.get('CreationDate') or None) + + average_rating = float_or_none(info.get('Rating')) + view_count = int_or_none(info.get('ViewCounter') or info.get('view_counter')) + like_count = int_or_none(info.get('LikeCounter') or info.get('like_counter')) + + thumbnails = [] + for picture in info.get('Pictures', []): + if not isinstance(picture, dict): + continue + pic_url = picture.get('URL') + if not pic_url: + continue + thumbnail = { + 'url': pic_url, + } + pic_size = picture.get('PicSize', '') + m = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', pic_size) + if m: + thumbnail.update({ + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) + thumbnails.append(thumbnail) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': created_at, + 'average_rating': average_rating, + 'view_count': view_count, + 'like_count': like_count, + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index c1ee1decc..e03e2dbaa 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -5,6 +5,8 @@ from .common import InfoExtractor from ..utils import ( parse_iso8601, int_or_none, + xpath_attr, + xpath_element, ) @@ -15,7 +17,7 @@ class TwentyFourVideoIE(InfoExtractor): _TESTS = [ { 'url': 'http://www.24video.net/video/view/1044982', - 'md5': 'd041af8b5b4246ea466226a0d6693345', + 'md5': 'e09fc0901d9eaeedac872f154931deeb', 'info_dict': { 'id': '1044982', 'ext': 'mp4', @@ -64,33 +66,24 @@ class TwentyFourVideoIE(InfoExtractor): r'<div class="comments-title" id="comments-count">(\d+) комментари', webpage, 'comment count', fatal=False)) - formats = [] + # Sets some cookies + self._download_xml( + r'http://www.24video.net/video/xml/%s?mode=init' % video_id, + video_id, 'Downloading init XML') - pc_video = self._download_xml( + video_xml = self._download_xml( 'http://www.24video.net/video/xml/%s?mode=play' % video_id, - video_id, 'Downloading PC video URL').find('.//video') + video_id, 'Downloading video XML') - formats.append({ - 'url': pc_video.attrib['url'], - 'format_id': 'pc', - 'quality': 1, - }) + video = xpath_element(video_xml, './/video', 'video', fatal=True) - like_count = int_or_none(pc_video.get('ratingPlus')) - dislike_count = int_or_none(pc_video.get('ratingMinus')) - age_limit = 18 if pc_video.get('adult') == 'true' else 0 + formats = [{ + 'url': xpath_attr(video, '', 'url', 'video URL', fatal=True), + }] - mobile_video = self._download_xml( - 'http://www.24video.net/video/xml/%s' % video_id, - video_id, 'Downloading mobile video URL').find('.//video') - - formats.append({ - 'url': mobile_video.attrib['url'], - 'format_id': 'mobile', - 'quality': 0, - }) - - self._sort_formats(formats) + like_count = int_or_none(video.get('ratingPlus')) + dislike_count = int_or_none(video.get('ratingMinus')) + age_limit = 18 if video.get('adult') == 'true' else 0 return { 'id': video_id, diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index f38a72fde..811ee197d 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -4,26 +4,48 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from .xstream import XstreamIE from ..utils import ( ExtractorError, float_or_none, ) -class VGTVIE(InfoExtractor): - IE_DESC = 'VGTV and BTTV' +class VGTVIE(XstreamIE): + IE_DESC = 'VGTV, BTTV, FTV, Aftenposten and Aftonbladet' + + _HOST_TO_APPNAME = { + 'vgtv.no': 'vgtv', + 'bt.no/tv': 'bttv', + 'aftenbladet.no/tv': 'satv', + 'fvn.no/fvntv': 'fvntv', + 'aftenposten.no/webtv': 'aptv', + } + + _APP_NAME_TO_VENDOR = { + 'vgtv': 'vgtv', + 'bttv': 'bt', + 'satv': 'sa', + 'fvntv': 'fvn', + 'aptv': 'ap', + } + _VALID_URL = r'''(?x) - (?: - vgtv:| - http://(?:www\.)? + (?:https?://(?:www\.)? + (?P<host> + %s ) - (?P<host>vgtv|bt) + / (?: - :| - \.no/(?:tv/)?\#!/(?:video|live)/ - ) - (?P<id>[0-9]+) - ''' + \#!/(?:video|live)/| + embed?.*id= + )| + (?P<appname> + %s + ):) + (?P<id>\d+) + ''' % ('|'.join(_HOST_TO_APPNAME.keys()), '|'.join(_APP_NAME_TO_VENDOR.keys())) + _TESTS = [ { # streamType: vod @@ -59,17 +81,18 @@ class VGTVIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + 'skip': 'Video is no longer available', }, { - # streamType: live + # streamType: wasLive 'url': 'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla', 'info_dict': { 'id': '113063', - 'ext': 'flv', - 'title': 're:^DIREKTE: V75 fra Solvalla [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'ext': 'mp4', + 'title': 'V75 fra Solvalla 30.05.15', 'description': 'md5:b3743425765355855f88e096acc93231', 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 0, + 'duration': 25966, 'timestamp': 1432975582, 'upload_date': '20150530', 'view_count': int, @@ -80,6 +103,20 @@ class VGTVIE(InfoExtractor): }, }, { + 'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', + 'md5': 'fd828cd29774a729bf4d4425fe192972', + 'info_dict': { + 'id': '21039', + 'ext': 'mov', + 'title': 'TRAILER: «SWEATSHOP» - I can´t take any more', + 'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238', + 'duration': 66, + 'timestamp': 1417002452, + 'upload_date': '20141126', + 'view_count': int, + } + }, + { 'url': 'http://www.bt.no/tv/#!/video/100250/norling-dette-er-forskjellen-paa-1-divisjon-og-eliteserien', 'only_matching': True, }, @@ -89,21 +126,27 @@ class VGTVIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') host = mobj.group('host') - - HOST_WEBSITES = { - 'vgtv': 'vgtv', - 'bt': 'bttv', - } + appname = self._HOST_TO_APPNAME[host] if host else mobj.group('appname') + vendor = self._APP_NAME_TO_VENDOR[appname] data = self._download_json( 'http://svp.vg.no/svp/api/v1/%s/assets/%s?appName=%s-website' - % (host, video_id, HOST_WEBSITES[host]), + % (vendor, video_id, appname), video_id, 'Downloading media JSON') if data.get('status') == 'inactive': raise ExtractorError( 'Video %s is no longer available' % video_id, expected=True) + info = { + 'formats': [], + } + if len(video_id) == 5: + if appname == 'bttv': + info = self._extract_video_info('btno', video_id) + elif appname == 'aptv': + info = self._extract_video_info('ap', video_id) + streams = data['streamUrls'] stream_type = data.get('streamType') @@ -111,48 +154,53 @@ class VGTVIE(InfoExtractor): hls_url = streams.get('hls') if hls_url: - formats.extend(self._extract_m3u8_formats( - hls_url, video_id, 'mp4', m3u8_id='hls')) + m3u8_formats = self._extract_m3u8_formats( + hls_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) hds_url = streams.get('hds') # wasLive hds are always 404 if hds_url and stream_type != 'wasLive': - formats.extend(self._extract_f4m_formats( - hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', - video_id, f4m_id='hds')) + f4m_formats = self._extract_f4m_formats( + hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + mp4_urls = streams.get('pseudostreaming') or [] mp4_url = streams.get('mp4') if mp4_url: - _url = hls_url or hds_url - MP4_URL_TEMPLATE = '%s/%%s.%s' % (mp4_url.rpartition('/')[0], mp4_url.rpartition('.')[-1]) - for mp4_format in _url.split(','): - m = re.search('(?P<width>\d+)_(?P<height>\d+)_(?P<vbr>\d+)', mp4_format) - if not m: - continue - width = int(m.group('width')) - height = int(m.group('height')) - vbr = int(m.group('vbr')) - formats.append({ - 'url': MP4_URL_TEMPLATE % mp4_format, - 'format_id': 'mp4-%s' % vbr, - 'width': width, - 'height': height, - 'vbr': vbr, - 'preference': 1, + mp4_urls.append(mp4_url) + for mp4_url in mp4_urls: + format_info = { + 'url': mp4_url, + } + mobj = re.search('(\d+)_(\d+)_(\d+)', mp4_url) + if mobj: + tbr = int(mobj.group(3)) + format_info.update({ + 'width': int(mobj.group(1)), + 'height': int(mobj.group(2)), + 'tbr': tbr, + 'format_id': 'mp4-%s' % tbr, }) - self._sort_formats(formats) + formats.append(format_info) + + info['formats'].extend(formats) + + self._sort_formats(info['formats']) - return { + info.update({ 'id': video_id, - 'title': self._live_title(data['title']), + 'title': self._live_title(data['title']) if stream_type == 'live' else data['title'], 'description': data['description'], 'thumbnail': data['images']['main'] + '?t[]=900x506q80', 'timestamp': data['published'], 'duration': float_or_none(data['duration'], 1000), 'view_count': data['displays'], - 'formats': formats, 'is_live': True if stream_type == 'live' else False, - } + }) + return info class BTArticleIE(InfoExtractor): @@ -161,7 +209,7 @@ class BTArticleIE(InfoExtractor): _VALID_URL = 'http://(?:www\.)?bt\.no/(?:[^/]+/)+(?P<id>[^/]+)-\d+\.html' _TEST = { 'url': 'http://www.bt.no/nyheter/lokalt/Kjemper-for-internatet-1788214.html', - 'md5': 'd055e8ee918ef2844745fcfd1a4175fb', + 'md5': '2acbe8ad129b3469d5ae51b1158878df', 'info_dict': { 'id': '23199', 'ext': 'mp4', @@ -178,15 +226,15 @@ class BTArticleIE(InfoExtractor): def _real_extract(self, url): webpage = self._download_webpage(url, self._match_id(url)) video_id = self._search_regex( - r'SVP\.Player\.load\(\s*(\d+)', webpage, 'video id') - return self.url_result('vgtv:bt:%s' % video_id, 'VGTV') + r'<video[^>]+data-id="(\d+)"', webpage, 'video id') + return self.url_result('bttv:%s' % video_id, 'VGTV') class BTVestlendingenIE(InfoExtractor): IE_NAME = 'bt:vestlendingen' IE_DESC = 'Bergens Tidende - Vestlendingen' _VALID_URL = 'http://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.bt.no/spesial/vestlendingen/#!/86588', 'md5': 'd7d17e3337dc80de6d3a540aefbe441b', 'info_dict': { @@ -197,7 +245,19 @@ class BTVestlendingenIE(InfoExtractor): 'timestamp': 1430473209, 'upload_date': '20150501', }, - } + 'skip': '404 Error', + }, { + 'url': 'http://www.bt.no/spesial/vestlendingen/#!/86255', + 'md5': 'a2893f8632e96389f4bdf36aa9463ceb', + 'info_dict': { + 'id': '86255', + 'ext': 'mov', + 'title': 'Du må tåle å fryse og være sulten', + 'description': 'md5:b8046f4d022d5830ddab04865791d063', + 'upload_date': '20150321', + 'timestamp': 1426942023, + }, + }] def _real_extract(self, url): - return self.url_result('xstream:btno:%s' % self._match_id(url), 'Xstream') + return self.url_result('bttv:%s' % self._match_id(url), 'VGTV') diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index a63c23617..ca3f20a3d 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -30,6 +30,12 @@ class VikiBaseIE(InfoExtractor): _token = None + _ERRORS = { + 'geo': 'Sorry, this content is not available in your region.', + 'upcoming': 'Sorry, this content is not yet available.', + # 'paywall': 'paywall', + } + def _prepare_call(self, path, timestamp=None, post_data=None): path += '?' if '?' not in path else '&' if not timestamp: @@ -67,6 +73,12 @@ class VikiBaseIE(InfoExtractor): '%s returned error: %s' % (self.IE_NAME, error), expected=True) + def _check_errors(self, data): + for reason, status in data.get('blocking', {}).items(): + if status and reason in self._ERRORS: + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, self._ERRORS[reason]), expected=True) + def _real_initialize(self): self._login() @@ -193,6 +205,7 @@ class VikiIE(VikiBaseIE): 'timestamp': 1321985454, 'description': 'md5:44b1e46619df3a072294645c770cef36', 'title': 'Love In Magic', + 'age_limit': 13, }, }] @@ -202,6 +215,8 @@ class VikiIE(VikiBaseIE): video = self._call_api( 'videos/%s.json' % video_id, video_id, 'Downloading video JSON') + self._check_errors(video) + title = self.dict_selection(video.get('titles', {}), 'en') if not title: title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id @@ -262,8 +277,11 @@ class VikiIE(VikiBaseIE): r'^(\d+)[pP]$', format_id, 'height', default=None)) for protocol, format_dict in stream_dict.items(): if format_id == 'm3u8': - formats = self._extract_m3u8_formats( - format_dict['url'], video_id, 'mp4', m3u8_id='m3u8-%s' % protocol) + m3u8_formats = self._extract_m3u8_formats( + format_dict['url'], video_id, 'mp4', 'm3u8_native', + m3u8_id='m3u8-%s' % protocol, fatal=None) + if m3u8_formats: + formats.extend(m3u8_formats) else: formats.append({ 'url': format_dict['url'], @@ -315,6 +333,8 @@ class VikiChannelIE(VikiBaseIE): 'containers/%s.json' % channel_id, channel_id, 'Downloading channel JSON') + self._check_errors(channel) + title = self.dict_selection(channel['titles'], 'en') description = self.dict_selection(channel['descriptions'], 'en') diff --git a/youtube_dl/extractor/xstream.py b/youtube_dl/extractor/xstream.py index 71584c291..76c91bd92 100644 --- a/youtube_dl/extractor/xstream.py +++ b/youtube_dl/extractor/xstream.py @@ -42,11 +42,7 @@ class XstreamIE(InfoExtractor): 'only_matching': True, }] - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - partner_id = mobj.group('partner_id') - video_id = mobj.group('id') - + def _extract_video_info(self, partner_id, video_id): data = self._download_xml( 'http://frontend.xstream.dk/%s/feed/video/?platform=web&id=%s' % (partner_id, video_id), @@ -97,6 +93,7 @@ class XstreamIE(InfoExtractor): formats.append({ 'url': link.get('href'), 'format_id': link.get('rel'), + 'preference': 1, }) thumbnails = [{ @@ -113,3 +110,10 @@ class XstreamIE(InfoExtractor): 'formats': formats, 'thumbnails': thumbnails, } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + partner_id = mobj.group('partner_id') + video_id = mobj.group('id') + + return self._extract_video_info(partner_id, video_id) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4556a16fb..4aac2cc03 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -26,6 +26,7 @@ from ..compat import ( from ..utils import ( clean_html, encode_dict, + error_to_compat_str, ExtractorError, float_or_none, get_element_by_attribute, @@ -903,7 +904,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, video_id, note=False) except ExtractorError as err: - self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err)) + self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err)) return {} sub_lang_list = {} @@ -1774,6 +1775,10 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): }, }] + @classmethod + def suitable(cls, url): + return False if YoutubePlaylistsIE.suitable(url) else super(YoutubeChannelIE, cls).suitable(url) + def _real_extract(self, url): channel_id = self._match_id(url) @@ -1847,10 +1852,10 @@ class YoutubeUserIE(YoutubeChannelIE): return super(YoutubeUserIE, cls).suitable(url) -class YoutubeUserPlaylistsIE(YoutubePlaylistsBaseInfoExtractor): - IE_DESC = 'YouTube.com user playlists' - _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/user/(?P<id>[^/]+)/playlists' - IE_NAME = 'youtube:user:playlists' +class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): + IE_DESC = 'YouTube.com user/channel playlists' + _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists' + IE_NAME = 'youtube:playlists' _TESTS = [{ 'url': 'http://www.youtube.com/user/ThirstForScience/playlists', @@ -1867,6 +1872,13 @@ class YoutubeUserPlaylistsIE(YoutubePlaylistsBaseInfoExtractor): 'id': 'igorkle1', 'title': 'Игорь Клейнер', }, + }, { + 'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists', + 'playlist_mincount': 17, + 'info_dict': { + 'id': 'UCiU1dHvZObB2iP6xkJ__Icw', + 'title': 'Chem Player', + }, }] diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index a795f56b3..9a3331a69 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -131,14 +131,23 @@ class ZDFIE(InfoExtractor): class ZDFChannelIE(InfoExtractor): - _VALID_URL = r'(?:zdf:topic:|https?://www\.zdf\.de/ZDFmediathek(?:#)?/.*kanaluebersicht/)(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'(?:zdf:topic:|https?://www\.zdf\.de/ZDFmediathek(?:#)?/.*kanaluebersicht/(?:[^/]+/)?)(?P<id>[0-9]+)' + _TESTS = [{ 'url': 'http://www.zdf.de/ZDFmediathek#/kanaluebersicht/1586442/sendung/Titanic', 'info_dict': { 'id': '1586442', }, 'playlist_count': 3, - } + }, { + 'url': 'http://www.zdf.de/ZDFmediathek/kanaluebersicht/aktuellste/332', + 'only_matching': True, + }, { + 'url': 'http://www.zdf.de/ZDFmediathek/kanaluebersicht/meist-gesehen/332', + 'only_matching': True, + }, { + 'url': 'http://www.zdf.de/ZDFmediathek/kanaluebersicht/_/1798716?bc=nrt;nrm?flash=off', + 'only_matching': True, + }] _PAGE_SIZE = 50 def _fetch_page(self, channel_id, page): diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 2191e8b89..a7440c582 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -232,7 +232,7 @@ class JSInterpreter(object): def extract_function(self, funcname): func_m = re.search( r'''(?x) - (?:function\s+%s|[{;]%s\s*=\s*function|var\s+%s\s*=\s*function)\s* + (?:function\s+%s|[{;,]%s\s*=\s*function|var\s+%s\s*=\s*function)\s* \((?P<args>[^)]*)\)\s* \{(?P<code>[^}]+)\}''' % ( re.escape(funcname), re.escape(funcname), re.escape(funcname)), diff --git a/youtube_dl/update.py b/youtube_dl/update.py index 074eb64a7..995b8ed96 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -9,7 +9,7 @@ import subprocess import sys from zipimport import zipimporter -from .compat import compat_str +from .utils import encode_compat_str from .version import __version__ @@ -61,7 +61,7 @@ def update_self(to_screen, verbose, opener): newversion = opener.open(VERSION_URL).read().decode('utf-8').strip() except Exception: if verbose: - to_screen(compat_str(traceback.format_exc())) + to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: can\'t find the current version. Please try again later.') return if newversion == __version__: @@ -74,7 +74,7 @@ def update_self(to_screen, verbose, opener): versions_info = json.loads(versions_info) except Exception: if verbose: - to_screen(compat_str(traceback.format_exc())) + to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: can\'t obtain versions info. Please try again later.') return if 'signature' not in versions_info: @@ -123,7 +123,7 @@ def update_self(to_screen, verbose, opener): urlh.close() except (IOError, OSError): if verbose: - to_screen(compat_str(traceback.format_exc())) + to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: unable to download latest version') return @@ -137,7 +137,7 @@ def update_self(to_screen, verbose, opener): outf.write(newcontent) except (IOError, OSError): if verbose: - to_screen(compat_str(traceback.format_exc())) + to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: unable to write the new version') return @@ -157,7 +157,7 @@ start /b "" cmd /c del "%%~f0"&exit /b" return # Do not show premature success messages except (IOError, OSError): if verbose: - to_screen(compat_str(traceback.format_exc())) + to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: unable to overwrite current version') return @@ -169,7 +169,7 @@ start /b "" cmd /c del "%%~f0"&exit /b" urlh.close() except (IOError, OSError): if verbose: - to_screen(compat_str(traceback.format_exc())) + to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: unable to download latest version') return @@ -183,7 +183,7 @@ start /b "" cmd /c del "%%~f0"&exit /b" outf.write(newcontent) except (IOError, OSError): if verbose: - to_screen(compat_str(traceback.format_exc())) + to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: unable to overwrite current version') return diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 91917fc96..1737ac5f6 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1712,6 +1712,10 @@ def encode_dict(d, encoding='utf-8'): return dict((encode(k), encode(v)) for k, v in d.items()) +def encode_compat_str(string, encoding=preferredencoding(), errors='strict'): + return string if isinstance(string, compat_str) else compat_str(string, encoding, errors) + + US_RATINGS = { 'G': 0, 'PG': 10, @@ -1806,6 +1810,15 @@ def args_to_str(args): return ' '.join(shlex_quote(a) for a in args) +def error_to_compat_str(err): + err_str = str(err) + # On python 2 error byte string must be decoded with proper + # encoding rather than ascii + if sys.version_info[0] < 3: + err_str = err_str.decode(preferredencoding()) + return err_str + + def mimetype2ext(mt): _, _, res = mt.rpartition('/') @@ -1976,15 +1989,15 @@ def match_filter_func(filter_str): def parse_dfxp_time_expr(time_expr): if not time_expr: - return 0.0 + return mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr) if mobj: return float(mobj.group('time_offset')) - mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr) + mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr) if mobj: - return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3)) + return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.')) def srt_subtitles_timecode(seconds): @@ -2020,10 +2033,15 @@ def dfxp2srt(dfxp_data): raise ValueError('Invalid dfxp/TTML subtitle') for para, index in zip(paras, itertools.count(1)): - begin_time = parse_dfxp_time_expr(para.attrib['begin']) + begin_time = parse_dfxp_time_expr(para.attrib.get('begin')) end_time = parse_dfxp_time_expr(para.attrib.get('end')) + dur = parse_dfxp_time_expr(para.attrib.get('dur')) + if begin_time is None: + continue if not end_time: - end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur']) + if not dur: + continue + end_time = begin_time + dur out.append('%d\n%s --> %s\n%s\n\n' % ( index, srt_subtitles_timecode(begin_time), diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 20b44b94d..255d64269 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.12.13' +__version__ = '2015.12.23' |