diff options
Diffstat (limited to 'youtube_dl/extractor')
94 files changed, 3585 insertions, 1813 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index c1dd87550..971047ad4 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -15,7 +15,6 @@ from .adobetv import ( AdobeTVVideoIE, ) from .adultswim import AdultSwimIE -from .aftenposten import AftenpostenIE from .aftonbladet import AftonbladetIE from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE @@ -26,7 +25,10 @@ from .aol import AolIE from .allocine import AllocineIE from .aparat import AparatIE from .appleconnect import AppleConnectIE -from .appletrailers import AppleTrailersIE +from .appletrailers import ( + AppleTrailersIE, + AppleTrailersSectionIE, +) from .archiveorg import ArchiveOrgIE from .ard import ( ARDIE, @@ -61,8 +63,11 @@ from .beatportpro import BeatportProIE from .bet import BetIE from .bild import BildIE from .bilibili import BiliBiliIE +from .bleacherreport import ( + BleacherReportIE, + BleacherReportCMSIE, +) from .blinkx import BlinkxIE -from .bliptv import BlipTVIE, BlipTVUserIE from .bloomberg import BloombergIE from .bpb import BpbIE from .br import BRIE @@ -78,7 +83,6 @@ from .camdemy import ( CamdemyIE, CamdemyFolderIE ) -from .canal13cl import Canal13clIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .cbs import CBSIE @@ -131,7 +135,12 @@ from .dailymotion import ( ) from .daum import DaumIE from .dbtv import DBTVIE -from .dcn import DCNIE +from .dcn import ( + DCNIE, + DCNVideoIE, + DCNLiveIE, + DCNSeasonIE, +) from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE from .democracynow import DemocracynowIE @@ -206,6 +215,7 @@ from .francetv import ( from .freesound import FreesoundIE from .freespeech import FreespeechIE from .freevideo import FreeVideoIE +from .funimation import FunimationIE from .funnyordie import FunnyOrDieIE from .gameinformer import GameInformerIE from .gamekings import GamekingsIE @@ -231,9 +241,11 @@ from .globo import ( from .godtube import GodTubeIE from .goldenmoustache import GoldenMoustacheIE from .golem import GolemIE +from .googledrive import GoogleDriveIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .goshgay import GoshgayIE +from .gputechconf import GPUTechConfIE from .groupon import GrouponIE from .hark import HarkIE from .hearthisat import HearThisAtIE @@ -246,12 +258,17 @@ from .history import HistoryIE from .hitbox import HitboxIE, HitboxLiveIE from .hornbunny import HornBunnyIE from .hotnewhiphop import HotNewHipHopIE +from .hotstar import HotStarIE from .howcast import HowcastIE from .howstuffworks import HowStuffWorksIE from .huffpost import HuffPostIE from .hypem import HypemIE from .iconosquare import IconosquareIE -from .ign import IGNIE, OneUPIE +from .ign import ( + IGNIE, + OneUPIE, + PCMagIE, +) from .imdb import ( ImdbIE, ImdbListIE @@ -280,6 +297,7 @@ from .jadorecettepub import JadoreCettePubIE from .jeuxvideo import JeuxVideoIE from .jove import JoveIE from .jukebox import JukeboxIE +from .jwplatform import JWPlatformIE from .jpopsukitv import JpopsukiIE from .kaltura import KalturaIE from .kanalplay import KanalPlayIE @@ -334,6 +352,7 @@ from .lynda import ( from .m6 import M6IE from .macgamestore import MacGameStoreIE from .mailru import MailRuIE +from .makertv import MakerTVIE from .malemotion import MalemotionIE from .mdr import MDRIE from .metacafe import MetacafeIE @@ -357,7 +376,6 @@ from .motherless import MotherlessIE from .motorsport import MotorsportIE from .movieclips import MovieClipsIE from .moviezine import MoviezineIE -from .movshare import MovShareIE from .mtv import ( MTVIE, MTVServicesEmbeddedIE, @@ -423,7 +441,13 @@ from .noco import NocoIE from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE from .nova import NovaIE -from .novamov import NovaMovIE +from .novamov import ( + NovaMovIE, + WholeCloudIE, + NowVideoIE, + VideoWeedIE, + CloudTimeIE, +) from .nowness import ( NownessIE, NownessPlaylistIE, @@ -433,7 +457,6 @@ from .nowtv import ( NowTVIE, NowTVListIE, ) -from .nowvideo import NowVideoIE from .npo import ( NPOIE, NPOLiveIE, @@ -514,7 +537,10 @@ from .radiode import RadioDeIE from .radiojavan import RadioJavanIE from .radiobremen import RadioBremenIE from .radiofrance import RadioFranceIE -from .rai import RaiIE +from .rai import ( + RaiTVIE, + RaiIE, +) from .rbmaradio import RBMARadioIE from .rds import RDSIE from .redtube import RedTubeIE @@ -580,10 +606,6 @@ from .snagfilms import ( ) from .snotr import SnotrIE from .sohu import SohuIE -from .soompi import ( - SoompiIE, - SoompiShowIE, -) from .soundcloud import ( SoundcloudIE, SoundcloudSetIE, @@ -642,6 +664,7 @@ from .teachingchannel import TeachingChannelIE from .teamcoco import TeamcocoIE from .techtalks import TechTalksIE from .ted import TEDIE +from .tele13 import Tele13IE from .telebruxelles import TeleBruxellesIE from .telecinco import TelecincoIE from .telegraaf import TelegraafIE @@ -651,6 +674,7 @@ from .tenplay import TenPlayIE from .testurl import TestURLIE from .testtube import TestTubeIE from .tf1 import TF1IE +from .theintercept import TheInterceptIE from .theonion import TheOnionIE from .theplatform import ( ThePlatformIE, @@ -670,6 +694,7 @@ from .tnaflix import ( EMPFlixIE, MovieFapIE, ) +from .toggle import ToggleIE from .thvideo import ( THVideoIE, THVideoPlaylistIE @@ -683,7 +708,13 @@ from .tube8 import Tube8IE from .tubitv import TubiTvIE from .tudou import TudouIE from .tumblr import TumblrIE -from .tunein import TuneInIE +from .tunein import ( + TuneInClipIE, + TuneInStationIE, + TuneInProgramIE, + TuneInTopicIE, + TuneInShortenerIE, +) from .turbo import TurboIE from .tutv import TutvIE from .tv2 import ( @@ -744,7 +775,6 @@ from .videofyme import VideofyMeIE from .videomega import VideoMegaIE from .videopremium import VideoPremiumIE from .videott import VideoTtIE -from .videoweed import VideoWeedIE from .vidme import VidmeIE from .vidzi import VidziIE from .vier import VierIE, VierVideosIE @@ -846,7 +876,7 @@ from .youtube import ( YoutubeTruncatedIDIE, YoutubeTruncatedURLIE, YoutubeUserIE, - YoutubeUserPlaylistsIE, + YoutubePlaylistsIE, YoutubeWatchLaterIE, ) from .zapiks import ZapiksIE diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index c0e5d1abf..6a29e587f 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -23,6 +23,7 @@ class ABCIE(InfoExtractor): 'title': 'Australia to help staff Ebola treatment centre in Sierra Leone', 'description': 'md5:809ad29c67a05f54eb41f2a105693a67', }, + 'skip': 'this video has expired', }, { 'url': 'http://www.abc.net.au/news/2015-08-17/warren-entsch-introduces-same-sex-marriage-bill/6702326', 'md5': 'db2a5369238b51f9811ad815b69dc086', @@ -36,6 +37,7 @@ class ABCIE(InfoExtractor): 'title': 'Marriage Equality: Warren Entsch introduces same sex marriage bill', }, 'add_ie': ['Youtube'], + 'skip': 'Not accessible from Travis CI server', }, { 'url': 'http://www.abc.net.au/news/2015-10-23/nab-lifts-interest-rates-following-westpac-and-cba/6880080', 'md5': 'b96eee7c9edf4fc5a358a0252881cc1f', @@ -58,6 +60,9 @@ class ABCIE(InfoExtractor): r'inline(?P<type>Video|Audio|YouTube)Data\.push\((?P<json_data>[^)]+)\);', webpage) if mobj is None: + expired = self._html_search_regex(r'(?s)class="expired-(?:video|audio)".+?<span>(.+?)</span>', webpage, 'expired', None) + if expired: + raise ExtractorError('%s said: %s' % (self.IE_NAME, expired), expected=True) raise ExtractorError('Unable to extract video urls') urls_info = self._parse_json( diff --git a/youtube_dl/extractor/abc7news.py b/youtube_dl/extractor/abc7news.py index c04949c21..122dc9099 100644 --- a/youtube_dl/extractor/abc7news.py +++ b/youtube_dl/extractor/abc7news.py @@ -44,7 +44,6 @@ class Abc7NewsIE(InfoExtractor): 'contentURL', webpage, 'm3u8 url', fatal=True) formats = self._extract_m3u8_formats(m3u8, display_id, 'mp4') - self._sort_formats(formats) title = self._og_search_title(webpage).strip() description = self._og_search_description(webpage).strip() diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 3ae618e71..bf21a6887 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -68,7 +68,7 @@ class AdultSwimIE(InfoExtractor): 'md5': '3e346a2ab0087d687a05e1e7f3b3e529', 'info_dict': { 'id': 'sY3cMUR_TbuE4YmdjzbIcQ-0', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', }, @@ -79,6 +79,10 @@ class AdultSwimIE(InfoExtractor): 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', }, + 'params': { + # m3u8 download + 'skip_download': True, + } }] @staticmethod diff --git a/youtube_dl/extractor/aftenposten.py b/youtube_dl/extractor/aftenposten.py deleted file mode 100644 index 0c00acfb5..000000000 --- a/youtube_dl/extractor/aftenposten.py +++ /dev/null @@ -1,23 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class AftenpostenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?aftenposten\.no/webtv/(?:#!/)?video/(?P<id>\d+)' - _TEST = { - 'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', - 'md5': 'fd828cd29774a729bf4d4425fe192972', - 'info_dict': { - 'id': '21039', - 'ext': 'mov', - 'title': 'TRAILER: "Sweatshop" - I can´t take any more', - 'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238', - 'timestamp': 1416927969, - 'upload_date': '20141125', - } - } - - def _real_extract(self, url): - return self.url_result('xstream:ap:%s' % self._match_id(url), 'Xstream') diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py new file mode 100644 index 000000000..dcc3c97f1 --- /dev/null +++ b/youtube_dl/extractor/amp.py @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) + + +class AMPIE(InfoExtractor): + # parse Akamai Adaptive Media Player feed + def _extract_feed_info(self, url): + item = self._download_json( + url, None, 'Downloading Akamai AMP feed', + 'Unable to download Akamai AMP feed')['channel']['item'] + + video_id = item['guid'] + + def get_media_node(name, default=None): + media_name = 'media-%s' % name + media_group = item.get('media-group') or item + return media_group.get(media_name) or item.get(media_name) or item.get(name, default) + + thumbnails = [] + media_thumbnail = get_media_node('thumbnail') + if media_thumbnail: + if isinstance(media_thumbnail, dict): + media_thumbnail = [media_thumbnail] + for thumbnail_data in media_thumbnail: + thumbnail = thumbnail_data['@attributes'] + thumbnails.append({ + 'url': self._proto_relative_url(thumbnail['url'], 'http:'), + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + subtitles = {} + media_subtitle = get_media_node('subTitle') + if media_subtitle: + if isinstance(media_subtitle, dict): + media_subtitle = [media_subtitle] + for subtitle_data in media_subtitle: + subtitle = subtitle_data['@attributes'] + lang = subtitle.get('lang') or 'en' + subtitles[lang] = [{'url': subtitle['href']}] + + formats = [] + media_content = get_media_node('content') + if isinstance(media_content, dict): + media_content = [media_content] + for media_data in media_content: + media = media_data['@attributes'] + media_type = media['type'] + if media_type == 'video/f4m': + f4m_formats = self._extract_f4m_formats( + media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', + video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + elif media_type == 'application/x-mpegURL': + m3u8_formats = self._extract_m3u8_formats( + media['url'], video_id, 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + else: + formats.append({ + 'format_id': media_data['media-category']['@attributes']['label'], + 'url': media['url'], + 'tbr': int_or_none(media.get('bitrate')), + 'filesize': int_or_none(media.get('fileSize')), + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': get_media_node('title'), + 'description': get_media_node('description'), + 'thumbnails': thumbnails, + 'timestamp': parse_iso8601(item.get('pubDate'), ' '), + 'duration': int_or_none(media_content[0].get('@attributes', {}).get('duration')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index f68dc3236..62ed0c918 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -11,6 +11,7 @@ from ..utils import ( class AppleTrailersIE(InfoExtractor): + IE_NAME = 'appletrailers' _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)' _TESTS = [{ 'url': 'http://trailers.apple.com/trailers/wb/manofsteel/', @@ -64,6 +65,12 @@ class AppleTrailersIE(InfoExtractor): }, ] }, { + 'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/', + 'info_dict': { + 'id': 'blackthorn', + }, + 'playlist_mincount': 2, + }, { 'url': 'http://trailers.apple.com/ca/metropole/autrui/', 'only_matching': True, }] @@ -79,7 +86,7 @@ class AppleTrailersIE(InfoExtractor): def fix_html(s): s = re.sub(r'(?s)<script[^<]*?>.*?</script>', '', s) - s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s) + s = re.sub(r'<img ([^<]*?)/?>', r'<img \1/>', s) # The ' in the onClick attributes are not escaped, it couldn't be parsed # like: http://trailers.apple.com/trailers/wb/gravity/ @@ -96,6 +103,9 @@ class AppleTrailersIE(InfoExtractor): trailer_info_json = self._search_regex(self._JSON_RE, on_click, 'trailer info') trailer_info = json.loads(trailer_info_json) + first_url = trailer_info.get('url') + if not first_url: + continue title = trailer_info['title'] video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() thumbnail = li.find('.//img').attrib['src'] @@ -107,7 +117,6 @@ class AppleTrailersIE(InfoExtractor): if m: duration = 60 * int(m.group('minutes')) + int(m.group('seconds')) - first_url = trailer_info['url'] trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower() settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id) settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json') @@ -144,3 +153,76 @@ class AppleTrailersIE(InfoExtractor): 'id': movie, 'entries': playlist, } + + +class AppleTrailersSectionIE(InfoExtractor): + IE_NAME = 'appletrailers:section' + _SECTIONS = { + 'justadded': { + 'feed_path': 'just_added', + 'title': 'Just Added', + }, + 'exclusive': { + 'feed_path': 'exclusive', + 'title': 'Exclusive', + }, + 'justhd': { + 'feed_path': 'just_hd', + 'title': 'Just HD', + }, + 'mostpopular': { + 'feed_path': 'most_pop', + 'title': 'Most Popular', + }, + 'moviestudios': { + 'feed_path': 'studios', + 'title': 'Movie Studios', + }, + } + _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/#section=(?P<id>%s)' % '|'.join(_SECTIONS) + _TESTS = [{ + 'url': 'http://trailers.apple.com/#section=justadded', + 'info_dict': { + 'title': 'Just Added', + 'id': 'justadded', + }, + 'playlist_mincount': 80, + }, { + 'url': 'http://trailers.apple.com/#section=exclusive', + 'info_dict': { + 'title': 'Exclusive', + 'id': 'exclusive', + }, + 'playlist_mincount': 80, + }, { + 'url': 'http://trailers.apple.com/#section=justhd', + 'info_dict': { + 'title': 'Just HD', + 'id': 'justhd', + }, + 'playlist_mincount': 80, + }, { + 'url': 'http://trailers.apple.com/#section=mostpopular', + 'info_dict': { + 'title': 'Most Popular', + 'id': 'mostpopular', + }, + 'playlist_mincount': 80, + }, { + 'url': 'http://trailers.apple.com/#section=moviestudios', + 'info_dict': { + 'title': 'Movie Studios', + 'id': 'moviestudios', + }, + 'playlist_mincount': 80, + }] + + def _real_extract(self, url): + section = self._match_id(url) + section_data = self._download_json( + 'http://trailers.apple.com/trailers/home/feeds/%s.json' % self._SECTIONS[section]['feed_path'], + section) + entries = [ + self.url_result('http://trailers.apple.com' + e['location']) + for e in section_data] + return self.playlist_result(entries, section, self._SECTIONS[section]['title']) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 73be6d204..687eb9f82 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -110,13 +110,19 @@ class ARDMediathekIE(InfoExtractor): server = stream.get('_server') for stream_url in stream_urls: ext = determine_ext(stream_url) + if quality != 'auto' and ext in ('f4m', 'm3u8'): + continue if ext == 'f4m': - formats.extend(self._extract_f4m_formats( + f4m_formats = self._extract_f4m_formats( stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', - video_id, preference=-1, f4m_id='hds')) + video_id, preference=-1, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - stream_url, video_id, 'mp4', preference=1, m3u8_id='hls')) + m3u8_formats = self._extract_m3u8_formats( + stream_url, video_id, 'mp4', preference=1, m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) else: if server and server.startswith('rtmp'): f = { diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 2a00da3ee..10301a8ea 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -68,9 +68,13 @@ class ArteTVPlus7IE(InfoExtractor): def _extract_url_info(cls, url): mobj = re.match(cls._VALID_URL, url) lang = mobj.group('lang') - # This is not a real id, it can be for example AJT for the news - # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal - video_id = mobj.group('id') + query = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + if 'vid' in query: + video_id = query['vid'][0] + else: + # This is not a real id, it can be for example AJT for the news + # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal + video_id = mobj.group('id') return video_id, lang def _real_extract(self, url): @@ -79,9 +83,15 @@ class ArteTVPlus7IE(InfoExtractor): return self._extract_from_webpage(webpage, video_id, lang) def _extract_from_webpage(self, webpage, video_id, lang): + patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']') + ids = (video_id, '') + # some pages contain multiple videos (like + # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D), + # so we first try to look for json URLs that contain the video id from + # the 'vid' parameter. + patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates] json_url = self._html_search_regex( - [r'arte_vp_url=["\'](.*?)["\']', r'data-url=["\']([^"]+)["\']'], - webpage, 'json vp url', default=None) + patterns, webpage, 'json vp url', default=None) if not json_url: iframe_url = self._html_search_regex( r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1', diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index 50e47ba0a..7ac3044c7 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -2,6 +2,8 @@ from __future__ import unicode_literals import time import hmac +import hashlib +import re from .common import InfoExtractor from ..compat import ( @@ -32,6 +34,19 @@ class AtresPlayerIE(InfoExtractor): 'duration': 5527.6, 'thumbnail': 're:^https?://.*\.jpg$', }, + 'skip': 'This video is only available for registered users' + }, + { + 'url': 'http://www.atresplayer.com/television/especial/videoencuentros/temporada-1/capitulo-112-david-bustamante_2014121600375.html', + 'md5': '0d0e918533bbd4b263f2de4d197d4aac', + 'info_dict': { + 'id': 'capitulo-112-david-bustamante', + 'ext': 'flv', + 'title': 'David Bustamante', + 'description': 'md5:f33f1c0a05be57f6708d4dd83a3b81c6', + 'duration': 1439.0, + 'thumbnail': 're:^https?://.*\.jpg$', + }, }, { 'url': 'http://www.atresplayer.com/television/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_2014122400174.html', @@ -50,6 +65,13 @@ class AtresPlayerIE(InfoExtractor): _LOGIN_URL = 'https://servicios.atresplayer.com/j_spring_security_check' + _ERRORS = { + 'UNPUBLISHED': 'We\'re sorry, but this video is not yet available.', + 'DELETED': 'This video has expired and is no longer available for online streaming.', + 'GEOUNPUBLISHED': 'We\'re sorry, but this video is not available in your region due to right restrictions.', + # 'PREMIUM': 'PREMIUM', + } + def _real_initialize(self): self._login() @@ -83,58 +105,81 @@ class AtresPlayerIE(InfoExtractor): episode_id = self._search_regex( r'episode="([^"]+)"', webpage, 'episode id') + request = sanitized_Request( + self._PLAYER_URL_TEMPLATE % episode_id, + headers={'User-Agent': self._USER_AGENT}) + player = self._download_json(request, episode_id, 'Downloading player JSON') + + episode_type = player.get('typeOfEpisode') + error_message = self._ERRORS.get(episode_type) + if error_message: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error_message), expected=True) + + formats = [] + video_url = player.get('urlVideo') + if video_url: + format_info = { + 'url': video_url, + 'format_id': 'http', + } + mobj = re.search(r'(?P<bitrate>\d+)K_(?P<width>\d+)x(?P<height>\d+)', video_url) + if mobj: + format_info.update({ + 'width': int_or_none(mobj.group('width')), + 'height': int_or_none(mobj.group('height')), + 'tbr': int_or_none(mobj.group('bitrate')), + }) + formats.append(format_info) + + m3u8_url = player.get('urlVideoHls') + if m3u8_url: + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, episode_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + timestamp = int_or_none(self._download_webpage( self._TIME_API_URL, video_id, 'Downloading timestamp', fatal=False), 1000, time.time()) timestamp_shifted = compat_str(timestamp + self._TIMESTAMP_SHIFT) token = hmac.new( self._MAGIC.encode('ascii'), - (episode_id + timestamp_shifted).encode('utf-8') + (episode_id + timestamp_shifted).encode('utf-8'), hashlib.md5 ).hexdigest() - formats = [] - for fmt in ['windows', 'android_tablet']: - request = sanitized_Request( - self._URL_VIDEO_TEMPLATE.format(fmt, episode_id, timestamp_shifted, token)) - request.add_header('User-Agent', self._USER_AGENT) - - fmt_json = self._download_json( - request, video_id, 'Downloading %s video JSON' % fmt) - - result = fmt_json.get('resultDes') - if result.lower() != 'ok': - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, result), expected=True) - - for format_id, video_url in fmt_json['resultObject'].items(): - if format_id == 'token' or not video_url.startswith('http'): - continue - if video_url.endswith('/Manifest'): - if 'geodeswowsmpra3player' in video_url: - f4m_path = video_url.split('smil:', 1)[-1].split('free_', 1)[0] - f4m_url = 'http://drg.antena3.com/{0}hds/es/sd.f4m'.format(f4m_path) - # this videos are protected by DRM, the f4m downloader doesn't support them - continue - else: - f4m_url = video_url[:-9] + '/manifest.f4m' - formats.extend(self._extract_f4m_formats(f4m_url, video_id)) - else: - formats.append({ - 'url': video_url, - 'format_id': 'android-%s' % format_id, - 'preference': 1, - }) - self._sort_formats(formats) + request = sanitized_Request( + self._URL_VIDEO_TEMPLATE.format('windows', episode_id, timestamp_shifted, token), + headers={'User-Agent': self._USER_AGENT}) - player = self._download_json( - self._PLAYER_URL_TEMPLATE % episode_id, - episode_id) + fmt_json = self._download_json( + request, video_id, 'Downloading windows video JSON') + + result = fmt_json.get('resultDes') + if result.lower() != 'ok': + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, result), expected=True) + + for format_id, video_url in fmt_json['resultObject'].items(): + if format_id == 'token' or not video_url.startswith('http'): + continue + if 'geodeswowsmpra3player' in video_url: + f4m_path = video_url.split('smil:', 1)[-1].split('free_', 1)[0] + f4m_url = 'http://drg.antena3.com/{0}hds/es/sd.f4m'.format(f4m_path) + # this videos are protected by DRM, the f4m downloader doesn't support them + continue + else: + f4m_url = video_url[:-9] + '/manifest.f4m' + f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + self._sort_formats(formats) path_data = player.get('pathData') episode = self._download_xml( - self._EPISODE_URL_TEMPLATE % path_data, - video_id, 'Downloading episode XML') + self._EPISODE_URL_TEMPLATE % path_data, video_id, + 'Downloading episode XML') duration = float_or_none(xpath_text( episode, './media/asset/info/technical/contentDuration', 'duration')) diff --git a/youtube_dl/extractor/audimedia.py b/youtube_dl/extractor/audimedia.py index b0b089dee..4382a302b 100644 --- a/youtube_dl/extractor/audimedia.py +++ b/youtube_dl/extractor/audimedia.py @@ -15,7 +15,7 @@ class AudiMediaIE(InfoExtractor): 'url': 'https://audimedia.tv/en/vid/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test', 'md5': '79a8b71c46d49042609795ab59779b66', 'info_dict': { - 'id': '1564', + 'id': '1565', 'ext': 'mp4', 'title': '60 Seconds of Audi Sport 104/2015 - WEC Bahrain, Rookie Test', 'description': 'md5:60e5d30a78ced725f7b8d34370762941', diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py index 693ba22c6..3eed91279 100644 --- a/youtube_dl/extractor/audiomack.py +++ b/youtube_dl/extractor/audiomack.py @@ -56,7 +56,7 @@ class AudiomackIE(InfoExtractor): # API is inconsistent with errors if 'url' not in api_response or not api_response['url'] or 'error' in api_response: - raise ExtractorError('Invalid url %s', url) + raise ExtractorError('Invalid url %s' % url) # Audiomack wraps a lot of soundcloud tracks in their branded wrapper # if so, pass the work off to the soundcloud extractor diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index d89e34ba0..691aecc0d 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -733,6 +733,7 @@ class BBCIE(BBCCoUkIE): # article with multiple videos embedded with playlist.sxml (e.g. # http://www.bbc.com/sport/0/football/34475836) playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage) + playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage)) if playlists: entries = [ self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp) diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index e63c2ac00..c8d921daf 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -34,15 +34,29 @@ class BeegIE(InfoExtractor): video_id = self._match_id(url) video = self._download_json( - 'http://beeg.com/api/v3/video/%s' % video_id, video_id) + 'http://beeg.com/api/v5/video/%s' % video_id, video_id) + + def split(o, e): + def cut(s, x): + n.append(s[:x]) + return s[x:] + n = [] + r = len(o) % e + if r > 0: + o = cut(o, r) + while len(o) > e: + o = cut(o, e) + n.append(o) + return n def decrypt_key(key): - # Reverse engineered from http://static.beeg.com/cpl/1067.js - a = '8RPUUCS35ZWp3ADnKcSmpH71ZusrROo' + # Reverse engineered from http://static.beeg.com/cpl/1105.js + a = '5ShMcIQlssOd7zChAIOlmeTZDaUxULbJRnywYaiB' e = compat_urllib_parse_unquote(key) - return ''.join([ - compat_chr(compat_ord(e[n]) - compat_ord(a[n % len(a)]) % 25) + o = ''.join([ + compat_chr(compat_ord(e[n]) - compat_ord(a[n % len(a)]) % 21) for n in range(len(e))]) + return ''.join(split(o, 3)[::-1]) def decrypt_url(encrypted_url): encrypted_url = self._proto_relative_url( diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py new file mode 100644 index 000000000..38bda3af5 --- /dev/null +++ b/youtube_dl/extractor/bleacherreport.py @@ -0,0 +1,106 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .amp import AMPIE +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, +) + + +class BleacherReportIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/articles/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://bleacherreport.com/articles/2496438-fsu-stat-projections-is-jalen-ramsey-best-defensive-player-in-college-football', + 'md5': 'a3ffc3dc73afdbc2010f02d98f990f20', + 'info_dict': { + 'id': '2496438', + 'ext': 'mp4', + 'title': 'FSU Stat Projections: Is Jalen Ramsey Best Defensive Player in College Football?', + 'uploader_id': 3992341, + 'description': 'CFB, ACC, Florida State', + 'timestamp': 1434380212, + 'upload_date': '20150615', + 'uploader': 'Team Stream Now ', + }, + 'add_ie': ['Ooyala'], + }, { + 'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo', + 'md5': 'af5f90dc9c7ba1c19d0a3eac806bbf50', + 'info_dict': { + 'id': '2586817', + 'ext': 'mp4', + 'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo', + 'timestamp': 1446839961, + 'uploader': 'Sean Fay', + 'description': 'md5:825e94e0f3521df52fa83b2ed198fa20', + 'uploader_id': 6466954, + 'upload_date': '20151011', + }, + 'add_ie': ['Youtube'], + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + + article_data = self._download_json('http://api.bleacherreport.com/api/v1/articles/%s' % article_id, article_id)['article'] + + thumbnails = [] + primary_photo = article_data.get('primaryPhoto') + if primary_photo: + thumbnails = [{ + 'url': primary_photo['url'], + 'width': primary_photo.get('width'), + 'height': primary_photo.get('height'), + }] + + info = { + '_type': 'url_transparent', + 'id': article_id, + 'title': article_data['title'], + 'uploader': article_data.get('author', {}).get('name'), + 'uploader_id': article_data.get('authorId'), + 'timestamp': parse_iso8601(article_data.get('createdAt')), + 'thumbnails': thumbnails, + 'comment_count': int_or_none(article_data.get('commentsCount')), + 'view_count': int_or_none(article_data.get('hitCount')), + } + + video = article_data.get('video') + if video: + video_type = video['type'] + if video_type == 'cms.bleacherreport.com': + info['url'] = 'http://bleacherreport.com/video_embed?id=%s' % video['id'] + elif video_type == 'ooyala.com': + info['url'] = 'ooyala:%s' % video['id'] + elif video_type == 'youtube.com': + info['url'] = video['id'] + elif video_type == 'vine.co': + info['url'] = 'https://vine.co/v/%s' % video['id'] + else: + info['url'] = video_type + video['id'] + return info + else: + raise ExtractorError('no video in the article', expected=True) + + +class BleacherReportCMSIE(AMPIE): + _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36})' + _TESTS = [{ + 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1', + 'md5': '8c2c12e3af7805152675446c905d159b', + 'info_dict': { + 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', + 'ext': 'flv', + 'title': 'Cena vs. Rollins Would Expose the Heavyweight Division', + 'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + info = self._extract_feed_info('http://cms.bleacherreport.com/media/items/%s/akamai.json' % video_id) + info['id'] = video_id + return info diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py deleted file mode 100644 index 35375f7b1..000000000 --- a/youtube_dl/extractor/bliptv.py +++ /dev/null @@ -1,290 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - -from ..compat import compat_urlparse -from ..utils import ( - clean_html, - int_or_none, - parse_iso8601, - sanitized_Request, - unescapeHTML, - xpath_text, - xpath_with_ns, -) - - -class BlipTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P<id>\d+)|((?:play/|api\.swf#)(?P<lookup_id>[\da-zA-Z+_]+)))' - - _TESTS = [ - { - 'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352', - 'md5': '80baf1ec5c3d2019037c1c707d676b9f', - 'info_dict': { - 'id': '5779306', - 'ext': 'm4v', - 'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3', - 'description': 'md5:9bc31f227219cde65e47eeec8d2dc596', - 'timestamp': 1323138843, - 'upload_date': '20111206', - 'uploader': 'cbr', - 'uploader_id': '679425', - 'duration': 81, - } - }, - { - # https://github.com/rg3/youtube-dl/pull/2274 - 'note': 'Video with subtitles', - 'url': 'http://blip.tv/play/h6Uag5OEVgI.html', - 'md5': '309f9d25b820b086ca163ffac8031806', - 'info_dict': { - 'id': '6586561', - 'ext': 'mp4', - 'title': 'Red vs. Blue Season 11 Episode 1', - 'description': 'One-Zero-One', - 'timestamp': 1371261608, - 'upload_date': '20130615', - 'uploader': 'redvsblue', - 'uploader_id': '792887', - 'duration': 279, - } - }, - { - # https://bugzilla.redhat.com/show_bug.cgi?id=967465 - 'url': 'http://a.blip.tv/api.swf#h6Uag5KbVwI', - 'md5': '314e87b1ebe7a48fcbfdd51b791ce5a6', - 'info_dict': { - 'id': '6573122', - 'ext': 'mov', - 'upload_date': '20130520', - 'description': 'Two hapless space marines argue over what to do when they realize they have an astronomically huge problem on their hands.', - 'title': 'Red vs. Blue Season 11 Trailer', - 'timestamp': 1369029609, - 'uploader': 'redvsblue', - 'uploader_id': '792887', - } - }, - { - 'url': 'http://blip.tv/play/gbk766dkj4Yn', - 'md5': 'fe0a33f022d49399a241e84a8ea8b8e3', - 'info_dict': { - 'id': '1749452', - 'ext': 'mp4', - 'upload_date': '20090208', - 'description': 'Witness the first appearance of the Nostalgia Critic character, as Doug reviews the movie Transformers.', - 'title': 'Nostalgia Critic: Transformers', - 'timestamp': 1234068723, - 'uploader': 'NostalgiaCritic', - 'uploader_id': '246467', - } - }, - { - # https://github.com/rg3/youtube-dl/pull/4404 - 'note': 'Audio only', - 'url': 'http://blip.tv/hilarios-productions/weekly-manga-recap-kingdom-7119982', - 'md5': '76c0a56f24e769ceaab21fbb6416a351', - 'info_dict': { - 'id': '7103299', - 'ext': 'flv', - 'title': 'Weekly Manga Recap: Kingdom', - 'description': 'And then Shin breaks the enemy line, and he's all like HWAH! And then he slices a guy and it's all like FWASHING! And... it's really hard to describe the best parts of this series without breaking down into sound effects, okay?', - 'timestamp': 1417660321, - 'upload_date': '20141204', - 'uploader': 'The Rollo T', - 'uploader_id': '407429', - 'duration': 7251, - 'vcodec': 'none', - } - }, - { - # missing duration - 'url': 'http://blip.tv/rss/flash/6700880', - 'info_dict': { - 'id': '6684191', - 'ext': 'm4v', - 'title': 'Cowboy Bebop: Gateway Shuffle Review', - 'description': 'md5:3acc480c0f9ae157f5fe88547ecaf3f8', - 'timestamp': 1386639757, - 'upload_date': '20131210', - 'uploader': 'sfdebris', - 'uploader_id': '706520', - } - } - ] - - @staticmethod - def _extract_url(webpage): - mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage) - if mobj: - return 'http://blip.tv/a/a-' + mobj.group(1) - mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage) - if mobj: - return mobj.group(1) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - lookup_id = mobj.group('lookup_id') - - # See https://github.com/rg3/youtube-dl/issues/857 and - # https://github.com/rg3/youtube-dl/issues/4197 - if lookup_id: - urlh = self._request_webpage( - 'http://blip.tv/play/%s' % lookup_id, lookup_id, 'Resolving lookup id') - url = compat_urlparse.urlparse(urlh.geturl()) - qs = compat_urlparse.parse_qs(url.query) - mobj = re.match(self._VALID_URL, qs['file'][0]) - - video_id = mobj.group('id') - - rss = self._download_xml('http://blip.tv/rss/flash/%s' % video_id, video_id, 'Downloading video RSS') - - def _x(p): - return xpath_with_ns(p, { - 'blip': 'http://blip.tv/dtd/blip/1.0', - 'media': 'http://search.yahoo.com/mrss/', - 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', - }) - - item = rss.find('channel/item') - - video_id = xpath_text(item, _x('blip:item_id'), 'video id') or lookup_id - title = xpath_text(item, 'title', 'title', fatal=True) - description = clean_html(xpath_text(item, _x('blip:puredescription'), 'description')) - timestamp = parse_iso8601(xpath_text(item, _x('blip:datestamp'), 'timestamp')) - uploader = xpath_text(item, _x('blip:user'), 'uploader') - uploader_id = xpath_text(item, _x('blip:userid'), 'uploader id') - duration = int_or_none(xpath_text(item, _x('blip:runtime'), 'duration')) - media_thumbnail = item.find(_x('media:thumbnail')) - thumbnail = (media_thumbnail.get('url') if media_thumbnail is not None - else xpath_text(item, 'image', 'thumbnail')) - categories = [category.text for category in item.findall('category') if category is not None] - - formats = [] - subtitles_urls = {} - - media_group = item.find(_x('media:group')) - for media_content in media_group.findall(_x('media:content')): - url = media_content.get('url') - role = media_content.get(_x('blip:role')) - msg = self._download_webpage( - url + '?showplayer=20140425131715&referrer=http://blip.tv&mask=7&skin=flashvars&view=url', - video_id, 'Resolving URL for %s' % role) - real_url = compat_urlparse.parse_qs(msg.strip())['message'][0] - - media_type = media_content.get('type') - if media_type == 'text/srt' or url.endswith('.srt'): - LANGS = { - 'english': 'en', - } - lang = role.rpartition('-')[-1].strip().lower() - langcode = LANGS.get(lang, lang) - subtitles_urls[langcode] = url - elif media_type.startswith('video/'): - formats.append({ - 'url': real_url, - 'format_id': role, - 'format_note': media_type, - 'vcodec': media_content.get(_x('blip:vcodec')) or 'none', - 'acodec': media_content.get(_x('blip:acodec')), - 'filesize': media_content.get('filesize'), - 'width': int_or_none(media_content.get('width')), - 'height': int_or_none(media_content.get('height')), - }) - self._check_formats(formats, video_id) - self._sort_formats(formats) - - subtitles = self.extract_subtitles(video_id, subtitles_urls) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'timestamp': timestamp, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'duration': duration, - 'thumbnail': thumbnail, - 'categories': categories, - 'formats': formats, - 'subtitles': subtitles, - } - - def _get_subtitles(self, video_id, subtitles_urls): - subtitles = {} - for lang, url in subtitles_urls.items(): - # For some weird reason, blip.tv serves a video instead of subtitles - # when we request with a common UA - req = sanitized_Request(url) - req.add_header('User-Agent', 'youtube-dl') - subtitles[lang] = [{ - # The extension is 'srt' but it's actually an 'ass' file - 'ext': 'ass', - 'data': self._download_webpage(req, None, note=False), - }] - return subtitles - - -class BlipTVUserIE(InfoExtractor): - _VALID_URL = r'(?:(?:https?://(?:\w+\.)?blip\.tv/)|bliptvuser:)(?!api\.swf)([^/]+)/*$' - _PAGE_SIZE = 12 - IE_NAME = 'blip.tv:user' - _TEST = { - 'url': 'http://blip.tv/actone', - 'info_dict': { - 'id': 'actone', - 'title': 'Act One: The Series', - }, - 'playlist_count': 5, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - username = mobj.group(1) - - page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1' - - page = self._download_webpage(url, username, 'Downloading user page') - mobj = re.search(r'data-users-id="([^"]+)"', page) - page_base = page_base % mobj.group(1) - title = self._og_search_title(page) - - # Download video ids using BlipTV Ajax calls. Result size per - # query is limited (currently to 12 videos) so we need to query - # page by page until there are no video ids - it means we got - # all of them. - - video_ids = [] - pagenum = 1 - - while True: - url = page_base + "&page=" + str(pagenum) - page = self._download_webpage( - url, username, 'Downloading video ids from page %d' % pagenum) - - # Extract video identifiers - ids_in_page = [] - - for mobj in re.finditer(r'href="/([^"]+)"', page): - if mobj.group(1) not in ids_in_page: - ids_in_page.append(unescapeHTML(mobj.group(1))) - - video_ids.extend(ids_in_page) - - # A little optimization - if current page is not - # "full", ie. does not contain PAGE_SIZE video ids then - # we can assume that this page is the last one - there - # are no more ids on further pages - no need to query - # again. - - if len(ids_in_page) < self._PAGE_SIZE: - break - - pagenum += 1 - - urls = ['http://blip.tv/%s' % video_id for video_id in video_ids] - url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls] - return self.playlist_result( - url_entries, playlist_title=title, playlist_id=username) diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 66e394e10..e66854538 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -1,18 +1,21 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, parse_duration, + xpath_element, + xpath_text, ) class BRIE(InfoExtractor): IE_DESC = 'Bayerischer Rundfunk Mediathek' - _VALID_URL = r'https?://(?:www\.)?br\.de/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html' - _BASE_URL = 'http://www.br.de' + _VALID_URL = r'(?P<base_url>https?://(?:www\.)?br(?:-klassik)?\.de)/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html' _TESTS = [ { @@ -22,7 +25,7 @@ class BRIE(InfoExtractor): 'id': '48f656ef-287e-486f-be86-459122db22cc', 'ext': 'mp4', 'title': 'Die böse Überraschung', - 'description': 'Betriebliche Altersvorsorge: Die böse Überraschung', + 'description': 'md5:ce9ac81b466ce775b8018f6801b48ac9', 'duration': 180, 'uploader': 'Reinhard Weber', 'upload_date': '20150422', @@ -30,23 +33,23 @@ class BRIE(InfoExtractor): }, { 'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html', - 'md5': 'a44396d73ab6a68a69a568fae10705bb', + 'md5': 'af3a3a4aa43ff0ce6a89504c67f427ef', 'info_dict': { 'id': 'a4b83e34-123d-4b81-9f4e-c0d3121a4e05', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Manfred Schreiber ist tot', - 'description': 'Abendschau kompakt: Manfred Schreiber ist tot', + 'description': 'md5:b454d867f2a9fc524ebe88c3f5092d97', 'duration': 26, } }, { - 'url': 'http://www.br.de/radio/br-klassik/sendungen/allegro/premiere-urauffuehrung-the-land-2015-dance-festival-muenchen-100.html', + 'url': 'https://www.br-klassik.de/audio/peeping-tom-premierenkritik-dance-festival-muenchen-100.html', 'md5': '8b5b27c0b090f3b35eac4ab3f7a73d3d', 'info_dict': { 'id': '74c603c9-26d3-48bb-b85b-079aeed66e0b', 'ext': 'aac', 'title': 'Kurzweilig und sehr bewegend', - 'description': '"The Land" von Peeping Tom: Kurzweilig und sehr bewegend', + 'description': 'md5:0351996e3283d64adeb38ede91fac54e', 'duration': 296, } }, @@ -57,7 +60,7 @@ class BRIE(InfoExtractor): 'id': '6ba73750-d405-45d3-861d-1ce8c524e059', 'ext': 'mp4', 'title': 'Umweltbewusster Häuslebauer', - 'description': 'Uwe Erdelt: Umweltbewusster Häuslebauer', + 'description': 'md5:d52dae9792d00226348c1dbb13c9bae2', 'duration': 116, } }, @@ -68,7 +71,7 @@ class BRIE(InfoExtractor): 'id': 'd982c9ce-8648-4753-b358-98abb8aec43d', 'ext': 'mp4', 'title': 'Folge 1 - Metaphysik', - 'description': 'Kant für Anfänger: Folge 1 - Metaphysik', + 'description': 'md5:bb659990e9e59905c3d41e369db1fbe3', 'duration': 893, 'uploader': 'Eva Maria Steimle', 'upload_date': '20140117', @@ -77,28 +80,31 @@ class BRIE(InfoExtractor): ] def _real_extract(self, url): - display_id = self._match_id(url) + base_url, display_id = re.search(self._VALID_URL, url).groups() page = self._download_webpage(url, display_id) xml_url = self._search_regex( r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL') - xml = self._download_xml(self._BASE_URL + xml_url, None) + xml = self._download_xml(base_url + xml_url, display_id) medias = [] for xml_media in xml.findall('video') + xml.findall('audio'): + media_id = xml_media.get('externalId') media = { - 'id': xml_media.get('externalId'), - 'title': xml_media.find('title').text, - 'duration': parse_duration(xml_media.find('duration').text), - 'formats': self._extract_formats(xml_media.find('assets')), - 'thumbnails': self._extract_thumbnails(xml_media.find('teaserImage/variants')), - 'description': ' '.join(xml_media.find('shareTitle').text.splitlines()), - 'webpage_url': xml_media.find('permalink').text + 'id': media_id, + 'title': xpath_text(xml_media, 'title', 'title', True), + 'duration': parse_duration(xpath_text(xml_media, 'duration')), + 'formats': self._extract_formats(xpath_element( + xml_media, 'assets'), media_id), + 'thumbnails': self._extract_thumbnails(xpath_element( + xml_media, 'teaserImage/variants'), base_url), + 'description': xpath_text(xml_media, 'desc'), + 'webpage_url': xpath_text(xml_media, 'permalink'), + 'uploader': xpath_text(xml_media, 'author'), } - if xml_media.find('author').text: - media['uploader'] = xml_media.find('author').text - if xml_media.find('broadcastDate').text: - media['upload_date'] = ''.join(reversed(xml_media.find('broadcastDate').text.split('.'))) + broadcast_date = xpath_text(xml_media, 'broadcastDate') + if broadcast_date: + media['upload_date'] = ''.join(reversed(broadcast_date.split('.'))) medias.append(media) if len(medias) > 1: @@ -109,35 +115,58 @@ class BRIE(InfoExtractor): raise ExtractorError('No media entries found') return medias[0] - def _extract_formats(self, assets): - - def text_or_none(asset, tag): - elem = asset.find(tag) - return None if elem is None else elem.text - - formats = [{ - 'url': text_or_none(asset, 'downloadUrl'), - 'ext': text_or_none(asset, 'mediaType'), - 'format_id': asset.get('type'), - 'width': int_or_none(text_or_none(asset, 'frameWidth')), - 'height': int_or_none(text_or_none(asset, 'frameHeight')), - 'tbr': int_or_none(text_or_none(asset, 'bitrateVideo')), - 'abr': int_or_none(text_or_none(asset, 'bitrateAudio')), - 'vcodec': text_or_none(asset, 'codecVideo'), - 'acodec': text_or_none(asset, 'codecAudio'), - 'container': text_or_none(asset, 'mediaType'), - 'filesize': int_or_none(text_or_none(asset, 'size')), - } for asset in assets.findall('asset') - if asset.find('downloadUrl') is not None] - + def _extract_formats(self, assets, media_id): + formats = [] + for asset in assets.findall('asset'): + format_url = xpath_text(asset, ['downloadUrl', 'url']) + asset_type = asset.get('type') + if asset_type == 'HDS': + f4m_formats = self._extract_f4m_formats( + format_url + '?hdcore=3.2.0', media_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + elif asset_type == 'HLS': + m3u8_formats = self._extract_m3u8_formats( + format_url, media_id, 'mp4', 'm3u8_native', m3u8_id='hds', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + else: + format_info = { + 'ext': xpath_text(asset, 'mediaType'), + 'width': int_or_none(xpath_text(asset, 'frameWidth')), + 'height': int_or_none(xpath_text(asset, 'frameHeight')), + 'tbr': int_or_none(xpath_text(asset, 'bitrateVideo')), + 'abr': int_or_none(xpath_text(asset, 'bitrateAudio')), + 'vcodec': xpath_text(asset, 'codecVideo'), + 'acodec': xpath_text(asset, 'codecAudio'), + 'container': xpath_text(asset, 'mediaType'), + 'filesize': int_or_none(xpath_text(asset, 'size')), + } + format_url = self._proto_relative_url(format_url) + if format_url: + http_format_info = format_info.copy() + http_format_info.update({ + 'url': format_url, + 'format_id': 'http-%s' % asset_type, + }) + formats.append(http_format_info) + server_prefix = xpath_text(asset, 'serverPrefix') + if server_prefix: + rtmp_format_info = format_info.copy() + rtmp_format_info.update({ + 'url': server_prefix, + 'play_path': xpath_text(asset, 'fileName'), + 'format_id': 'rtmp-%s' % asset_type, + }) + formats.append(rtmp_format_info) self._sort_formats(formats) return formats - def _extract_thumbnails(self, variants): + def _extract_thumbnails(self, variants, base_url): thumbnails = [{ - 'url': self._BASE_URL + variant.find('url').text, - 'width': int_or_none(variant.find('width').text), - 'height': int_or_none(variant.find('height').text), - } for variant in variants.findall('variant')] + 'url': base_url + xpath_text(variant, 'url'), + 'width': int_or_none(xpath_text(variant, 'width')), + 'height': int_or_none(xpath_text(variant, 'height')), + } for variant in variants.findall('variant') if xpath_text(variant, 'url')] thumbnails.sort(key=lambda x: x['width'] * x['height'], reverse=True) return thumbnails diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index f5ebae1e6..03a4f446e 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -355,7 +355,7 @@ class BrightcoveLegacyIE(InfoExtractor): class BrightcoveNewIE(InfoExtractor): IE_NAME = 'brightcove:new' - _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>\d+)' + _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>(?:ref:)?\d+)' _TESTS = [{ 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', 'md5': 'c8100925723840d4b0d243f7025703be', @@ -387,14 +387,24 @@ class BrightcoveNewIE(InfoExtractor): 'params': { 'skip_download': True, } + }, { + # ref: prefixed video id + 'url': 'http://players.brightcove.net/3910869709001/21519b5c-4b3b-4363-accb-bdc8f358f823_default/index.html?videoId=ref:7069442', + 'only_matching': True, }] @staticmethod + def _extract_url(webpage): + urls = BrightcoveNewIE._extract_urls(webpage) + return urls[0] if urls else None + + @staticmethod def _extract_urls(webpage): # Reference: # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe - # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript) + # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/embed-in-page.html + # 4. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player entries = [] @@ -407,9 +417,10 @@ class BrightcoveNewIE(InfoExtractor): for video_id, account_id, player_id, embed in re.findall( # According to examples from [3] it's unclear whether video id # may be optional and what to do when it is + # According to [4] data-video-id may be prefixed with ref: r'''(?sx) <video[^>]+ - data-video-id=["\'](\d+)["\'][^>]*>.*? + data-video-id=["\']((?:ref:)?\d+)["\'][^>]*>.*? </video>.*? <script[^>]+ src=["\'](?:https?:)?//players\.brightcove\.net/ diff --git a/youtube_dl/extractor/canal13cl.py b/youtube_dl/extractor/canal13cl.py deleted file mode 100644 index 93241fefe..000000000 --- a/youtube_dl/extractor/canal13cl.py +++ /dev/null @@ -1,48 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class Canal13clIE(InfoExtractor): - _VALID_URL = r'^http://(?:www\.)?13\.cl/(?:[^/?#]+/)*(?P<id>[^/?#]+)' - _TEST = { - 'url': 'http://www.13.cl/t13/nacional/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', - 'md5': '4cb1fa38adcad8fea88487a078831755', - 'info_dict': { - 'id': '1403022125', - 'display_id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', - 'ext': 'mp4', - 'title': 'El "círculo de hierro" de Michelle Bachelet en su regreso a La Moneda', - 'description': '(Foto: Agencia Uno) En nueve días más, Michelle Bachelet va a asumir por segunda vez como presidenta de la República. Entre aquellos que la acompañarán hay caras que se repiten y otras que se consolidan en su entorno de colaboradores más cercanos.', - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('id') - - webpage = self._download_webpage(url, display_id) - - title = self._html_search_meta( - 'twitter:title', webpage, 'title', fatal=True) - description = self._html_search_meta( - 'twitter:description', webpage, 'description') - url = self._html_search_regex( - r'articuloVideo = \"(.*?)\"', webpage, 'url') - real_id = self._search_regex( - r'[^0-9]([0-9]{7,})[^0-9]', url, 'id', default=display_id) - thumbnail = self._html_search_regex( - r'articuloImagen = \"(.*?)\"', webpage, 'thumbnail') - - return { - 'id': real_id, - 'display_id': display_id, - 'url': url, - 'title': title, - 'description': description, - 'ext': 'mp4', - 'thumbnail': thumbnail, - } diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py index 0b67ba67d..242fba311 100644 --- a/youtube_dl/extractor/chaturbate.py +++ b/youtube_dl/extractor/chaturbate.py @@ -23,6 +23,8 @@ class ChaturbateIE(InfoExtractor): 'only_matching': True, }] + _ROOM_OFFLINE = 'Room is currently offline' + def _real_extract(self, url): video_id = self._match_id(url) @@ -34,9 +36,16 @@ class ChaturbateIE(InfoExtractor): if not m3u8_url: error = self._search_regex( - r'<span[^>]+class=(["\'])desc_span\1[^>]*>(?P<error>[^<]+)</span>', - webpage, 'error', group='error') - raise ExtractorError(error, expected=True) + [r'<span[^>]+class=(["\'])desc_span\1[^>]*>(?P<error>[^<]+)</span>', + r'<div[^>]+id=(["\'])defchat\1[^>]*>\s*<p><strong>(?P<error>[^<]+)<'], + webpage, 'error', group='error', default=None) + if not error: + if any(p not in webpage for p in ( + self._ROOM_OFFLINE, 'offline_tipping', 'tip_offline')): + error = self._ROOM_OFFLINE + if error: + raise ExtractorError(error, expected=True) + raise ExtractorError('Unable to find stream URL') formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index fd1770dac..6d9cd8abd 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..utils import ExtractorError -from .bliptv import BlipTVIE from .screenwavemedia import ScreenwaveMediaIE @@ -34,18 +33,17 @@ class CinemassacreIE(InfoExtractor): }, }, { - # blip.tv embedded video + # Youtube embedded video 'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/', - 'md5': 'ca9b3c8dd5a66f9375daeb5135f5a3de', + 'md5': 'df4cf8a1dcedaec79a73d96d83b99023', 'info_dict': { - 'id': '4065369', - 'ext': 'flv', + 'id': 'OEVzPCY2T-g', + 'ext': 'mp4', 'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles', 'upload_date': '20061207', - 'uploader': 'cinemassacre', - 'uploader_id': '250778', - 'timestamp': 1283233867, - 'description': 'md5:0a108c78d130676b207d0f6d029ecffd', + 'uploader': 'Cinemassacre', + 'uploader_id': 'JamesNintendoNerd', + 'description': 'md5:784734696c2b8b7f4b8625cc799e07f6', } }, { @@ -89,8 +87,6 @@ class CinemassacreIE(InfoExtractor): ], webpage, 'player data URL', default=None, group='url') if not playerdata_url: - playerdata_url = BlipTVIE._extract_url(webpage) - if not playerdata_url: raise ExtractorError('Unable to find player data') video_title = self._html_search_regex( diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index d46592cc5..2996b6b09 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -1,7 +1,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import determine_ext +from ..utils import int_or_none _translation_table = { @@ -42,31 +42,26 @@ class CliphunterIE(InfoExtractor): video_title = self._search_regex( r'mediaTitle = "([^"]+)"', webpage, 'title') - fmts = {} - for fmt in ('mp4', 'flv'): - fmt_list = self._parse_json(self._search_regex( - r'var %sjson\s*=\s*(\[.*?\]);' % fmt, webpage, '%s formats' % fmt), video_id) - for f in fmt_list: - fmts[f['fname']] = _decode(f['sUrl']) - - qualities = self._parse_json(self._search_regex( - r'var player_btns\s*=\s*(.*?);\n', webpage, 'quality info'), video_id) + gexo_files = self._parse_json( + self._search_regex( + r'var\s+gexoFiles\s*=\s*({.+?});', webpage, 'gexo files'), + video_id) formats = [] - for fname, url in fmts.items(): - f = { - 'url': url, - } - if fname in qualities: - qual = qualities[fname] - f.update({ - 'format_id': '%s_%sp' % (determine_ext(url), qual['h']), - 'width': qual['w'], - 'height': qual['h'], - 'tbr': qual['br'], - }) - formats.append(f) - + for format_id, f in gexo_files.items(): + video_url = f.get('url') + if not video_url: + continue + fmt = f.get('fmt') + height = f.get('h') + format_id = '%s_%sp' % (fmt, height) if fmt and height else format_id + formats.append({ + 'url': _decode(video_url), + 'format_id': format_id, + 'width': int_or_none(f.get('w')), + 'height': int_or_none(height), + 'tbr': int_or_none(f.get('br')), + }) self._sort_formats(formats) thumbnail = self._search_regex( diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index 5dd69bff7..5c3908f72 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cnet.py @@ -1,15 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -import json +from .theplatform import ThePlatformIE +from ..utils import int_or_none -from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) - -class CNETIE(InfoExtractor): +class CNETIE(ThePlatformIE): _VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/' _TESTS = [{ 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', @@ -18,25 +14,20 @@ class CNETIE(InfoExtractor): 'ext': 'flv', 'title': 'Hands-on with Microsoft Windows 8.1 Update', 'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.', - 'thumbnail': 're:^http://.*/flmswindows8.jpg$', 'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861', 'uploader': 'Sarah Mitroff', + 'duration': 70, }, - 'params': { - 'skip_download': 'requires rtmpdump', - } }, { 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', 'info_dict': { 'id': '56527b93-d25d-44e3-b738-f989ce2e49ba', 'ext': 'flv', + 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', 'description': 'Khail and Ashley wonder what other civic woes can be solved by self-tweeting objects, investigate a new kind of VR camera and watch an origami robot self-assemble, walk, climb, dig and dissolve. #TDPothole', 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', 'uploader': 'Ashley Esqueda', - 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', - }, - 'params': { - 'skip_download': True, # requires rtmpdump + 'duration': 1482, }, }] @@ -45,26 +36,13 @@ class CNETIE(InfoExtractor): webpage = self._download_webpage(url, display_id) data_json = self._html_search_regex( - r"<div class=\"cnetVideoPlayer\"\s+.*?data-cnet-video-options='([^']+)'", + r"data-cnet-video(?:-uvp)?-options='([^']+)'", webpage, 'data json') - data = json.loads(data_json) - vdata = data['video'] - if not vdata: - vdata = data['videos'][0] - if not vdata: - raise ExtractorError('Cannot find video data') - - mpx_account = data['config']['players']['default']['mpx_account'] - vid = vdata['files'].get('rtmp', vdata['files']['hds']) - tp_link = 'http://link.theplatform.com/s/%s/%s' % (mpx_account, vid) + data = self._parse_json(data_json, display_id) + vdata = data.get('video') or data['videos'][0] video_id = vdata['id'] - title = vdata.get('headline') - if title is None: - title = vdata.get('title') - if title is None: - raise ExtractorError('Cannot find title!') - thumbnail = vdata.get('image', {}).get('path') + title = vdata['title'] author = vdata.get('author') if author: uploader = '%s %s' % (author['firstName'], author['lastName']) @@ -73,13 +51,34 @@ class CNETIE(InfoExtractor): uploader = None uploader_id = None + mpx_account = data['config']['uvpConfig']['default']['mpx_account'] + + metadata = self.get_metadata('%s/%s' % (mpx_account, list(vdata['files'].values())[0]), video_id) + description = vdata.get('description') or metadata.get('description') + duration = int_or_none(vdata.get('duration')) or metadata.get('duration') + + formats = [] + subtitles = {} + for (fkey, vid) in vdata['files'].items(): + if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']: + continue + release_url = 'http://link.theplatform.com/s/%s/%s?format=SMIL&mbr=true' % (mpx_account, vid) + if fkey == 'hds': + release_url += '&manifest=f4m' + tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % fkey) + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + self._sort_formats(formats) + return { - '_type': 'url_transparent', - 'url': tp_link, 'id': video_id, 'display_id': display_id, 'title': title, + 'description': description, + 'thumbnail': metadata.get('thumbnail'), + 'duration': duration, 'uploader': uploader, 'uploader_id': uploader_id, - 'thumbnail': thumbnail, + 'subtitles': subtitles, + 'formats': formats, } diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py index 81f3d7697..2efa200b5 100644 --- a/youtube_dl/extractor/comcarcoff.py +++ b/youtube_dl/extractor/comcarcoff.py @@ -1,10 +1,12 @@ # encoding: utf-8 from __future__ import unicode_literals -import json - from .common import InfoExtractor -from ..utils import parse_iso8601 +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601, +) class ComCarCoffIE(InfoExtractor): @@ -16,6 +18,7 @@ class ComCarCoffIE(InfoExtractor): 'ext': 'mp4', 'upload_date': '20141127', 'timestamp': 1417107600, + 'duration': 1232, 'title': 'Happy Thanksgiving Miranda', 'description': 'Jerry Seinfeld and his special guest Miranda Sings cruise around town in search of coffee, complaining and apologizing along the way.', 'thumbnail': 'http://ccc.crackle.com/images/s5e4_thumb.jpg', @@ -31,9 +34,10 @@ class ComCarCoffIE(InfoExtractor): display_id = 'comediansincarsgettingcoffee.com' webpage = self._download_webpage(url, display_id) - full_data = json.loads(self._search_regex( - r'<script type="application/json" id="videoData">(?P<json>.+?)</script>', - webpage, 'full data json')) + full_data = self._parse_json( + self._search_regex( + r'window\.app\s*=\s*({.+?});\n', webpage, 'full data json'), + display_id)['videoData'] video_id = full_data['activeVideo']['video'] video_data = full_data.get('videos', {}).get(video_id) or full_data['singleshots'][video_id] @@ -45,12 +49,18 @@ class ComCarCoffIE(InfoExtractor): formats = self._extract_m3u8_formats( video_data['mediaUrl'], video_id, ext='mp4') + timestamp = int_or_none(video_data.get('pubDateTime')) or parse_iso8601( + video_data.get('pubDate')) + duration = int_or_none(video_data.get('durationSeconds')) or parse_duration( + video_data.get('duration')) + return { 'id': video_id, 'display_id': display_id, 'title': video_data['title'], 'description': video_data.get('description'), - 'timestamp': parse_iso8601(video_data.get('pubDate')), + 'timestamp': timestamp, + 'duration': duration, 'thumbnails': thumbnails, 'formats': formats, 'webpage_url': 'http://comediansincarsgettingcoffee.com/%s' % (video_data.get('urlSlug', video_data.get('slug'))), diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 76f5b8b05..34a28c126 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -29,6 +29,7 @@ from ..utils import ( clean_html, compiled_regex_type, determine_ext, + error_to_compat_str, ExtractorError, fix_xml_ampersands, float_or_none, @@ -332,7 +333,8 @@ class InfoExtractor(object): return False if errnote is None: errnote = 'Unable to download webpage' - errmsg = '%s: %s' % (errnote, compat_str(err)) + + errmsg = '%s: %s' % (errnote, error_to_compat_str(err)) if fatal: raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) else: @@ -622,7 +624,7 @@ class InfoExtractor(object): else: raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError) as err: - self._downloader.report_warning('parsing .netrc: %s' % compat_str(err)) + self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err)) return (username, password) @@ -882,7 +884,7 @@ class InfoExtractor(object): fatal=fatal) if manifest is False: - return manifest + return [] formats = [] manifest_version = '1.0' @@ -953,7 +955,7 @@ class InfoExtractor(object): errnote=errnote or 'Failed to download m3u8 information', fatal=fatal) if res is False: - return res + return [] m3u8_doc, urlh = res m3u8_url = urlh.geturl() last_info = None diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index 7b685d157..b3ee67018 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -58,18 +58,23 @@ class CSpanIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + video_type = None webpage = self._download_webpage(url, video_id) - matches = re.search(r'data-(prog|clip)id=\'([0-9]+)\'', webpage) - if matches: + # We first look for clipid, because clipprog always appears before + patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')] + results = list(filter(None, (re.search(p, webpage) for p in patterns))) + if results: + matches = results[0] video_type, video_id = matches.groups() - if video_type == 'prog': - video_type = 'program' + video_type = 'clip' if video_type == 'id' else 'program' else: senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) if senate_isvp_url: title = self._og_search_title(webpage) surl = smuggle_url(senate_isvp_url, {'force_title': title}) return self.url_result(surl, 'SenateISVP', video_id, title) + if video_type is None or video_id is None: + raise ExtractorError('unable to find video id and type') def get_text_attr(d, attr): return d.get(attr, {}).get('#text') diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index ab7f3aec4..0c5b6617f 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -7,10 +7,10 @@ import itertools from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( - ExtractorError, determine_ext, + error_to_compat_str, + ExtractorError, int_or_none, parse_iso8601, sanitized_Request, @@ -99,6 +99,11 @@ class DailymotionIE(DailymotionBaseInfoExtractor): { 'url': 'http://www.dailymotion.com/video/xhza0o', 'only_matching': True, + }, + # with subtitles + { + 'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news', + 'only_matching': True, } ] @@ -122,7 +127,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor): webpage, 'comment count', fatal=False)) player_v5 = self._search_regex( - [r'buildPlayer\(({.+?})\);', r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);'], + [r'buildPlayer\(({.+?})\);\n', # See https://github.com/rg3/youtube-dl/issues/7826 + r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);', + r'buildPlayer\(({.+?})\);'], webpage, 'player v5', default=None) if player_v5: player = self._parse_json(player_v5, video_id) @@ -172,11 +179,13 @@ class DailymotionIE(DailymotionBaseInfoExtractor): uploader_id = metadata.get('owner', {}).get('id') subtitles = {} - for subtitle_lang, subtitle in metadata.get('subtitles', {}).get('data', {}).items(): - subtitles[subtitle_lang] = [{ - 'ext': determine_ext(subtitle_url), - 'url': subtitle_url, - } for subtitle_url in subtitle.get('urls', [])] + subtitles_data = metadata.get('subtitles', {}).get('data', {}) + if subtitles_data and isinstance(subtitles_data, dict): + for subtitle_lang, subtitle in subtitles_data.items(): + subtitles[subtitle_lang] = [{ + 'ext': determine_ext(subtitle_url), + 'url': subtitle_url, + } for subtitle_url in subtitle.get('urls', [])] return { 'id': video_id, @@ -269,7 +278,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id, video_id, note=False) except ExtractorError as err: - self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err)) + self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err)) return {} info = json.loads(sub_list) if (info['total'] > 0): diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index 934da765e..9a94cf361 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -25,6 +25,18 @@ class DaumIE(InfoExtractor): 'duration': 3868, }, }, { + # Test for https://github.com/rg3/youtube-dl/issues/7949 + 'url': 'http://tvpot.daum.net/mypot/View.do?ownerid=M1O35s8HPOo0&clipid=73147290', + 'md5': 'c92d78bcee4424451f1667f275c1dc97', + 'info_dict': { + 'id': '73147290', + 'ext': 'mp4', + 'title': '싸이 - 나팔바지 [유희열의 스케치북] 299회 20151218', + 'description': '싸이 - 나팔바지', + 'upload_date': '20151219', + 'duration': 232, + }, + }, { 'url': 'http://tvpot.daum.net/v/vab4dyeDBysyBssyukBUjBz', 'only_matching': True, }, { @@ -37,9 +49,11 @@ class DaumIE(InfoExtractor): video_id = mobj.group('id') canonical_url = 'http://tvpot.daum.net/v/%s' % video_id webpage = self._download_webpage(canonical_url, video_id) + og_url = self._og_search_url(webpage, default=None) or self._search_regex( + r'<link[^>]+rel=(["\'])canonical\1[^>]+href=(["\'])(?P<url>.+?)\2', + webpage, 'canonical url', group='url') full_id = self._search_regex( - r'src=["\']http://videofarm\.daum\.net/controller/video/viewer/Video\.html\?.*?vid=(.+?)[&"\']', - webpage, 'full id') + r'tvpot\.daum\.net/v/([^/]+)', og_url, 'full id') query = compat_urllib_parse.urlencode({'vid': full_id}) info = self._download_xml( 'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id, diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index 9737cff14..0d140f12f 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -1,26 +1,89 @@ # coding: utf-8 from __future__ import unicode_literals +import re +import base64 + from .common import InfoExtractor from ..compat import compat_urllib_parse from ..utils import ( int_or_none, parse_iso8601, sanitized_Request, + smuggle_url, + unsmuggle_url, ) class DCNIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/.+|show/\d+/.+?)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?' + + def _real_extract(self, url): + show_id, video_id, season_id = re.match(self._VALID_URL, url).groups() + if video_id and int(video_id) > 0: + return self.url_result( + 'http://www.dcndigital.ae/media/%s' % video_id, 'DCNVideo') + elif season_id and int(season_id) > 0: + return self.url_result(smuggle_url( + 'http://www.dcndigital.ae/program/season/%s' % season_id, + {'show_id': show_id}), 'DCNSeason') + else: + return self.url_result( + 'http://www.dcndigital.ae/program/%s' % show_id, 'DCNSeason') + + +class DCNBaseIE(InfoExtractor): + def _extract_video_info(self, video_data, video_id, is_live): + title = video_data.get('title_en') or video_data['title_ar'] + img = video_data.get('img') + thumbnail = 'http://admin.mangomolo.com/analytics/%s' % img if img else None + duration = int_or_none(video_data.get('duration')) + description = video_data.get('description_en') or video_data.get('description_ar') + timestamp = parse_iso8601(video_data.get('create_time'), ' ') + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'is_live': is_live, + } + + def _extract_video_formats(self, webpage, video_id, entry_protocol): + formats = [] + m3u8_url = self._html_search_regex( + r'file\s*:\s*"([^"]+)', webpage, 'm3u8 url', fatal=False) + if m3u8_url: + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=None) + if m3u8_formats: + formats.extend(m3u8_formats) + + rtsp_url = self._search_regex( + r'<a[^>]+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False) + if rtsp_url: + formats.append({ + 'url': rtsp_url, + 'format_id': 'rtsp', + }) + + self._sort_formats(formats) + return formats + + +class DCNVideoIE(DCNBaseIE): + IE_NAME = 'dcn:video' + _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/[^/]+|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)' _TEST = { - 'url': 'http://www.dcndigital.ae/#/show/199074/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375/6887', + 'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375', 'info_dict': { 'id': '17375', 'ext': 'mp4', 'title': 'رحلة العمر : الحلقة 1', 'description': 'md5:0156e935d870acb8ef0a66d24070c6d6', - 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 2041, 'timestamp': 1227504126, 'upload_date': '20081124', @@ -37,46 +100,95 @@ class DCNIE(InfoExtractor): request = sanitized_Request( 'http://admin.mangomolo.com/analytics/index.php/plus/video?id=%s' % video_id, headers={'Origin': 'http://www.dcndigital.ae'}) - - video = self._download_json(request, video_id) - title = video.get('title_en') or video['title_ar'] + video_data = self._download_json(request, video_id) + info = self._extract_video_info(video_data, video_id, False) webpage = self._download_webpage( 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' + compat_urllib_parse.urlencode({ - 'id': video['id'], - 'user_id': video['user_id'], - 'signature': video['signature'], + 'id': video_data['id'], + 'user_id': video_data['user_id'], + 'signature': video_data['signature'], 'countries': 'Q0M=', 'filter': 'DENY', }), video_id) + info['formats'] = self._extract_video_formats(webpage, video_id, 'm3u8_native') + return info - m3u8_url = self._html_search_regex(r'file:\s*"([^"]+)', webpage, 'm3u8 url') - formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') - rtsp_url = self._search_regex( - r'<a[^>]+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False) - if rtsp_url: - formats.append({ - 'url': rtsp_url, - 'format_id': 'rtsp', +class DCNLiveIE(DCNBaseIE): + IE_NAME = 'dcn:live' + _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?live/(?P<id>\d+)' + + def _real_extract(self, url): + channel_id = self._match_id(url) + + request = sanitized_Request( + 'http://admin.mangomolo.com/analytics/index.php/plus/getchanneldetails?channel_id=%s' % channel_id, + headers={'Origin': 'http://www.dcndigital.ae'}) + + channel_data = self._download_json(request, channel_id) + info = self._extract_video_info(channel_data, channel_id, True) + + webpage = self._download_webpage( + 'http://admin.mangomolo.com/analytics/index.php/customers/embed/index?' + + compat_urllib_parse.urlencode({ + 'id': base64.b64encode(channel_data['user_id'].encode()).decode(), + 'channelid': base64.b64encode(channel_data['id'].encode()).decode(), + 'signature': channel_data['signature'], + 'countries': 'Q0M=', + 'filter': 'DENY', + }), channel_id) + info['formats'] = self._extract_video_formats(webpage, channel_id, 'm3u8') + return info + + +class DCNSeasonIE(InfoExtractor): + IE_NAME = 'dcn:season' + _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))' + _TEST = { + 'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A', + 'info_dict': + { + 'id': '7910', + 'title': 'محاضرات الشيخ الشعراوي', + }, + 'playlist_mincount': 27, + } + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + show_id, season_id = re.match(self._VALID_URL, url).groups() + + data = {} + if season_id: + data['season'] = season_id + show_id = smuggled_data.get('show_id') + if show_id is None: + request = sanitized_Request( + 'http://admin.mangomolo.com/analytics/index.php/plus/season_info?id=%s' % season_id, + headers={'Origin': 'http://www.dcndigital.ae'}) + season = self._download_json(request, season_id) + show_id = season['id'] + data['show_id'] = show_id + request = sanitized_Request( + 'http://admin.mangomolo.com/analytics/index.php/plus/show', + compat_urllib_parse.urlencode(data), + { + 'Origin': 'http://www.dcndigital.ae', + 'Content-Type': 'application/x-www-form-urlencoded' }) - self._sort_formats(formats) + show = self._download_json(request, show_id) + if not season_id: + season_id = show['default_season'] + for season in show['seasons']: + if season['id'] == season_id: + title = season.get('title_en') or season['title_ar'] - img = video.get('img') - thumbnail = 'http://admin.mangomolo.com/analytics/%s' % img if img else None - duration = int_or_none(video.get('duration')) - description = video.get('description_en') or video.get('description_ar') - timestamp = parse_iso8601(video.get('create_time') or video.get('update_time'), ' ') + entries = [] + for video in show['videos']: + entries.append(self.url_result( + 'http://www.dcndigital.ae/media/%s' % video['id'], 'DCNVideo')) - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - } + return self.playlist_result(entries, season_id, title) diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index d836c1a6c..60ed438f8 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import itertools -from .common import InfoExtractor +from .amp import AMPIE from ..compat import ( compat_HTTPError, compat_urllib_parse, @@ -12,14 +12,11 @@ from ..compat import ( from ..utils import ( ExtractorError, clean_html, - determine_ext, - int_or_none, - parse_iso8601, sanitized_Request, ) -class DramaFeverBaseIE(InfoExtractor): +class DramaFeverBaseIE(AMPIE): _LOGIN_URL = 'https://www.dramafever.com/accounts/login/' _NETRC_MACHINE = 'dramafever' @@ -80,60 +77,25 @@ class DramaFeverIE(DramaFeverBaseIE): 'timestamp': 1404336058, 'upload_date': '20140702', 'duration': 343, - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): video_id = self._match_id(url).replace('/', '.') try: - feed = self._download_json( - 'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id, - video_id, 'Downloading episode JSON')['channel']['item'] + info = self._extract_feed_info( + 'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError): raise ExtractorError( 'Currently unavailable in your country.', expected=True) raise - media_group = feed.get('media-group', {}) - - formats = [] - for media_content in media_group['media-content']: - src = media_content.get('@attributes', {}).get('url') - if not src: - continue - ext = determine_ext(src) - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - src, video_id, f4m_id='hds')) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', m3u8_id='hls')) - else: - formats.append({ - 'url': src, - }) - self._sort_formats(formats) - - title = media_group.get('media-title') - description = media_group.get('media-description') - duration = int_or_none(media_group['media-content'][0].get('@attributes', {}).get('duration')) - thumbnail = self._proto_relative_url( - media_group.get('media-thumbnail', {}).get('@attributes', {}).get('url')) - timestamp = parse_iso8601(feed.get('pubDate'), ' ') - - subtitles = {} - for media_subtitle in media_group.get('media-subTitle', []): - lang = media_subtitle.get('@attributes', {}).get('lang') - href = media_subtitle.get('@attributes', {}).get('href') - if not lang or not href: - continue - subtitles[lang] = [{ - 'ext': 'ttml', - 'url': href, - }] - series_id, episode_number = video_id.split('.') episode_info = self._download_json( # We only need a single episode info, so restricting page size to one episode @@ -146,21 +108,12 @@ class DramaFeverIE(DramaFeverBaseIE): if value: subfile = value[0].get('subfile') or value[0].get('new_subfile') if subfile and subfile != 'http://www.dramafever.com/st/': - subtitles.setdefault('English', []).append({ + info['subtitiles'].setdefault('English', []).append({ 'ext': 'srt', 'url': subfile, }) - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } + return info class DramaFeverSeriesIE(DramaFeverBaseIE): diff --git a/youtube_dl/extractor/ellentv.py b/youtube_dl/extractor/ellentv.py index 02c6a4615..476cce2d0 100644 --- a/youtube_dl/extractor/ellentv.py +++ b/youtube_dl/extractor/ellentv.py @@ -13,12 +13,12 @@ class EllenTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?:ellentv|ellentube)\.com/videos/(?P<id>[a-z0-9_-]+)' _TEST = { 'url': 'http://www.ellentv.com/videos/0-ipq1gsai/', - 'md5': '8e3c576bf2e9bfff4d76565f56f94c9c', + 'md5': '4294cf98bc165f218aaa0b89e0fd8042', 'info_dict': { 'id': '0_ipq1gsai', - 'ext': 'mp4', + 'ext': 'mov', 'title': 'Fast Fingers of Fate', - 'description': 'md5:587e79fbbd0d73b148bc596d99ce48e6', + 'description': 'md5:3539013ddcbfa64b2a6d1b38d910868a', 'timestamp': 1428035648, 'upload_date': '20150403', 'uploader_id': 'batchUser', diff --git a/youtube_dl/extractor/esri.py b/youtube_dl/extractor/esri.py index bf5d2019f..d4205d7fb 100644 --- a/youtube_dl/extractor/esri.py +++ b/youtube_dl/extractor/esri.py @@ -61,7 +61,7 @@ class EsriVideoIE(InfoExtractor): webpage, 'duration', fatal=False)) upload_date = unified_strdate(self._html_search_meta( - 'last-modified', webpage, 'upload date', fatal=None)) + 'last-modified', webpage, 'upload date', fatal=False)) return { 'id': video_id, diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 321eec59e..5e43f2359 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -7,11 +7,11 @@ import socket from .common import InfoExtractor from ..compat import ( compat_http_client, - compat_str, compat_urllib_error, compat_urllib_parse_unquote, ) from ..utils import ( + error_to_compat_str, ExtractorError, limit_length, sanitized_Request, @@ -74,7 +74,7 @@ class FacebookIE(InfoExtractor): return login_page_req = sanitized_Request(self._LOGIN_URL) - login_page_req.add_header('Cookie', 'locale=en_US') + self._set_cookie('facebook.com', 'locale', 'en_US') login_page = self._download_webpage(login_page_req, None, note='Downloading login page', errnote='Unable to download login page') @@ -100,13 +100,25 @@ class FacebookIE(InfoExtractor): login_results = self._download_webpage(request, None, note='Logging in', errnote='unable to fetch login page') if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None: + error = self._html_search_regex( + r'(?s)<div[^>]+class=(["\']).*?login_error_box.*?\1[^>]*><div[^>]*>.*?</div><div[^>]*>(?P<error>.+?)</div>', + login_results, 'login error', default=None, group='error') + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) self._downloader.report_warning('unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.') return + fb_dtsg = self._search_regex( + r'name="fb_dtsg" value="(.+?)"', login_results, 'fb_dtsg', default=None) + h = self._search_regex( + r'name="h"\s+(?:\w+="[^"]+"\s+)*?value="([^"]+)"', login_results, 'h', default=None) + + if not fb_dtsg or not h: + return + check_form = { - 'fb_dtsg': self._search_regex(r'name="fb_dtsg" value="(.+?)"', login_results, 'fb_dtsg'), - 'h': self._search_regex( - r'name="h"\s+(?:\w+="[^"]+"\s+)*?value="([^"]+)"', login_results, 'h'), + 'fb_dtsg': fb_dtsg, + 'h': h, 'name_action_selected': 'dont_save', } check_req = sanitized_Request(self._CHECKPOINT_URL, urlencode_postdata(check_form)) @@ -116,7 +128,7 @@ class FacebookIE(InfoExtractor): if re.search(r'id="checkpointSubmitButton"', check_response) is not None: self._downloader.report_warning('Unable to confirm login, you have to login in your brower and authorize the login.') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning('unable to log in: %s' % compat_str(err)) + self._downloader.report_warning('unable to log in: %s' % error_to_compat_str(err)) return def _real_initialize(self): diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py index cebdd0193..6f9b003c2 100644 --- a/youtube_dl/extractor/faz.py +++ b/youtube_dl/extractor/faz.py @@ -2,6 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ( + xpath_element, + xpath_text, + int_or_none, +) class FazIE(InfoExtractor): @@ -37,31 +42,32 @@ class FazIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + description = self._og_search_description(webpage) config_xml_url = self._search_regex( - r'writeFLV\(\'(.+?)\',', webpage, 'config xml url') + r'videoXMLURL\s*=\s*"([^"]+)', webpage, 'config xml url') config = self._download_xml( config_xml_url, video_id, 'Downloading config xml') - encodings = config.find('ENCODINGS') + encodings = xpath_element(config, 'ENCODINGS', 'encodings', True) formats = [] for pref, code in enumerate(['LOW', 'HIGH', 'HQ']): - encoding = encodings.find(code) - if encoding is None: - continue - encoding_url = encoding.find('FILENAME').text - formats.append({ - 'url': encoding_url, - 'format_id': code.lower(), - 'quality': pref, - }) + encoding = xpath_element(encodings, code) + if encoding: + encoding_url = xpath_text(encoding, 'FILENAME') + if encoding_url: + formats.append({ + 'url': encoding_url, + 'format_id': code.lower(), + 'quality': pref, + 'tbr': int_or_none(xpath_text(encoding, 'AVERAGEBITRATE')), + }) self._sort_formats(formats) - descr = self._html_search_regex( - r'<p class="Content Copy">(.*?)</p>', webpage, 'description', fatal=False) return { 'id': video_id, 'title': self._og_search_title(webpage), 'formats': formats, - 'description': descr, - 'thumbnail': config.find('STILL/STILL_BIG').text, + 'description': description.strip() if description else None, + 'thumbnail': xpath_text(config, 'STILL/STILL_BIG'), + 'duration': int_or_none(xpath_text(config, 'DURATION')), } diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py index 40ea27895..5f6e65dae 100644 --- a/youtube_dl/extractor/fktv.py +++ b/youtube_dl/extractor/fktv.py @@ -1,12 +1,10 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( clean_html, determine_ext, - ExtractorError, + js_to_json, ) @@ -32,24 +30,22 @@ class FKTVIE(InfoExtractor): 'http://fernsehkritik.tv/folge-%s/play' % episode, episode) title = clean_html(self._html_search_regex( '<h3>([^<]+)</h3>', webpage, 'title')) - matches = re.search( - r'(?s)<video(?:(?!poster)[^>])+(?:poster="([^"]+)")?[^>]*>(.*)</video>', - webpage) - if matches is None: - raise ExtractorError('Unable to extract the video') - - poster, sources = matches.groups() - if poster is None: - self.report_warning('unable to extract thumbnail') - - urls = re.findall(r'<source[^>]+src="([^"]+)"', sources) - formats = [{ - 'url': furl, - 'format_id': determine_ext(furl), - } for furl in urls] + thumbnail = self._search_regex(r'POSTER\s*=\s*"([^"]+)', webpage, 'thumbnail', fatal=False) + sources = self._parse_json(self._search_regex(r'(?s)MEDIA\s*=\s*(\[.+?\]);', webpage, 'media'), episode, js_to_json) + + formats = [] + for source in sources: + furl = source.get('src') + if furl: + formats.append({ + 'url': furl, + 'format_id': determine_ext(furl), + }) + self._sort_formats(formats) + return { 'id': episode, 'title': title, 'formats': formats, - 'thumbnail': poster, + 'thumbnail': thumbnail, } diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 91cd46e76..18f439df9 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -1,67 +1,93 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, - find_xpath_attr, - sanitized_Request, + int_or_none, + qualities, ) class FlickrIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.|secure\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*' + _VALID_URL = r'https?://(?:www\.|secure\.)?flickr\.com/photos/[\w\-_@]+/(?P<id>\d+)' _TEST = { 'url': 'http://www.flickr.com/photos/forestwander-nature-pictures/5645318632/in/photostream/', - 'md5': '6fdc01adbc89d72fc9c4f15b4a4ba87b', + 'md5': '164fe3fa6c22e18d448d4d5af2330f31', 'info_dict': { 'id': '5645318632', - 'ext': 'mp4', - "description": "Waterfalls in the Springtime at Dark Hollow Waterfalls. These are located just off of Skyline Drive in Virginia. They are only about 6/10 of a mile hike but it is a pretty steep hill and a good climb back up.", - "uploader_id": "forestwander-nature-pictures", - "title": "Dark Hollow Waterfalls" + 'ext': 'mpg', + 'description': 'Waterfalls in the Springtime at Dark Hollow Waterfalls. These are located just off of Skyline Drive in Virginia. They are only about 6/10 of a mile hike but it is a pretty steep hill and a good climb back up.', + 'title': 'Dark Hollow Waterfalls', + 'duration': 19, + 'timestamp': 1303528740, + 'upload_date': '20110423', + 'uploader_id': '10922353@N03', + 'uploader': 'Forest Wander', + 'comment_count': int, + 'view_count': int, + 'tags': list, } } - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + _API_BASE_URL = 'https://api.flickr.com/services/rest?' - video_id = mobj.group('id') - video_uploader_id = mobj.group('uploader_id') - webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id - req = sanitized_Request(webpage_url) - req.add_header( - 'User-Agent', - # it needs a more recent version - 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20150101 Firefox/38.0 (Chrome)') - webpage = self._download_webpage(req, video_id) + def _call_api(self, method, video_id, api_key, note, secret=None): + query = { + 'photo_id': video_id, + 'method': 'flickr.%s' % method, + 'api_key': api_key, + 'format': 'json', + 'nojsoncallback': 1, + } + if secret: + query['secret'] = secret + data = self._download_json(self._API_BASE_URL + compat_urllib_parse.urlencode(query), video_id, note) + if data['stat'] != 'ok': + raise ExtractorError(data['message']) + return data - secret = self._search_regex(r'secret"\s*:\s*"(\w+)"', webpage, 'secret') + def _real_extract(self, url): + video_id = self._match_id(url) - first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self' - first_xml = self._download_xml(first_url, video_id, 'Downloading first data webpage') + api_key = self._download_json( + 'https://www.flickr.com/hermes_error_beacon.gne', video_id, + 'Downloading api key')['site_key'] - node_id = find_xpath_attr( - first_xml, './/{http://video.yahoo.com/YEP/1.0/}Item', 'id', - 'id').text + video_info = self._call_api( + 'photos.getInfo', video_id, api_key, 'Downloading video info')['photo'] + if video_info['media'] == 'video': + streams = self._call_api( + 'video.getStreamInfo', video_id, api_key, + 'Downloading streams info', video_info['secret'])['streams'] - second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1' - second_xml = self._download_xml(second_url, video_id, 'Downloading second data webpage') + preference = qualities( + ['288p', 'iphone_wifi', '100', '300', '700', '360p', 'appletv', '720p', '1080p', 'orig']) - self.report_extraction(video_id) + formats = [] + for stream in streams['stream']: + stream_type = str(stream.get('type')) + formats.append({ + 'format_id': stream_type, + 'url': stream['_content'], + 'preference': preference(stream_type), + }) + self._sort_formats(formats) - stream = second_xml.find('.//STREAM') - if stream is None: - raise ExtractorError('Unable to extract video url') - video_url = stream.attrib['APP'] + stream.attrib['FULLPATH'] + owner = video_info.get('owner', {}) - return { - 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'uploader_id': video_uploader_id, - } + return { + 'id': video_id, + 'title': video_info['title']['_content'], + 'description': video_info.get('description', {}).get('_content'), + 'formats': formats, + 'timestamp': int_or_none(video_info.get('dateuploaded')), + 'duration': int_or_none(video_info.get('video', {}).get('duration')), + 'uploader_id': owner.get('nsid'), + 'uploader': owner.get('realname'), + 'comment_count': int_or_none(video_info.get('comments', {}).get('_content')), + 'view_count': int_or_none(video_info.get('views')), + 'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])] + } + else: + raise ExtractorError('not a video', expected=True) diff --git a/youtube_dl/extractor/footyroom.py b/youtube_dl/extractor/footyroom.py index 4c7dbca40..370fd006f 100644 --- a/youtube_dl/extractor/footyroom.py +++ b/youtube_dl/extractor/footyroom.py @@ -13,6 +13,7 @@ class FootyRoomIE(InfoExtractor): 'title': 'Schalke 04 0 – 2 Real Madrid', }, 'playlist_count': 3, + 'skip': 'Video for this match is not available', }, { 'url': 'http://footyroom.com/georgia-0-2-germany-2015-03/', 'info_dict': { diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 3a4a59135..318ac013d 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -2,14 +2,10 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor -from ..utils import ( - parse_iso8601, - int_or_none, -) +from .amp import AMPIE -class FoxNewsIE(InfoExtractor): +class FoxNewsIE(AMPIE): IE_DESC = 'Fox News and Fox Business Video' _VALID_URL = r'https?://(?P<host>video\.fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)' _TESTS = [ @@ -20,10 +16,10 @@ class FoxNewsIE(InfoExtractor): 'id': '3937480', 'ext': 'flv', 'title': 'Frozen in Time', - 'description': 'Doctors baffled by 16-year-old girl that is the size of a toddler', + 'description': '16-year-old girl is size of toddler', 'duration': 265, - 'timestamp': 1304411491, - 'upload_date': '20110503', + # 'timestamp': 1304411491, + # 'upload_date': '20110503', 'thumbnail': 're:^https?://.*\.jpg$', }, }, @@ -34,10 +30,10 @@ class FoxNewsIE(InfoExtractor): 'id': '3922535568001', 'ext': 'mp4', 'title': "Rep. Luis Gutierrez on if Obama's immigration plan is legal", - 'description': "Congressman discusses the president's executive action", + 'description': "Congressman discusses president's plan", 'duration': 292, - 'timestamp': 1417662047, - 'upload_date': '20141204', + # 'timestamp': 1417662047, + # 'upload_date': '20141204', 'thumbnail': 're:^https?://.*\.jpg$', }, }, @@ -52,52 +48,9 @@ class FoxNewsIE(InfoExtractor): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - host = mobj.group('host') + host, video_id = re.match(self._VALID_URL, url).groups() - video = self._download_json( - 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id), video_id) - - item = video['channel']['item'] - title = item['title'] - description = item['description'] - timestamp = parse_iso8601(item['dc-date']) - - media_group = item['media-group'] - duration = None - formats = [] - for media in media_group['media-content']: - attributes = media['@attributes'] - video_url = attributes['url'] - if video_url.endswith('.f4m'): - formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id)) - elif video_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(video_url, video_id, 'flv')) - elif not video_url.endswith('.smil'): - duration = int_or_none(attributes.get('duration')) - formats.append({ - 'url': video_url, - 'format_id': media['media-category']['@attributes']['label'], - 'preference': 1, - 'vbr': int_or_none(attributes.get('bitrate')), - 'filesize': int_or_none(attributes.get('fileSize')) - }) - self._sort_formats(formats) - - media_thumbnail = media_group['media-thumbnail']['@attributes'] - thumbnails = [{ - 'url': media_thumbnail['url'], - 'width': int_or_none(media_thumbnail.get('width')), - 'height': int_or_none(media_thumbnail.get('height')), - }] if media_thumbnail else [] - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - 'thumbnails': thumbnails, - } + info = self._extract_feed_info( + 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id)) + info['id'] = video_id + return info diff --git a/youtube_dl/extractor/franceinter.py b/youtube_dl/extractor/franceinter.py index 6613ee17a..fdc51f44f 100644 --- a/youtube_dl/extractor/franceinter.py +++ b/youtube_dl/extractor/franceinter.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import int_or_none @@ -23,8 +21,7 @@ class FranceInterIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -33,7 +30,7 @@ class FranceInterIE(InfoExtractor): video_url = 'http://www.franceinter.fr/' + path title = self._html_search_regex( - r'<span class="title">(.+?)</span>', webpage, 'title') + r'<span class="title-diffusion">(.+?)</span>', webpage, 'title') description = self._html_search_regex( r'<span class="description">(.*?)</span>', webpage, 'description', fatal=False) diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py new file mode 100644 index 000000000..d1a95d87f --- /dev/null +++ b/youtube_dl/extractor/funimation.py @@ -0,0 +1,193 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + determine_ext, + encode_dict, + int_or_none, + sanitized_Request, + ExtractorError, + urlencode_postdata +) + + +class FunimationIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?funimation\.com/shows/[^/]+/videos/(?:official|promotional)/(?P<id>[^/?#&]+)' + + _NETRC_MACHINE = 'funimation' + + _TESTS = [{ + 'url': 'http://www.funimation.com/shows/air/videos/official/breeze', + 'info_dict': { + 'id': '658', + 'display_id': 'breeze', + 'ext': 'mp4', + 'title': 'Air - 1 - Breeze', + 'description': 'md5:1769f43cd5fc130ace8fd87232207892', + 'thumbnail': 're:https?://.*\.jpg', + }, + }, { + 'url': 'http://www.funimation.com/shows/hacksign/videos/official/role-play', + 'info_dict': { + 'id': '31128', + 'display_id': 'role-play', + 'ext': 'mp4', + 'title': '.hack//SIGN - 1 - Role Play', + 'description': 'md5:b602bdc15eef4c9bbb201bb6e6a4a2dd', + 'thumbnail': 're:https?://.*\.jpg', + }, + }, { + 'url': 'http://www.funimation.com/shows/attack-on-titan-junior-high/videos/promotional/broadcast-dub-preview', + 'info_dict': { + 'id': '9635', + 'display_id': 'broadcast-dub-preview', + 'ext': 'mp4', + 'title': 'Attack on Titan: Junior High - Broadcast Dub Preview', + 'description': 'md5:f8ec49c0aff702a7832cd81b8a44f803', + 'thumbnail': 're:https?://.*\.(?:jpg|png)', + }, + }] + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + data = urlencode_postdata(encode_dict({ + 'email_field': username, + 'password_field': password, + })) + login_request = sanitized_Request('http://www.funimation.com/login', data, headers={ + 'User-Agent': 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0', + 'Content-Type': 'application/x-www-form-urlencoded' + }) + login_page = self._download_webpage( + login_request, None, 'Logging in as %s' % username) + if any(p in login_page for p in ('funimation.com/logout', '>Log Out<')): + return + error = self._html_search_regex( + r'(?s)<div[^>]+id=["\']errorMessages["\'][^>]*>(.+?)</div>', + login_page, 'error messages', default=None) + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + def _real_initialize(self): + self._login() + + def _real_extract(self, url): + display_id = self._match_id(url) + + errors = [] + formats = [] + + ERRORS_MAP = { + 'ERROR_MATURE_CONTENT_LOGGED_IN': 'matureContentLoggedIn', + 'ERROR_MATURE_CONTENT_LOGGED_OUT': 'matureContentLoggedOut', + 'ERROR_SUBSCRIPTION_LOGGED_OUT': 'subscriptionLoggedOut', + 'ERROR_VIDEO_EXPIRED': 'videoExpired', + 'ERROR_TERRITORY_UNAVAILABLE': 'territoryUnavailable', + 'SVODBASIC_SUBSCRIPTION_IN_PLAYER': 'basicSubscription', + 'SVODNON_SUBSCRIPTION_IN_PLAYER': 'nonSubscription', + 'ERROR_PLAYER_NOT_RESPONDING': 'playerNotResponding', + 'ERROR_UNABLE_TO_CONNECT_TO_CDN': 'unableToConnectToCDN', + 'ERROR_STREAM_NOT_FOUND': 'streamNotFound', + } + + USER_AGENTS = ( + # PC UA is served with m3u8 that provides some bonus lower quality formats + ('pc', 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0'), + # Mobile UA allows to extract direct links and also does not fail when + # PC UA fails with hulu error (e.g. + # http://www.funimation.com/shows/hacksign/videos/official/role-play) + ('mobile', 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.114 Mobile Safari/537.36'), + ) + + for kind, user_agent in USER_AGENTS: + request = sanitized_Request(url) + request.add_header('User-Agent', user_agent) + webpage = self._download_webpage( + request, display_id, 'Downloading %s webpage' % kind) + + playlist = self._parse_json( + self._search_regex( + r'var\s+playersData\s*=\s*(\[.+?\]);\n', + webpage, 'players data'), + display_id)[0]['playlist'] + + items = next(item['items'] for item in playlist if item.get('items')) + item = next(item for item in items if item.get('itemAK') == display_id) + + error_messages = {} + video_error_messages = self._search_regex( + r'var\s+videoErrorMessages\s*=\s*({.+?});\n', + webpage, 'error messages', default=None) + if video_error_messages: + error_messages_json = self._parse_json(video_error_messages, display_id, fatal=False) + if error_messages_json: + for _, error in error_messages_json.items(): + type_ = error.get('type') + description = error.get('description') + content = error.get('content') + if type_ == 'text' and description and content: + error_message = ERRORS_MAP.get(description) + if error_message: + error_messages[error_message] = content + + for video in item.get('videoSet', []): + auth_token = video.get('authToken') + if not auth_token: + continue + funimation_id = video.get('FUNImationID') or video.get('videoId') + preference = 1 if video.get('languageMode') == 'dub' else 0 + if not auth_token.startswith('?'): + auth_token = '?%s' % auth_token + for quality, height in (('sd', 480), ('hd', 720), ('hd1080', 1080)): + format_url = video.get('%sUrl' % quality) + if not format_url: + continue + if not format_url.startswith(('http', '//')): + errors.append(format_url) + continue + if determine_ext(format_url) == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + format_url + auth_token, display_id, 'mp4', entry_protocol='m3u8_native', + preference=preference, m3u8_id='%s-hls' % funimation_id, fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + else: + tbr = int_or_none(self._search_regex( + r'-(\d+)[Kk]', format_url, 'tbr', default=None)) + formats.append({ + 'url': format_url + auth_token, + 'format_id': '%s-http-%dp' % (funimation_id, height), + 'height': height, + 'tbr': tbr, + 'preference': preference, + }) + + if not formats and errors: + raise ExtractorError( + '%s returned error: %s' + % (self.IE_NAME, clean_html(error_messages.get(errors[0], errors[0]))), + expected=True) + + self._sort_formats(formats) + + title = item['title'] + artist = item.get('artist') + if artist: + title = '%s - %s' % (artist, title) + description = self._og_search_description(webpage) or item.get('description') + thumbnail = self._og_search_thumbnail(webpage) or item.get('posterUrl') + video_id = item.get('itemId') or display_id + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index e3bdff2d8..3c3066e38 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -44,7 +44,6 @@ from .myvi import MyviIE from .condenast import CondeNastIE from .udn import UDNEmbedIE from .senateisvp import SenateISVPIE -from .bliptv import BlipTVIE from .svt import SVTIE from .pornhub import PornHubIE from .xhamster import XHamsterEmbedIE @@ -54,6 +53,9 @@ from .onionstudios import OnionStudiosIE from .snagfilms import SnagFilmsEmbedIE from .screenwavemedia import ScreenwaveMediaIE from .mtv import MTVServicesEmbeddedIE +from .pladform import PladformIE +from .googledrive import GoogleDriveIE +from .jwplatform import JWPlatformIE class GenericIE(InfoExtractor): @@ -1439,11 +1441,6 @@ class GenericIE(InfoExtractor): 'id': match.group('id') } - # Look for embedded blip.tv player - bliptv_url = BlipTVIE._extract_url(webpage) - if bliptv_url: - return self.url_result(bliptv_url, 'BlipTV') - # Look for SVT player svt_url = SVTIE._extract_url(webpage) if svt_url: @@ -1741,10 +1738,9 @@ class GenericIE(InfoExtractor): return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform') # Look for Pladform embeds - mobj = re.search( - r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Pladform') + pladform_url = PladformIE._extract_url(webpage) + if pladform_url: + return self.url_result(pladform_url) # Look for Playwire embeds mobj = re.search( @@ -1769,6 +1765,11 @@ class GenericIE(InfoExtractor): if nbc_sports_url: return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') + # Look for Google Drive embeds + google_drive_url = GoogleDriveIE._extract_url(webpage) + if google_drive_url: + return self.url_result(google_drive_url, 'GoogleDrive') + # Look for UDN embeds mobj = re.search( r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage) @@ -1796,6 +1797,11 @@ class GenericIE(InfoExtractor): if snagfilms_url: return self.url_result(snagfilms_url) + # Look for JWPlatform embeds + jwplatform_url = JWPlatformIE._extract_url(webpage) + if jwplatform_url: + return self.url_result(jwplatform_url, 'JWPlatform') + # Look for ScreenwaveMedia embeds mobj = re.search(ScreenwaveMediaIE.EMBED_PATTERN, webpage) if mobj is not None: diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py new file mode 100644 index 000000000..f354c9c7a --- /dev/null +++ b/youtube_dl/extractor/googledrive.py @@ -0,0 +1,88 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, +) + + +class GoogleDriveIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28})' + _TEST = { + 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', + 'md5': '881f7700aec4f538571fa1e0eed4a7b6', + 'info_dict': { + 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ', + 'ext': 'mp4', + 'title': 'Big Buck Bunny.mp4', + 'duration': 46, + } + } + _FORMATS_EXT = { + '5': 'flv', + '6': 'flv', + '13': '3gp', + '17': '3gp', + '18': 'mp4', + '22': 'mp4', + '34': 'flv', + '35': 'flv', + '36': '3gp', + '37': 'mp4', + '38': 'mp4', + '43': 'webm', + '44': 'webm', + '45': 'webm', + '46': 'webm', + '59': 'mp4', + } + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})', + webpage) + if mobj: + return 'https://drive.google.com/file/d/%s' % mobj.group('id') + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://docs.google.com/file/d/%s' % video_id, video_id, encoding='unicode_escape') + + reason = self._search_regex(r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) + if reason: + raise ExtractorError(reason) + + title = self._search_regex(r'"title"\s*,\s*"([^"]+)', webpage, 'title') + duration = int_or_none(self._search_regex( + r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', default=None)) + fmt_stream_map = self._search_regex( + r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, 'fmt stream map').split(',') + fmt_list = self._search_regex(r'"fmt_list"\s*,\s*"([^"]+)', webpage, 'fmt_list').split(',') + + formats = [] + for fmt, fmt_stream in zip(fmt_list, fmt_stream_map): + fmt_id, fmt_url = fmt_stream.split('|') + resolution = fmt.split('/')[1] + width, height = resolution.split('x') + formats.append({ + 'url': fmt_url, + 'format_id': fmt_id, + 'resolution': resolution, + 'width': int_or_none(width), + 'height': int_or_none(height), + 'ext': self._FORMATS_EXT[fmt_id], + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': self._og_search_thumbnail(webpage), + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/gputechconf.py b/youtube_dl/extractor/gputechconf.py new file mode 100644 index 000000000..145b55bf3 --- /dev/null +++ b/youtube_dl/extractor/gputechconf.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + xpath_element, + xpath_text, + int_or_none, + parse_duration, +) + + +class GPUTechConfIE(InfoExtractor): + _VALID_URL = r'https?://on-demand\.gputechconf\.com/gtc/2015/video/S(?P<id>\d+)\.html' + _TEST = { + 'url': 'http://on-demand.gputechconf.com/gtc/2015/video/S5156.html', + 'md5': 'a8862a00a0fd65b8b43acc5b8e33f798', + 'info_dict': { + 'id': '5156', + 'ext': 'mp4', + 'title': 'Coordinating More Than 3 Million CUDA Threads for Social Network Analysis', + 'duration': 1219, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + root_path = self._search_regex(r'var\s+rootPath\s*=\s*"([^"]+)', webpage, 'root path', 'http://evt.dispeak.com/nvidia/events/gtc15/') + xml_file_id = self._search_regex(r'var\s+xmlFileId\s*=\s*"([^"]+)', webpage, 'xml file id') + + doc = self._download_xml('%sxml/%s.xml' % (root_path, xml_file_id), video_id) + + metadata = xpath_element(doc, 'metadata') + http_host = xpath_text(metadata, 'httpHost', 'http host', True) + mbr_videos = xpath_element(metadata, 'MBRVideos') + + formats = [] + for mbr_video in mbr_videos.findall('MBRVideo'): + stream_name = xpath_text(mbr_video, 'streamName') + if stream_name: + formats.append({ + 'url': 'http://%s/%s' % (http_host, stream_name.replace('mp4:', '')), + 'tbr': int_or_none(xpath_text(mbr_video, 'bitrate')), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': xpath_text(metadata, 'title'), + 'duration': parse_duration(xpath_text(metadata, 'endTime')), + 'creator': xpath_text(metadata, 'speaker'), + 'formats': formats, + } diff --git a/youtube_dl/extractor/hotstar.py b/youtube_dl/extractor/hotstar.py new file mode 100644 index 000000000..05d27e75d --- /dev/null +++ b/youtube_dl/extractor/hotstar.py @@ -0,0 +1,79 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + determine_ext, + int_or_none, +) + + +class HotStarIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hotstar\.com/.*?[/-](?P<id>\d{10})' + _TEST = { + 'url': 'http://www.hotstar.com/on-air-with-aib--english-1000076273', + 'info_dict': { + 'id': '1000076273', + 'ext': 'mp4', + 'title': 'On Air With AIB - English', + 'description': 'md5:c957d8868e9bc793ccb813691cc4c434', + 'timestamp': 1447227000, + 'upload_date': '20151111', + 'duration': 381, + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + } + + _GET_CONTENT_TEMPLATE = 'http://account.hotstar.com/AVS/besc?action=GetAggregatedContentDetails&channel=PCTV&contentId=%s' + _GET_CDN_TEMPLATE = 'http://getcdn.hotstar.com/AVS/besc?action=GetCDN&asJson=Y&channel=%s&id=%s&type=%s' + + def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', fatal=True): + json_data = super(HotStarIE, self)._download_json(url_or_request, video_id, note, fatal=fatal) + if json_data['resultCode'] != 'OK': + if fatal: + raise ExtractorError(json_data['errorDescription']) + return None + return json_data['resultObj'] + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json( + self._GET_CONTENT_TEMPLATE % video_id, + video_id)['contentInfo'][0] + + formats = [] + # PCTV for extracting f4m manifest + for f in ('TABLET',): + format_data = self._download_json( + self._GET_CDN_TEMPLATE % (f, video_id, 'VOD'), + video_id, 'Downloading %s JSON metadata' % f, fatal=False) + if format_data: + format_url = format_data['src'] + ext = determine_ext(format_url) + if ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats(format_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + elif ext == 'f4m': + # produce broken files + continue + else: + formats.append({ + 'url': format_url, + 'width': int_or_none(format_data.get('width')), + 'height': int_or_none(format_data.get('height')), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_data['episodeTitle'], + 'description': video_data.get('description'), + 'duration': int_or_none(video_data.get('duration')), + 'timestamp': int_or_none(video_data.get('broadcastDate')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index bf2d2041b..a2e18c8a7 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -3,6 +3,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) class IGNIE(InfoExtractor): @@ -11,25 +15,24 @@ class IGNIE(InfoExtractor): Some videos of it.ign.com are also supported """ - _VALID_URL = r'https?://.+?\.ign\.com/(?P<type>videos|show_videos|articles|(?:[^/]*/feature))(/.+)?/(?P<name_or_id>.+)' + _VALID_URL = r'https?://.+?\.ign\.com/(?:[^/]+/)?(?P<type>videos|show_videos|articles|feature|(?:[^/]+/\d+/video))(/.+)?/(?P<name_or_id>.+)' IE_NAME = 'ign.com' - _CONFIG_URL_TEMPLATE = 'http://www.ign.com/videos/configs/id/%s.config' - _DESCRIPTION_RE = [ - r'<span class="page-object-description">(.+?)</span>', - r'id="my_show_video">.*?<p>(.*?)</p>', - r'<meta name="description" content="(.*?)"', - ] + _API_URL_TEMPLATE = 'http://apis.ign.com/video/v3/videos/%s' + _EMBED_RE = r'<iframe[^>]+?["\']((?:https?:)?//.+?\.ign\.com.+?/embed.+?)["\']' _TESTS = [ { 'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', - 'md5': 'eac8bdc1890980122c3b66f14bdd02e9', + 'md5': 'febda82c4bafecd2d44b6e1a18a595f8', 'info_dict': { 'id': '8f862beef863986b2785559b9e1aa599', 'ext': 'mp4', 'title': 'The Last of Us Review', 'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c', + 'timestamp': 1370440800, + 'upload_date': '20130605', + 'uploader_id': 'cberidon@ign.com', } }, { @@ -44,6 +47,9 @@ class IGNIE(InfoExtractor): 'ext': 'mp4', 'title': 'GTA 5 Video Review', 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', + 'timestamp': 1379339880, + 'upload_date': '20130916', + 'uploader_id': 'danieljkrupa@gmail.com', }, }, { @@ -52,6 +58,9 @@ class IGNIE(InfoExtractor): 'ext': 'mp4', 'title': '26 Twisted Moments from GTA 5 in Slow Motion', 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', + 'timestamp': 1386878820, + 'upload_date': '20131212', + 'uploader_id': 'togilvie@ign.com', }, }, ], @@ -66,12 +75,20 @@ class IGNIE(InfoExtractor): 'id': '078fdd005f6d3c02f63d795faa1b984f', 'ext': 'mp4', 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', - 'description': ( - 'Giant skeletons, bloody hunts, and captivating' - ' natural beauty take our breath away.' - ), + 'description': 'Brian and Jared explore Michel Ancel\'s captivating new preview.', + 'timestamp': 1408047180, + 'upload_date': '20140814', + 'uploader_id': 'jamesduggan1990@gmail.com', }, }, + { + 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', + 'only_matching': True, + }, + { + 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', + 'only_matching': True, + }, ] def _find_video_id(self, webpage): @@ -82,7 +99,7 @@ class IGNIE(InfoExtractor): r'<object id="vid_(.+?)"', r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"', ] - return self._search_regex(res_id, webpage, 'video id') + return self._search_regex(res_id, webpage, 'video id', default=None) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -91,7 +108,7 @@ class IGNIE(InfoExtractor): webpage = self._download_webpage(url, name_or_id) if page_type != 'video': multiple_urls = re.findall( - '<param name="flashvars"[^>]*value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]', + r'<param name="flashvars"[^>]*value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]', webpage) if multiple_urls: entries = [self.url_result(u, ie='IGN') for u in multiple_urls] @@ -102,22 +119,50 @@ class IGNIE(InfoExtractor): } video_id = self._find_video_id(webpage) - result = self._get_video_info(video_id) - description = self._html_search_regex(self._DESCRIPTION_RE, - webpage, 'video description', flags=re.DOTALL) - result['description'] = description - return result + if not video_id: + return self.url_result(self._search_regex(self._EMBED_RE, webpage, 'embed url')) + return self._get_video_info(video_id) def _get_video_info(self, video_id): - config_url = self._CONFIG_URL_TEMPLATE % video_id - config = self._download_json(config_url, video_id) - media = config['playlist']['media'] + api_data = self._download_json(self._API_URL_TEMPLATE % video_id, video_id) + + formats = [] + m3u8_url = api_data['refs'].get('m3uUrl') + if m3u8_url: + m3u8_formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + f4m_url = api_data['refs'].get('f4mUrl') + if f4m_url: + f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + for asset in api_data['assets']: + formats.append({ + 'url': asset['url'], + 'tbr': asset.get('actual_bitrate_kbps'), + 'fps': asset.get('frame_rate'), + 'height': int_or_none(asset.get('height')), + 'width': int_or_none(asset.get('width')), + }) + self._sort_formats(formats) + + thumbnails = [{ + 'url': thumbnail['url'] + } for thumbnail in api_data.get('thumbnails', [])] + + metadata = api_data['metadata'] return { - 'id': media['metadata']['videoId'], - 'url': media['url'], - 'title': media['metadata']['title'], - 'thumbnail': media['poster'][0]['url'].replace('{size}', 'grande'), + 'id': api_data.get('videoId') or video_id, + 'title': metadata.get('longTitle') or metadata.get('name') or metadata.get['title'], + 'description': metadata.get('description'), + 'timestamp': parse_iso8601(metadata.get('publishDate')), + 'duration': int_or_none(metadata.get('duration')), + 'display_id': metadata.get('slug') or video_id, + 'uploader_id': metadata.get('creator'), + 'thumbnails': thumbnails, + 'formats': formats, } @@ -125,16 +170,17 @@ class OneUPIE(IGNIE): _VALID_URL = r'https?://gamevideos\.1up\.com/(?P<type>video)/id/(?P<name_or_id>.+)\.html' IE_NAME = '1up.com' - _DESCRIPTION_RE = r'<div id="vid_summary">(.+?)</div>' - _TESTS = [{ 'url': 'http://gamevideos.1up.com/video/id/34976.html', - 'md5': '68a54ce4ebc772e4b71e3123d413163d', + 'md5': 'c9cc69e07acb675c31a16719f909e347', 'info_dict': { 'id': '34976', 'ext': 'mp4', 'title': 'Sniper Elite V2 - Trailer', - 'description': 'md5:5d289b722f5a6d940ca3136e9dae89cf', + 'description': 'md5:bf0516c5ee32a3217aa703e9b1bc7826', + 'timestamp': 1313099220, + 'upload_date': '20110811', + 'uploader_id': 'IGN', } }] @@ -143,3 +189,36 @@ class OneUPIE(IGNIE): result = super(OneUPIE, self)._real_extract(url) result['id'] = mobj.group('name_or_id') return result + + +class PCMagIE(IGNIE): + _VALID_URL = r'https?://(?:www\.)?pcmag\.com/(?P<type>videos|article2)(/.+)?/(?P<name_or_id>.+)' + IE_NAME = 'pcmag' + + _EMBED_RE = r'iframe.setAttribute\("src",\s*__util.objToUrlString\("http://widgets\.ign\.com/video/embed/content.html?[^"]*url=([^"]+)["&]' + + _TESTS = [{ + 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data', + 'md5': '212d6154fd0361a2781075f1febbe9ad', + 'info_dict': { + 'id': 'ee10d774b508c9b8ec07e763b9125b91', + 'ext': 'mp4', + 'title': '010615_What\'s New Now: Is GoGo Snooping on Your Data?', + 'description': 'md5:a7071ae64d2f68cc821c729d4ded6bb3', + 'timestamp': 1420571160, + 'upload_date': '20150106', + 'uploader_id': 'cozzipix@gmail.com', + } + }, { + 'url': 'http://www.pcmag.com/article2/0,2817,2470156,00.asp', + 'md5': '94130c1ca07ba0adb6088350681f16c1', + 'info_dict': { + 'id': '042e560ba94823d43afcb12ddf7142ca', + 'ext': 'mp4', + 'title': 'HTC\'s Weird New Re Camera - What\'s New Now', + 'description': 'md5:53433c45df96d2ea5d0fda18be2ca908', + 'timestamp': 1412953920, + 'upload_date': '20141010', + 'uploader_id': 'chris_snyder@pcmag.com', + } + }] diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index 70c8ca64e..85e9344aa 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -13,7 +13,7 @@ from ..utils import ( class ImgurIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!gallery)(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|topic/[^/]+)/)?(?P<id>[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z]+)?$' _TESTS = [{ 'url': 'https://i.imgur.com/A61SaA1.gifv', @@ -21,7 +21,7 @@ class ImgurIE(InfoExtractor): 'id': 'A61SaA1', 'ext': 'mp4', 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 're:The origin of the Internet\'s most viral images$|The Internet\'s visual storytelling community\. Explore, share, and discuss the best visual stories the Internet has to offer\.$', + 'description': 'Imgur: The most awesome images on the Internet.', }, }, { 'url': 'https://imgur.com/A61SaA1', @@ -29,8 +29,20 @@ class ImgurIE(InfoExtractor): 'id': 'A61SaA1', 'ext': 'mp4', 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 're:The origin of the Internet\'s most viral images$|The Internet\'s visual storytelling community\. Explore, share, and discuss the best visual stories the Internet has to offer\.$', + 'description': 'Imgur: The most awesome images on the Internet.', }, + }, { + 'url': 'https://imgur.com/gallery/YcAQlkx', + 'info_dict': { + 'id': 'YcAQlkx', + 'ext': 'mp4', + 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', + 'description': 'Imgur: The most awesome images on the Internet.' + + } + }, { + 'url': 'http://imgur.com/topic/Funny/N8rOudd', + 'only_matching': True, }] def _real_extract(self, url): @@ -100,25 +112,38 @@ class ImgurIE(InfoExtractor): class ImgurAlbumIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/gallery/(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:a|gallery|topic/[^/]+)/)?(?P<id>[a-zA-Z0-9]{5})(?:[/?#&]+)?$' - _TEST = { + _TESTS = [{ 'url': 'http://imgur.com/gallery/Q95ko', 'info_dict': { 'id': 'Q95ko', }, 'playlist_count': 25, - } + }, { + 'url': 'http://imgur.com/a/j6Orj', + 'only_matching': True, + }, { + 'url': 'http://imgur.com/topic/Aww/ll5Vk', + 'only_matching': True, + }] def _real_extract(self, url): album_id = self._match_id(url) album_images = self._download_json( 'http://imgur.com/gallery/%s/album_images/hit.json?all=true' % album_id, - album_id)['data']['images'] - - entries = [ - self.url_result('http://imgur.com/%s' % image['hash']) - for image in album_images if image.get('hash')] - - return self.playlist_result(entries, album_id) + album_id, fatal=False) + + if album_images: + data = album_images.get('data') + if data and isinstance(data, dict): + images = data.get('images') + if images and isinstance(images, list): + entries = [ + self.url_result('http://imgur.com/%s' % image['hash']) + for image in images if image.get('hash')] + return self.playlist_result(entries, album_id) + + # Fallback to single video + return self.url_result('http://imgur.com/%s' % album_id, ImgurIE.ie_key()) diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index 71cfd12c5..016af2084 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -1,3 +1,5 @@ +# coding: utf-8 + from __future__ import unicode_literals import base64 @@ -5,8 +7,9 @@ import base64 from .common import InfoExtractor from ..compat import ( compat_urllib_parse_unquote, - compat_urlparse, + compat_parse_qs, ) +from ..utils import determine_ext class InfoQIE(InfoExtractor): @@ -16,7 +19,7 @@ class InfoQIE(InfoExtractor): 'url': 'http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things', 'md5': 'b5ca0e0a8c1fed93b0e65e48e462f9a2', 'info_dict': { - 'id': '12-jan-pythonthings', + 'id': 'A-Few-of-My-Favorite-Python-Things', 'ext': 'mp4', 'description': 'Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.', 'title': 'A Few of My Favorite [Python] Things', @@ -24,40 +27,84 @@ class InfoQIE(InfoExtractor): }, { 'url': 'http://www.infoq.com/fr/presentations/changez-avis-sur-javascript', 'only_matching': True, + }, { + 'url': 'http://www.infoq.com/cn/presentations/openstack-continued-delivery', + 'md5': '4918d0cca1497f2244572caf626687ef', + 'info_dict': { + 'id': 'openstack-continued-delivery', + 'title': 'OpenStack持续交付之路', + 'ext': 'flv', + 'description': 'md5:308d981fb28fa42f49f9568322c683ff', + }, }] - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + def _extract_bokecc_videos(self, webpage, video_id): + # TODO: bokecc.com is a Chinese video cloud platform + # It should have an independent extractor but I don't have other + # examples using bokecc + player_params_str = self._html_search_regex( + r'<script[^>]+src="http://p\.bokecc\.com/player\?([^"]+)', + webpage, 'player params', default=None) - video_title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title') - video_description = self._html_search_meta('description', webpage, 'description') + player_params = compat_parse_qs(player_params_str) + + info_xml = self._download_xml( + 'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % ( + player_params['siteid'][0], player_params['vid'][0]), video_id) + + return [{ + 'format_id': 'bokecc', + 'url': quality.find('./copy').attrib['playurl'], + 'preference': int(quality.attrib['value']), + } for quality in info_xml.findall('./video/quality')] + def _extract_rtmp_videos(self, webpage): # The server URL is hardcoded video_url = 'rtmpe://video.infoq.com/cfx/st/' # Extract video URL encoded_id = self._search_regex( - r"jsclassref\s*=\s*'([^']*)'", webpage, 'encoded id') + r"jsclassref\s*=\s*'([^']*)'", webpage, 'encoded id', default=None) + real_id = compat_urllib_parse_unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8')) playpath = 'mp4:' + real_id - video_filename = playpath.split('/')[-1] - video_id, extension = video_filename.split('.') - - http_base = self._search_regex( - r'EXPRESSINSTALL_SWF\s*=\s*[^"]*"((?:https?:)?//[^/"]+/)', webpage, - 'HTTP base URL') - - formats = [{ + return [{ 'format_id': 'rtmp', 'url': video_url, - 'ext': extension, + 'ext': determine_ext(playpath), 'play_path': playpath, - }, { + }] + + def _extract_http_videos(self, webpage): + http_video_url = self._search_regex(r'P\.s\s*=\s*\'([^\']+)\'', webpage, 'video URL') + + policy = self._search_regex(r'InfoQConstants.scp\s*=\s*\'([^\']+)\'', webpage, 'policy') + signature = self._search_regex(r'InfoQConstants.scs\s*=\s*\'([^\']+)\'', webpage, 'signature') + key_pair_id = self._search_regex(r'InfoQConstants.sck\s*=\s*\'([^\']+)\'', webpage, 'key-pair-id') + + return [{ 'format_id': 'http', - 'url': compat_urlparse.urljoin(url, http_base) + real_id, + 'url': http_video_url, + 'http_headers': { + 'Cookie': 'CloudFront-Policy=%s; CloudFront-Signature=%s; CloudFront-Key-Pair-Id=%s' % ( + policy, signature, key_pair_id), + }, }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title') + video_description = self._html_search_meta('description', webpage, 'description') + + if '/cn/' in url: + # for China videos, HTTP video URL exists but always fails with 403 + formats = self._extract_bokecc_videos(webpage, video_id) + else: + formats = self._extract_rtmp_videos(webpage) + self._extract_http_videos(webpage) + self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index c158f2064..e5e16ca3b 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -47,7 +47,7 @@ class InstagramIE(InfoExtractor): class InstagramUserIE(InfoExtractor): - _VALID_URL = r'https://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])' IE_DESC = 'Instagram user profile' IE_NAME = 'instagram:user' _TEST = { diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 2df1da3f0..66a70a181 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -16,7 +16,7 @@ class IqiyiIE(InfoExtractor): IE_NAME = 'iqiyi' IE_DESC = '爱奇艺' - _VALID_URL = r'http://(?:www\.)iqiyi.com/v_.+?\.html' + _VALID_URL = r'http://(?:[^.]+\.)?iqiyi\.com/.+\.html' _TESTS = [{ 'url': 'http://www.iqiyi.com/v_19rrojlavg.html', @@ -84,6 +84,15 @@ class IqiyiIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'http://www.iqiyi.com/w_19rt6o8t9p.html', + 'only_matching': True, + }, { + 'url': 'http://www.iqiyi.com/a_19rrhbc6kt.html', + 'only_matching': True, + }, { + 'url': 'http://yule.iqiyi.com/pcb.html', + 'only_matching': True, }] _FORMATS_MAP = [ @@ -205,9 +214,8 @@ class IqiyiIE(InfoExtractor): def get_enc_key(self, swf_url, video_id): # TODO: automatic key extraction - # last update at 2015-10-22 for Zombie::bite - # '7223c67061dbea1259d0ceb44f44b6d62288f4f80c972170de5201d2321060270e05'[2:66][0::2] - enc_key = '2c76de15dcb44bd28ff0927d50d31620' + # last update at 2015-12-18 for Zombie::bite + enc_key = '8b6b683780897eb8d9a48a02ccc4817d'[::-1] return enc_key def _real_extract(self, url): diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py new file mode 100644 index 000000000..a92adf2b3 --- /dev/null +++ b/youtube_dl/extractor/jwplatform.py @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class JWPlatformIE(InfoExtractor): + _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' + _TEST = { + 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', + 'md5': 'fa8899fa601eb7c83a64e9d568bdf325', + 'info_dict': { + 'id': 'nPripu9l', + 'ext': 'mov', + 'title': 'Big Buck Bunny Trailer', + 'description': 'Big Buck Bunny is a short animated film by the Blender Institute. It is made using free and open source software.', + 'upload_date': '20081127', + 'timestamp': 1227796140, + } + } + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<script[^>]+?src=["\'](?P<url>(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8})', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id) + video_data = json_data['playlist'][0] + subtitles = {} + for track in video_data['tracks']: + if track['kind'] == 'captions': + subtitles[track['label']] = [{'url': self._proto_relative_url(track['file'])}] + + formats = [] + for source in video_data['sources']: + source_url = self._proto_relative_url(source['file']) + source_type = source.get('type') or '' + if source_type == 'application/vnd.apple.mpegurl': + m3u8_formats = self._extract_m3u8_formats( + source_url, video_id, 'mp4', 'm3u8_native', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + elif source_type.startswith('audio'): + formats.append({ + 'url': source_url, + 'vcodec': 'none', + }) + else: + formats.append({ + 'url': source_url, + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_data['title'], + 'description': video_data.get('description'), + 'thumbnail': self._proto_relative_url(video_data.get('image')), + 'timestamp': int_or_none(video_data.get('pubdate')), + 'subtitles': subtitles, + 'formats': formats, + } diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 583b1a5ad..4807c8110 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -45,7 +45,7 @@ class KalturaIE(InfoExtractor): 'info_dict': { 'id': '1_1jc2y3e4', 'ext': 'mp4', - 'title': 'Track 4', + 'title': 'Straight from the Heart', 'upload_date': '20131219', 'uploader_id': 'mlundberg@wolfgangsvault.com', 'description': 'The Allman Brothers Band, 12/16/1981', @@ -115,12 +115,9 @@ class KalturaIE(InfoExtractor): 'version': '-1', }, { - 'action': 'getContextData', - 'contextDataParams:objectType': 'KalturaEntryContextDataParams', - 'contextDataParams:referrer': 'http://www.kaltura.com/', - 'contextDataParams:streamerType': 'http', + 'action': 'getbyentryid', 'entryId': video_id, - 'service': 'baseentry', + 'service': 'flavorAsset', }, ] return self._kaltura_api_call( @@ -133,7 +130,7 @@ class KalturaIE(InfoExtractor): partner_id = mobj.group('partner_id_s') or mobj.group('partner_id') or mobj.group('partner_id_html5') entry_id = mobj.group('id_s') or mobj.group('id') or mobj.group('id_html5') - info, source_data = self._get_video_info(entry_id, partner_id) + info, flavor_assets = self._get_video_info(entry_id, partner_id) source_url = smuggled_data.get('source_url') if source_url: @@ -144,7 +141,10 @@ class KalturaIE(InfoExtractor): referrer = None formats = [] - for f in source_data['flavorAssets']: + for f in flavor_assets: + # Continue if asset is not ready + if f['status'] != 2: + continue video_url = '%s/flavorId/%s' % (info['dataUrl'], f['id']) if referrer: video_url += '?referrer=%s' % referrer @@ -160,6 +160,14 @@ class KalturaIE(InfoExtractor): 'width': int_or_none(f.get('width')), 'url': video_url, }) + m3u8_url = info['dataUrl'].replace('format/url', 'format/applehttp') + if referrer: + m3u8_url += '?referrer=%s' % referrer + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, entry_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + self._check_formats(formats, entry_id) self._sort_formats(formats) diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 6d7733e41..688eb2308 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -1,27 +1,29 @@ from __future__ import unicode_literals import re -import json import itertools from .common import InfoExtractor from ..compat import ( compat_str, - compat_urllib_parse_urlparse, compat_urlparse, ) from ..utils import ( - ExtractorError, find_xpath_attr, - int_or_none, - orderedSet, + xpath_attr, xpath_with_ns, + xpath_text, + orderedSet, + int_or_none, + float_or_none, + parse_iso8601, + determine_ext, ) class LivestreamIE(InfoExtractor): IE_NAME = 'livestream' - _VALID_URL = r'https?://(?:new\.)?livestream\.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>[0-9]+)(?:/player)?)?/?(?:$|[?#])' + _VALID_URL = r'https?://(?:new\.)?livestream\.com/(?:accounts/(?P<account_id>\d+)|(?P<account_name>[^/]+))/(?:events/(?P<event_id>\d+)|(?P<event_name>[^/]+))(?:/videos/(?P<id>\d+))?' _TESTS = [{ 'url': 'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370', 'md5': '53274c76ba7754fb0e8d072716f2292b', @@ -29,7 +31,9 @@ class LivestreamIE(InfoExtractor): 'id': '4719370', 'ext': 'mp4', 'title': 'Live from Webster Hall NYC', + 'timestamp': 1350008072, 'upload_date': '20121012', + 'duration': 5968.0, 'like_count': int, 'view_count': int, 'thumbnail': 're:^http://.*\.jpg$' @@ -55,39 +59,20 @@ class LivestreamIE(InfoExtractor): 'url': 'http://livestream.com/bsww/concacafbeachsoccercampeonato2015', 'only_matching': True, }] + _API_URL_TEMPLATE = 'http://livestream.com/api/accounts/%s/events/%s' + + def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): + base_ele = find_xpath_attr( + smil, self._xpath_ns('.//meta', namespace), 'name', 'httpBase') + base = base_ele.get('content') if base_ele else 'http://livestreamvod-f.akamaihd.net/' - def _parse_smil(self, video_id, smil_url): formats = [] - _SWITCH_XPATH = ( - './/{http://www.w3.org/2001/SMIL20/Language}body/' - '{http://www.w3.org/2001/SMIL20/Language}switch') - smil_doc = self._download_xml( - smil_url, video_id, - note='Downloading SMIL information', - errnote='Unable to download SMIL information', - fatal=False) - if smil_doc is False: # Download failed - return formats - title_node = find_xpath_attr( - smil_doc, './/{http://www.w3.org/2001/SMIL20/Language}meta', - 'name', 'title') - if title_node is None: - self.report_warning('Cannot find SMIL id') - switch_node = smil_doc.find(_SWITCH_XPATH) - else: - title_id = title_node.attrib['content'] - switch_node = find_xpath_attr( - smil_doc, _SWITCH_XPATH, 'id', title_id) - if switch_node is None: - raise ExtractorError('Cannot find switch node') - video_nodes = switch_node.findall( - '{http://www.w3.org/2001/SMIL20/Language}video') + video_nodes = smil.findall(self._xpath_ns('.//video', namespace)) for vn in video_nodes: - tbr = int_or_none(vn.attrib.get('system-bitrate')) + tbr = int_or_none(vn.attrib.get('system-bitrate'), 1000) furl = ( - 'http://livestream-f.akamaihd.net/%s?v=3.0.3&fp=WIN%%2014,0,0,145' % - (vn.attrib['src'])) + '%s%s?v=3.0.3&fp=WIN%%2014,0,0,145' % (base, vn.attrib['src'])) if 'clipBegin' in vn.attrib: furl += '&ssek=' + vn.attrib['clipBegin'] formats.append({ @@ -106,97 +91,151 @@ class LivestreamIE(InfoExtractor): ('sd', 'progressive_url'), ('hd', 'progressive_url_hd'), ) - formats = [{ - 'format_id': format_id, - 'url': video_data[key], - 'quality': i + 1, - } for i, (format_id, key) in enumerate(FORMAT_KEYS) - if video_data.get(key)] + + formats = [] + for format_id, key in FORMAT_KEYS: + video_url = video_data.get(key) + if video_url: + ext = determine_ext(video_url) + if ext == 'm3u8': + continue + bitrate = int_or_none(self._search_regex( + r'(\d+)\.%s' % ext, video_url, 'bitrate', default=None)) + formats.append({ + 'url': video_url, + 'format_id': format_id, + 'tbr': bitrate, + 'ext': ext, + }) smil_url = video_data.get('smil_url') if smil_url: - formats.extend(self._parse_smil(video_id, smil_url)) + smil_formats = self._extract_smil_formats(smil_url, video_id) + if smil_formats: + formats.extend(smil_formats) + + m3u8_url = video_data.get('m3u8_url') + if m3u8_url: + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + + f4m_url = video_data.get('f4m_url') + if f4m_url: + f4m_formats = self._extract_f4m_formats( + f4m_url, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) self._sort_formats(formats) + comments = [{ + 'author_id': comment.get('author_id'), + 'author': comment.get('author', {}).get('full_name'), + 'id': comment.get('id'), + 'text': comment['text'], + 'timestamp': parse_iso8601(comment.get('created_at')), + } for comment in video_data.get('comments', {}).get('data', [])] + return { 'id': video_id, 'formats': formats, 'title': video_data['caption'], + 'description': video_data.get('description'), 'thumbnail': video_data.get('thumbnail_url'), - 'upload_date': video_data['updated_at'].replace('-', '')[:8], + 'duration': float_or_none(video_data.get('duration'), 1000), + 'timestamp': parse_iso8601(video_data.get('publish_at')), 'like_count': video_data.get('likes', {}).get('total'), + 'comment_count': video_data.get('comments', {}).get('total'), 'view_count': video_data.get('views'), + 'comments': comments, } - def _extract_event(self, info): - event_id = compat_str(info['id']) - account = compat_str(info['owner_account_id']) - root_url = ( - 'https://new.livestream.com/api/accounts/{account}/events/{event}/' - 'feed.json'.format(account=account, event=event_id)) - - def _extract_videos(): - last_video = None - for i in itertools.count(1): - if last_video is None: - info_url = root_url - else: - info_url = '{root}?&id={id}&newer=-1&type=video'.format( - root=root_url, id=last_video) - videos_info = self._download_json(info_url, event_id, 'Downloading page {0}'.format(i))['data'] - videos_info = [v['data'] for v in videos_info if v['type'] == 'video'] - if not videos_info: - break - for v in videos_info: - yield self._extract_video_info(v) - last_video = videos_info[-1]['id'] - return self.playlist_result(_extract_videos(), event_id, info['full_name']) + def _extract_stream_info(self, stream_info): + broadcast_id = stream_info['broadcast_id'] + is_live = stream_info.get('is_live') + + formats = [] + smil_url = stream_info.get('play_url') + if smil_url: + smil_formats = self._extract_smil_formats(smil_url, broadcast_id) + if smil_formats: + formats.extend(smil_formats) + + entry_protocol = 'm3u8' if is_live else 'm3u8_native' + m3u8_url = stream_info.get('m3u8_url') + if m3u8_url: + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, broadcast_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + + rtsp_url = stream_info.get('rtsp_url') + if rtsp_url: + formats.append({ + 'url': rtsp_url, + 'format_id': 'rtsp', + }) + self._sort_formats(formats) + + return { + 'id': broadcast_id, + 'formats': formats, + 'title': self._live_title(stream_info['stream_title']) if is_live else stream_info['stream_title'], + 'thumbnail': stream_info.get('thumbnail_url'), + 'is_live': is_live, + } + + def _extract_event(self, event_data): + event_id = compat_str(event_data['id']) + account_id = compat_str(event_data['owner_account_id']) + feed_root_url = self._API_URL_TEMPLATE % (account_id, event_id) + '/feed.json' + + stream_info = event_data.get('stream_info') + if stream_info: + return self._extract_stream_info(stream_info) + + last_video = None + entries = [] + for i in itertools.count(1): + if last_video is None: + info_url = feed_root_url + else: + info_url = '{root}?&id={id}&newer=-1&type=video'.format( + root=feed_root_url, id=last_video) + videos_info = self._download_json( + info_url, event_id, 'Downloading page {0}'.format(i))['data'] + videos_info = [v['data'] for v in videos_info if v['type'] == 'video'] + if not videos_info: + break + for v in videos_info: + entries.append(self.url_result( + 'http://livestream.com/accounts/%s/events/%s/videos/%s' % (account_id, event_id, v['id']), + 'Livestream', v['id'], v['caption'])) + last_video = videos_info[-1]['id'] + return self.playlist_result(entries, event_id, event_data['full_name']) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - event_name = mobj.group('event_name') - webpage = self._download_webpage(url, video_id or event_name) - - og_video = self._og_search_video_url( - webpage, 'player url', fatal=False, default=None) - if og_video is not None: - query_str = compat_urllib_parse_urlparse(og_video).query - query = compat_urlparse.parse_qs(query_str) - if 'play_url' in query: - api_url = query['play_url'][0].replace('.smil', '') - info = json.loads(self._download_webpage( - api_url, video_id, 'Downloading video info')) - return self._extract_video_info(info) - - config_json = self._search_regex( - r'window.config = ({.*?});', webpage, 'window config') - info = json.loads(config_json)['event'] - - def is_relevant(vdata, vid): - result = vdata['type'] == 'video' - if video_id is not None: - result = result and compat_str(vdata['data']['id']) == vid - return result - - if video_id is None: - # This is an event page: - return self._extract_event(info) + event = mobj.group('event_id') or mobj.group('event_name') + account = mobj.group('account_id') or mobj.group('account_name') + api_url = self._API_URL_TEMPLATE % (account, event) + if video_id: + video_data = self._download_json( + api_url + '/videos/%s' % video_id, video_id) + return self._extract_video_info(video_data) else: - videos = [self._extract_video_info(video_data['data']) - for video_data in info['feed']['data'] - if is_relevant(video_data, video_id)] - if not videos: - raise ExtractorError('Cannot find video %s' % video_id) - return videos[0] + event_data = self._download_json(api_url, video_id) + return self._extract_event(event_data) # The original version of Livestream uses a different system class LivestreamOriginalIE(InfoExtractor): IE_NAME = 'livestream:original' _VALID_URL = r'''(?x)https?://original\.livestream\.com/ - (?P<user>[^/]+)/(?P<type>video|folder) - (?:\?.*?Id=|/)(?P<id>.*?)(&|$) + (?P<user>[^/\?#]+)(?:/(?P<type>video|folder) + (?:(?:\?.*?Id=|/)(?P<id>.*?)(&|$))?)? ''' _TESTS = [{ 'url': 'http://original.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb', @@ -204,6 +243,8 @@ class LivestreamOriginalIE(InfoExtractor): 'id': 'pla_8aa4a3f1-ba15-46a4-893b-902210e138fb', 'ext': 'mp4', 'title': 'Spark 1 (BitCoin) with Cameron Winklevoss & Tyler Winklevoss of Winklevoss Capital', + 'duration': 771.301, + 'view_count': int, }, }, { 'url': 'https://original.livestream.com/newplay/folder?dirId=a07bf706-d0e4-4e75-a747-b021d84f2fd3', @@ -211,26 +252,62 @@ class LivestreamOriginalIE(InfoExtractor): 'id': 'a07bf706-d0e4-4e75-a747-b021d84f2fd3', }, 'playlist_mincount': 4, + }, { + # live stream + 'url': 'http://original.livestream.com/znsbahamas', + 'only_matching': True, }] - def _extract_video(self, user, video_id): - api_url = 'http://x{0}x.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id={1}'.format(user, video_id) - + def _extract_video_info(self, user, video_id): + api_url = 'http://x%sx.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id=%s' % (user, video_id) info = self._download_xml(api_url, video_id) - # this url is used on mobile devices - stream_url = 'http://x{0}x.api.channel.livestream.com/3.0/getstream.json?id={1}'.format(user, video_id) - stream_info = self._download_json(stream_url, video_id) + item = info.find('channel').find('item') - ns = {'media': 'http://search.yahoo.com/mrss'} - thumbnail_url = item.find(xpath_with_ns('media:thumbnail', ns)).attrib['url'] + title = xpath_text(item, 'title') + media_ns = {'media': 'http://search.yahoo.com/mrss'} + thumbnail_url = xpath_attr( + item, xpath_with_ns('media:thumbnail', media_ns), 'url') + duration = float_or_none(xpath_attr( + item, xpath_with_ns('media:content', media_ns), 'duration')) + ls_ns = {'ls': 'http://api.channel.livestream.com/2.0'} + view_count = int_or_none(xpath_text( + item, xpath_with_ns('ls:viewsCount', ls_ns))) return { 'id': video_id, - 'title': item.find('title').text, - 'url': stream_info['progressiveUrl'], + 'title': title, 'thumbnail': thumbnail_url, + 'duration': duration, + 'view_count': view_count, } + def _extract_video_formats(self, video_data, video_id, entry_protocol): + formats = [] + + progressive_url = video_data.get('progressiveUrl') + if progressive_url: + formats.append({ + 'url': progressive_url, + 'format_id': 'http', + }) + + m3u8_url = video_data.get('httpUrl') + if m3u8_url: + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + + rtsp_url = video_data.get('rtspUrl') + if rtsp_url: + formats.append({ + 'url': rtsp_url, + 'format_id': 'rtsp', + }) + + self._sort_formats(formats) + return formats + def _extract_folder(self, url, folder_id): webpage = self._download_webpage(url, folder_id) paths = orderedSet(re.findall( @@ -239,24 +316,45 @@ class LivestreamOriginalIE(InfoExtractor): <a\s+href="(?=https?://livestre\.am/) )([^"]+)"''', webpage)) - return { - '_type': 'playlist', - 'id': folder_id, - 'entries': [{ - '_type': 'url', - 'url': compat_urlparse.urljoin(url, p), - } for p in paths], - } + entries = [{ + '_type': 'url', + 'url': compat_urlparse.urljoin(url, p), + } for p in paths] + + return self.playlist_result(entries, folder_id) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - id = mobj.group('id') user = mobj.group('user') url_type = mobj.group('type') + content_id = mobj.group('id') if url_type == 'folder': - return self._extract_folder(url, id) + return self._extract_folder(url, content_id) else: - return self._extract_video(user, id) + # this url is used on mobile devices + stream_url = 'http://x%sx.api.channel.livestream.com/3.0/getstream.json' % user + info = {} + if content_id: + stream_url += '?id=%s' % content_id + info = self._extract_video_info(user, content_id) + else: + content_id = user + webpage = self._download_webpage(url, content_id) + info = { + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': self._search_regex(r'channelLogo.src\s*=\s*"([^"]+)"', webpage, 'thumbnail', None), + } + video_data = self._download_json(stream_url, content_id) + is_live = video_data.get('isLive') + entry_protocol = 'm3u8' if is_live else 'm3u8_native' + info.update({ + 'id': content_id, + 'title': self._live_title(info['title']) if is_live else info['title'], + 'formats': self._extract_video_formats(video_data, content_id, entry_protocol), + 'is_live': is_live, + }) + return info # The server doesn't support HEAD request, the generic extractor can't detect diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py index e3236f7b5..863efd896 100644 --- a/youtube_dl/extractor/lrt.py +++ b/youtube_dl/extractor/lrt.py @@ -1,12 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( - determine_ext, - js_to_json, + int_or_none, parse_duration, remove_end, ) @@ -23,9 +20,11 @@ class LRTIE(InfoExtractor): 'title': 'Septynios Kauno dienos', 'description': 'md5:24d84534c7dc76581e59f5689462411a', 'duration': 1783, + 'view_count': int, + 'like_count': int, }, 'params': { - 'skip_download': True, # HLS download + 'skip_download': True, # m3u8 download }, } @@ -34,29 +33,23 @@ class LRTIE(InfoExtractor): webpage = self._download_webpage(url, video_id) title = remove_end(self._og_search_title(webpage), ' - LRT') + m3u8_url = self._search_regex( + r'file\s*:\s*(["\'])(?P<url>.+?)\1\s*\+\s*location\.hash\.substring\(1\)', + webpage, 'm3u8 url', group='url') + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') + thumbnail = self._og_search_thumbnail(webpage) description = self._og_search_description(webpage) duration = parse_duration(self._search_regex( - r"'duration':\s*'([^']+)',", webpage, - 'duration', fatal=False, default=None)) + r'var\s+record_len\s*=\s*(["\'])(?P<duration>[0-9]+:[0-9]+:[0-9]+)\1', + webpage, 'duration', default=None, group='duration')) - formats = [] - for js in re.findall(r'(?s)config:\s*(\{.*?\})', webpage): - data = self._parse_json(js, video_id, transform_source=js_to_json) - if 'provider' not in data: - continue - if data['provider'] == 'rtmp': - formats.append({ - 'format_id': 'rtmp', - 'ext': determine_ext(data['file']), - 'url': data['streamer'], - 'play_path': 'mp4:%s' % data['file'], - 'preference': -1, - 'rtmp_real_time': True, - }) - else: - formats.extend( - self._extract_m3u8_formats(data['file'], video_id, 'mp4')) + view_count = int_or_none(self._html_search_regex( + r'<div[^>]+class=(["\']).*?record-desc-seen.*?\1[^>]*>(?P<count>.+?)</div>', + webpage, 'view count', fatal=False, group='count')) + like_count = int_or_none(self._search_regex( + r'<span[^>]+id=(["\'])flikesCount.*?\1>(?P<count>\d+)<', + webpage, 'like count', fatal=False, group='count')) return { 'id': video_id, @@ -65,4 +58,6 @@ class LRTIE(InfoExtractor): 'thumbnail': thumbnail, 'description': description, 'duration': duration, + 'view_count': view_count, + 'like_count': like_count, } diff --git a/youtube_dl/extractor/makertv.py b/youtube_dl/extractor/makertv.py new file mode 100644 index 000000000..3c34d4604 --- /dev/null +++ b/youtube_dl/extractor/makertv.py @@ -0,0 +1,32 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class MakerTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www\.)?maker\.tv/(?:[^/]+/)*video|makerplayer.com/embed/maker)/(?P<id>[a-zA-Z0-9]{12})' + _TEST = { + 'url': 'http://www.maker.tv/video/Fh3QgymL9gsc', + 'md5': 'ca237a53a8eb20b6dc5bd60564d4ab3e', + 'info_dict': { + 'id': 'Fh3QgymL9gsc', + 'ext': 'mp4', + 'title': 'Maze Runner: The Scorch Trials Official Movie Review', + 'description': 'md5:11ff3362d7ef1d679fdb649f6413975a', + 'upload_date': '20150918', + 'timestamp': 1442549540, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + jwplatform_id = self._search_regex(r'jw_?id="([^"]+)"', webpage, 'jwplatform id') + + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': 'jwplatform:%s' % jwplatform_id, + 'ie_key': 'JWPlatform', + } diff --git a/youtube_dl/extractor/movshare.py b/youtube_dl/extractor/movshare.py deleted file mode 100644 index 6101063f2..000000000 --- a/youtube_dl/extractor/movshare.py +++ /dev/null @@ -1,27 +0,0 @@ -from __future__ import unicode_literals - -from .novamov import NovaMovIE - - -class MovShareIE(NovaMovIE): - IE_NAME = 'movshare' - IE_DESC = 'MovShare' - - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'movshare\.(?:net|sx|ag)'} - - _HOST = 'www.movshare.net' - - _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' - _TITLE_REGEX = r'<strong>Title:</strong> ([^<]+)</p>' - _DESCRIPTION_REGEX = r'<strong>Description:</strong> ([^<]+)</p>' - - _TEST = { - 'url': 'http://www.movshare.net/video/559e28be54d96', - 'md5': 'abd31a2132947262c50429e1d16c1bfd', - 'info_dict': { - 'id': '559e28be54d96', - 'ext': 'flv', - 'title': 'dissapeared image', - 'description': 'optical illusion dissapeared image magic illusion', - } - } diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index e683d24c4..340c922bd 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -3,14 +3,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_HTTPError, -) +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, find_xpath_attr, lowercase_escape, + smuggle_url, unescapeHTML, ) @@ -62,12 +60,13 @@ class NBCIE(InfoExtractor): theplatform_url = unescapeHTML(lowercase_escape(self._html_search_regex( [ r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"', + r'<iframe[^>]+src="((?:https?:)?//player\.theplatform\.com/[^"]+)"', r'"embedURL"\s*:\s*"([^"]+)"' ], webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/'))) if theplatform_url.startswith('//'): theplatform_url = 'http:' + theplatform_url - return self.url_result(theplatform_url) + return self.url_result(smuggle_url(theplatform_url, {'source_url': url})) class NBCSportsVPlayerIE(InfoExtractor): @@ -187,7 +186,7 @@ class NBCNewsIE(InfoExtractor): 'title': info.find('headline').text, 'ext': 'flv', 'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text, - 'description': compat_str(info.find('caption').text), + 'description': info.find('caption').text, 'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text, } else: diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 16213eed9..894c51399 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -88,10 +88,10 @@ class NDRIE(NDRBaseIE): 'embedURL', webpage, 'embed URL', fatal=True) description = self._search_regex( r'<p[^>]+itemprop="description">([^<]+)</p>', - webpage, 'description', fatal=False) + webpage, 'description', default=None) or self._og_search_description(webpage) timestamp = parse_iso8601( self._search_regex( - r'<span itemprop="datePublished" content="([^"]+)">', + r'<span[^>]+itemprop="(?:datePublished|uploadDate)"[^>]+content="([^"]+)"', webpage, 'upload date', fatal=False)) return { '_type': 'url_transparent', diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index 76bd21e6d..d440313d5 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -9,6 +9,7 @@ from .common import InfoExtractor from ..compat import ( compat_str, compat_urllib_parse, + compat_urlparse, ) from ..utils import ( clean_html, @@ -82,14 +83,21 @@ class NocoIE(InfoExtractor): if 'erreur' in login: raise ExtractorError('Unable to login: %s' % clean_html(login['erreur']), expected=True) + @staticmethod + def _ts(): + return int(time.time() * 1000) + def _call_api(self, path, video_id, note, sub_lang=None): - ts = compat_str(int(time.time() * 1000)) + ts = compat_str(self._ts() + self._ts_offset) tk = hashlib.md5((hashlib.md5(ts.encode('ascii')).hexdigest() + '#8S?uCraTedap6a').encode('ascii')).hexdigest() url = self._API_URL_TEMPLATE % (path, ts, tk) if sub_lang: url += self._SUB_LANG_TEMPLATE % sub_lang - resp = self._download_json(url, video_id, note) + request = sanitized_Request(url) + request.add_header('Referer', self._referer) + + resp = self._download_json(request, video_id, note) if isinstance(resp, dict) and resp.get('error'): self._raise_error(resp['error'], resp['description']) @@ -102,8 +110,22 @@ class NocoIE(InfoExtractor): expected=True) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) + + # Timestamp adjustment offset between server time and local time + # must be calculated in order to use timestamps closest to server's + # in all API requests (see https://github.com/rg3/youtube-dl/issues/7864) + webpage = self._download_webpage(url, video_id) + + player_url = self._search_regex( + r'(["\'])(?P<player>https?://noco\.tv/(?:[^/]+/)+NocoPlayer.+?\.swf.*?)\1', + webpage, 'noco player', group='player', + default='http://noco.tv/cdata/js/player/NocoPlayer-v1.2.40.swf') + + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(player_url).query) + ts = int_or_none(qs.get('ts', [None])[0]) + self._ts_offset = ts - self._ts() if ts else 0 + self._referer = player_url medias = self._call_api( 'shows/%s/medias' % video_id, @@ -155,8 +177,8 @@ class NocoIE(InfoExtractor): 'format_id': format_id_extended, 'width': int_or_none(fmt.get('res_width')), 'height': int_or_none(fmt.get('res_lines')), - 'abr': int_or_none(fmt.get('audiobitrate')), - 'vbr': int_or_none(fmt.get('videobitrate')), + 'abr': int_or_none(fmt.get('audiobitrate'), 1000), + 'vbr': int_or_none(fmt.get('videobitrate'), 1000), 'filesize': int_or_none(fmt.get('filesize')), 'format_note': qualities[format_id].get('quality_name'), 'quality': qualities[format_id].get('priority'), diff --git a/youtube_dl/extractor/novamov.py b/youtube_dl/extractor/novamov.py index 6163e8855..d68c1ad79 100644 --- a/youtube_dl/extractor/novamov.py +++ b/youtube_dl/extractor/novamov.py @@ -17,15 +17,16 @@ class NovaMovIE(InfoExtractor): IE_NAME = 'novamov' IE_DESC = 'NovaMov' - _VALID_URL_TEMPLATE = r'http://(?:(?:www\.)?%(host)s/(?:file|video)/|(?:(?:embed|www)\.)%(host)s/embed\.php\?(?:.*?&)?v=)(?P<id>[a-z\d]{13})' + _VALID_URL_TEMPLATE = r'http://(?:(?:www\.)?%(host)s/(?:file|video|mobile/#/videos)/|(?:(?:embed|www)\.)%(host)s/embed\.php\?(?:.*?&)?v=)(?P<id>[a-z\d]{13})' _VALID_URL = _VALID_URL_TEMPLATE % {'host': 'novamov\.com'} _HOST = 'www.novamov.com' _FILE_DELETED_REGEX = r'This file no longer exists on our servers!</h2>' - _FILEKEY_REGEX = r'flashvars\.filekey="(?P<filekey>[^"]+)";' + _FILEKEY_REGEX = r'flashvars\.filekey=(?P<filekey>"?[^"]+"?);' _TITLE_REGEX = r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>([^<]+)</h3>' _DESCRIPTION_REGEX = r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>[^<]+</h3><p>([^<]+)</p>' + _URL_TEMPLATE = 'http://%s/video/%s' _TEST = { 'url': 'http://www.novamov.com/video/4rurhn9x446jj', @@ -39,20 +40,28 @@ class NovaMovIE(InfoExtractor): 'skip': '"Invalid token" errors abound (in web interface as well as youtube-dl, there is nothing we can do about it.)' } + def _check_existence(self, webpage, video_id): + if re.search(self._FILE_DELETED_REGEX, webpage) is not None: + raise ExtractorError('Video %s does not exist' % video_id, expected=True) + def _real_extract(self, url): video_id = self._match_id(url) - url = 'http://%s/video/%s' % (self._HOST, video_id) + url = self._URL_TEMPLATE % (self._HOST, video_id) webpage = self._download_webpage( url, video_id, 'Downloading video page') - if re.search(self._FILE_DELETED_REGEX, webpage) is not None: - raise ExtractorError('Video %s does not exist' % video_id, expected=True) + self._check_existence(webpage, video_id) def extract_filekey(default=NO_DEFAULT): - return self._search_regex( + filekey = self._search_regex( self._FILEKEY_REGEX, webpage, 'filekey', default=default) + if filekey is not default and (filekey[0] != '"' or filekey[-1] != '"'): + return self._search_regex( + r'var\s+%s\s*=\s*"([^"]+)"' % re.escape(filekey), webpage, 'filekey', default=default) + else: + return filekey filekey = extract_filekey(default=None) @@ -69,6 +78,7 @@ class NovaMovIE(InfoExtractor): request.add_header('Referer', post_url) webpage = self._download_webpage( request, video_id, 'Downloading continue to the video page') + self._check_existence(webpage, video_id) filekey = extract_filekey() @@ -92,3 +102,89 @@ class NovaMovIE(InfoExtractor): 'title': title, 'description': description } + + +class WholeCloudIE(NovaMovIE): + IE_NAME = 'wholecloud' + IE_DESC = 'WholeCloud' + + _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': '(?:wholecloud\.net|movshare\.(?:net|sx|ag))'} + + _HOST = 'www.wholecloud.net' + + _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' + _TITLE_REGEX = r'<strong>Title:</strong> ([^<]+)</p>' + _DESCRIPTION_REGEX = r'<strong>Description:</strong> ([^<]+)</p>' + + _TEST = { + 'url': 'http://www.wholecloud.net/video/559e28be54d96', + 'md5': 'abd31a2132947262c50429e1d16c1bfd', + 'info_dict': { + 'id': '559e28be54d96', + 'ext': 'flv', + 'title': 'dissapeared image', + 'description': 'optical illusion dissapeared image magic illusion', + } + } + + +class NowVideoIE(NovaMovIE): + IE_NAME = 'nowvideo' + IE_DESC = 'NowVideo' + + _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'nowvideo\.(?:to|ch|ec|sx|eu|at|ag|co|li)'} + + _HOST = 'www.nowvideo.to' + + _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' + _TITLE_REGEX = r'<h4>([^<]+)</h4>' + _DESCRIPTION_REGEX = r'</h4>\s*<p>([^<]+)</p>' + + _TEST = { + 'url': 'http://www.nowvideo.sx/video/f1d6fce9a968b', + 'md5': '12c82cad4f2084881d8bc60ee29df092', + 'info_dict': { + 'id': 'f1d6fce9a968b', + 'ext': 'flv', + 'title': 'youtubedl test video BaWjenozKc', + 'description': 'Description', + }, + } + + +class VideoWeedIE(NovaMovIE): + IE_NAME = 'videoweed' + IE_DESC = 'VideoWeed' + + _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'videoweed\.(?:es|com)'} + + _HOST = 'www.videoweed.es' + + _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' + _TITLE_REGEX = r'<h1 class="text_shadow">([^<]+)</h1>' + _URL_TEMPLATE = 'http://%s/file/%s' + + _TEST = { + 'url': 'http://www.videoweed.es/file/b42178afbea14', + 'md5': 'abd31a2132947262c50429e1d16c1bfd', + 'info_dict': { + 'id': 'b42178afbea14', + 'ext': 'flv', + 'title': 'optical illusion dissapeared image magic illusion', + 'description': '' + }, + } + + +class CloudTimeIE(NovaMovIE): + IE_NAME = 'cloudtime' + IE_DESC = 'CloudTime' + + _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'cloudtime\.to'} + + _HOST = 'www.cloudtime.to' + + _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' + _TITLE_REGEX = r'<div[^>]+class=["\']video_det["\'][^>]*>\s*<strong>([^<]+)</strong>' + + _TEST = None diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py index d480fb58c..446f5901c 100644 --- a/youtube_dl/extractor/nowness.py +++ b/youtube_dl/extractor/nowness.py @@ -1,7 +1,10 @@ # encoding: utf-8 from __future__ import unicode_literals -from .brightcove import BrightcoveLegacyIE +from .brightcove import ( + BrightcoveLegacyIE, + BrightcoveNewIE, +) from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -23,9 +26,12 @@ class NownessBaseIE(InfoExtractor): note='Downloading player JavaScript', errnote='Unable to download player JavaScript') bc_url = BrightcoveLegacyIE._extract_brightcove_url(player_code) - if bc_url is None: - raise ExtractorError('Could not find player definition') - return self.url_result(bc_url, 'BrightcoveLegacy') + if bc_url: + return self.url_result(bc_url, BrightcoveLegacyIE.ie_key()) + bc_url = BrightcoveNewIE._extract_url(player_code) + if bc_url: + return self.url_result(bc_url, BrightcoveNewIE.ie_key()) + raise ExtractorError('Could not find player definition') elif source == 'vimeo': return self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo') elif source == 'youtube': diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index 67e34b294..fd107aca2 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -71,7 +71,7 @@ class NowTVBaseIE(InfoExtractor): class NowTVIE(NowTVBaseIE): - _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<show_id>[^/]+)/(?:list/[^/]+/)?(?P<id>[^/]+)/(?:player|preview)' + _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<show_id>[^/]+)/(?:(?:list/[^/]+|jahr/\d{4}/\d{1,2})/)?(?P<id>[^/]+)/(?:player|preview)' _TESTS = [{ # rtl @@ -190,6 +190,9 @@ class NowTVIE(NowTVBaseIE): }, { 'url': 'http://www.nowtv.de/rtl2/echtzeit/list/aktuell/schnelles-geld-am-ende-der-welt/player', 'only_matching': True, + }, { + 'url': 'http://www.nowtv.de/rtl2/zuhause-im-glueck/jahr/2015/11/eine-erschuetternde-diagnose/player', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/nowvideo.py b/youtube_dl/extractor/nowvideo.py deleted file mode 100644 index 57ee3d366..000000000 --- a/youtube_dl/extractor/nowvideo.py +++ /dev/null @@ -1,28 +0,0 @@ -from __future__ import unicode_literals - -from .novamov import NovaMovIE - - -class NowVideoIE(NovaMovIE): - IE_NAME = 'nowvideo' - IE_DESC = 'NowVideo' - - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'nowvideo\.(?:to|ch|ec|sx|eu|at|ag|co|li)'} - - _HOST = 'www.nowvideo.to' - - _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' - _FILEKEY_REGEX = r'var fkzd="([^"]+)";' - _TITLE_REGEX = r'<h4>([^<]+)</h4>' - _DESCRIPTION_REGEX = r'</h4>\s*<p>([^<]+)</p>' - - _TEST = { - 'url': 'http://www.nowvideo.ch/video/0mw0yow7b6dxa', - 'md5': 'f8fbbc8add72bd95b7850c6a02fc8817', - 'info_dict': { - 'id': '0mw0yow7b6dxa', - 'ext': 'flv', - 'title': 'youtubedl test video _BaW_jenozKc.mp4', - 'description': 'Description', - } - } diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 35067e271..8603fd692 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -27,6 +27,7 @@ class OoyalaBaseIE(InfoExtractor): 'duration': float_or_none(metadata.get('duration'), 1000), } + urls = [] formats = [] for supported_format in ('mp4', 'm3u8', 'hds', 'rtmp'): auth_data = self._download_json( @@ -38,20 +39,28 @@ class OoyalaBaseIE(InfoExtractor): if cur_auth_data['authorized']: for stream in cur_auth_data['streams']: url = base64.b64decode(stream['url']['data'].encode('ascii')).decode('utf-8') + if url in urls: + continue + urls.append(url) delivery_type = stream['delivery_type'] - if delivery_type == 'remote_asset': - video_info['url'] = url - return video_info - if delivery_type == 'hls': - formats.extend(self._extract_m3u8_formats(url, embed_code, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - elif delivery_type == 'hds': - formats.extend(self._extract_f4m_formats(url, embed_code, -1, 'hds', fatal=False)) + if delivery_type == 'hls' or '.m3u8' in url: + m3u8_formats = self._extract_m3u8_formats(url, embed_code, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + elif delivery_type == 'hds' or '.f4m' in url: + f4m_formats = self._extract_f4m_formats(url, embed_code, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + elif '.smil' in url: + smil_formats = self._extract_smil_formats(url, embed_code, fatal=False) + if smil_formats: + formats.extend(smil_formats) else: formats.append({ 'url': url, 'ext': stream.get('delivery_type'), 'vcodec': stream.get('video_codec'), - 'format_id': '%s-%s-%sp' % (stream.get('profile'), delivery_type, stream.get('height')), + 'format_id': delivery_type, 'width': int_or_none(stream.get('width')), 'height': int_or_none(stream.get('height')), 'abr': int_or_none(stream.get('audio_bitrate')), diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index b787e2a73..97e8ffc97 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -15,16 +15,181 @@ from ..utils import ( class PBSIE(InfoExtractor): + _STATIONS = ( + (r'(?:video|www|player)\.pbs\.org', 'PBS: Public Broadcasting Service'), # http://www.pbs.org/ + (r'video\.aptv\.org', 'APT - Alabama Public Television (WBIQ)'), # http://aptv.org/ + (r'video\.gpb\.org', 'GPB/Georgia Public Broadcasting (WGTV)'), # http://www.gpb.org/ + (r'video\.mpbonline\.org', 'Mississippi Public Broadcasting (WMPN)'), # http://www.mpbonline.org + (r'video\.wnpt\.org', 'Nashville Public Television (WNPT)'), # http://www.wnpt.org + (r'video\.wfsu\.org', 'WFSU-TV (WFSU)'), # http://wfsu.org/ + (r'video\.wsre\.org', 'WSRE (WSRE)'), # http://www.wsre.org + (r'video\.wtcitv\.org', 'WTCI (WTCI)'), # http://www.wtcitv.org + (r'video\.pba\.org', 'WPBA/Channel 30 (WPBA)'), # http://pba.org/ + (r'video\.alaskapublic\.org', 'Alaska Public Media (KAKM)'), # http://alaskapublic.org/kakm + # (r'kuac\.org', 'KUAC (KUAC)'), # http://kuac.org/kuac-tv/ + # (r'ktoo\.org', '360 North (KTOO)'), # http://www.ktoo.org/ + # (r'azpm\.org', 'KUAT 6 (KUAT)'), # http://www.azpm.org/ + (r'video\.azpbs\.org', 'Arizona PBS (KAET)'), # http://www.azpbs.org + (r'portal\.knme\.org', 'KNME-TV/Channel 5 (KNME)'), # http://www.newmexicopbs.org/ + (r'video\.vegaspbs\.org', 'Vegas PBS (KLVX)'), # http://vegaspbs.org/ + (r'watch\.aetn\.org', 'AETN/ARKANSAS ETV NETWORK (KETS)'), # http://www.aetn.org/ + (r'video\.ket\.org', 'KET (WKLE)'), # http://www.ket.org/ + (r'video\.wkno\.org', 'WKNO/Channel 10 (WKNO)'), # http://www.wkno.org/ + (r'video\.lpb\.org', 'LPB/LOUISIANA PUBLIC BROADCASTING (WLPB)'), # http://www.lpb.org/ + (r'videos\.oeta\.tv', 'OETA (KETA)'), # http://www.oeta.tv + (r'video\.optv\.org', 'Ozarks Public Television (KOZK)'), # http://www.optv.org/ + (r'watch\.wsiu\.org', 'WSIU Public Broadcasting (WSIU)'), # http://www.wsiu.org/ + (r'video\.keet\.org', 'KEET TV (KEET)'), # http://www.keet.org + (r'pbs\.kixe\.org', 'KIXE/Channel 9 (KIXE)'), # http://kixe.org/ + (r'video\.kpbs\.org', 'KPBS San Diego (KPBS)'), # http://www.kpbs.org/ + (r'video\.kqed\.org', 'KQED (KQED)'), # http://www.kqed.org + (r'vids\.kvie\.org', 'KVIE Public Television (KVIE)'), # http://www.kvie.org + (r'video\.pbssocal\.org', 'PBS SoCal/KOCE (KOCE)'), # http://www.pbssocal.org/ + (r'video\.valleypbs\.org', 'ValleyPBS (KVPT)'), # http://www.valleypbs.org/ + (r'video\.cptv\.org', 'CONNECTICUT PUBLIC TELEVISION (WEDH)'), # http://cptv.org + (r'watch\.knpb\.org', 'KNPB Channel 5 (KNPB)'), # http://www.knpb.org/ + (r'video\.soptv\.org', 'SOPTV (KSYS)'), # http://www.soptv.org + # (r'klcs\.org', 'KLCS/Channel 58 (KLCS)'), # http://www.klcs.org + # (r'krcb\.org', 'KRCB Television & Radio (KRCB)'), # http://www.krcb.org + # (r'kvcr\.org', 'KVCR TV/DT/FM :: Vision for the Future (KVCR)'), # http://kvcr.org + (r'video\.rmpbs\.org', 'Rocky Mountain PBS (KRMA)'), # http://www.rmpbs.org + (r'video\.kenw\.org', 'KENW-TV3 (KENW)'), # http://www.kenw.org + (r'video\.kued\.org', 'KUED Channel 7 (KUED)'), # http://www.kued.org + (r'video\.wyomingpbs\.org', 'Wyoming PBS (KCWC)'), # http://www.wyomingpbs.org + (r'video\.cpt12\.org', 'Colorado Public Television / KBDI 12 (KBDI)'), # http://www.cpt12.org/ + (r'video\.kbyueleven\.org', 'KBYU-TV (KBYU)'), # http://www.kbyutv.org/ + (r'video\.thirteen\.org', 'Thirteen/WNET New York (WNET)'), # http://www.thirteen.org + (r'video\.wgbh\.org', 'WGBH/Channel 2 (WGBH)'), # http://wgbh.org + (r'video\.wgby\.org', 'WGBY (WGBY)'), # http://www.wgby.org + (r'watch\.njtvonline\.org', 'NJTV Public Media NJ (WNJT)'), # http://www.njtvonline.org/ + # (r'ripbs\.org', 'Rhode Island PBS (WSBE)'), # http://www.ripbs.org/home/ + (r'watch\.wliw\.org', 'WLIW21 (WLIW)'), # http://www.wliw.org/ + (r'video\.mpt\.tv', 'mpt/Maryland Public Television (WMPB)'), # http://www.mpt.org + (r'watch\.weta\.org', 'WETA Television and Radio (WETA)'), # http://www.weta.org + (r'video\.whyy\.org', 'WHYY (WHYY)'), # http://www.whyy.org + (r'video\.wlvt\.org', 'PBS 39 (WLVT)'), # http://www.wlvt.org/ + (r'video\.wvpt\.net', 'WVPT - Your Source for PBS and More! (WVPT)'), # http://www.wvpt.net + (r'video\.whut\.org', 'Howard University Television (WHUT)'), # http://www.whut.org + (r'video\.wedu\.org', 'WEDU PBS (WEDU)'), # http://www.wedu.org + (r'video\.wgcu\.org', 'WGCU Public Media (WGCU)'), # http://www.wgcu.org/ + # (r'wjct\.org', 'WJCT Public Broadcasting (WJCT)'), # http://www.wjct.org + (r'video\.wpbt2\.org', 'WPBT2 (WPBT)'), # http://www.wpbt2.org + (r'video\.wucftv\.org', 'WUCF TV (WUCF)'), # http://wucftv.org + (r'video\.wuft\.org', 'WUFT/Channel 5 (WUFT)'), # http://www.wuft.org + (r'watch\.wxel\.org', 'WXEL/Channel 42 (WXEL)'), # http://www.wxel.org/home/ + (r'video\.wlrn\.org', 'WLRN/Channel 17 (WLRN)'), # http://www.wlrn.org/ + (r'video\.wusf\.usf\.edu', 'WUSF Public Broadcasting (WUSF)'), # http://wusf.org/ + (r'video\.scetv\.org', 'ETV (WRLK)'), # http://www.scetv.org + (r'video\.unctv\.org', 'UNC-TV (WUNC)'), # http://www.unctv.org/ + # (r'pbsguam\.org', 'PBS Guam (KGTF)'), # http://www.pbsguam.org/ + (r'video\.pbshawaii\.org', 'PBS Hawaii - Oceanic Cable Channel 10 (KHET)'), # http://www.pbshawaii.org/ + (r'video\.idahoptv\.org', 'Idaho Public Television (KAID)'), # http://idahoptv.org + (r'video\.ksps\.org', 'KSPS (KSPS)'), # http://www.ksps.org/home/ + (r'watch\.opb\.org', 'OPB (KOPB)'), # http://www.opb.org + (r'watch\.nwptv\.org', 'KWSU/Channel 10 & KTNW/Channel 31 (KWSU)'), # http://www.kwsu.org + (r'video\.will\.illinois\.edu', 'WILL-TV (WILL)'), # http://will.illinois.edu/ + (r'video\.networkknowledge\.tv', 'Network Knowledge - WSEC/Springfield (WSEC)'), # http://www.wsec.tv + (r'video\.wttw\.com', 'WTTW11 (WTTW)'), # http://www.wttw.com/ + # (r'wtvp\.org', 'WTVP & WTVP.org, Public Media for Central Illinois (WTVP)'), # http://www.wtvp.org/ + (r'video\.iptv\.org', 'Iowa Public Television/IPTV (KDIN)'), # http://www.iptv.org/ + (r'video\.ninenet\.org', 'Nine Network (KETC)'), # http://www.ninenet.org + (r'video\.wfwa\.org', 'PBS39 Fort Wayne (WFWA)'), # http://wfwa.org/ + (r'video\.wfyi\.org', 'WFYI Indianapolis (WFYI)'), # http://www.wfyi.org + (r'video\.mptv\.org', 'Milwaukee Public Television (WMVS)'), # http://www.mptv.org + (r'video\.wnin\.org', 'WNIN (WNIN)'), # http://www.wnin.org/ + (r'video\.wnit\.org', 'WNIT Public Television (WNIT)'), # http://www.wnit.org/ + (r'video\.wpt\.org', 'WPT (WPNE)'), # http://www.wpt.org/ + (r'video\.wvut\.org', 'WVUT/Channel 22 (WVUT)'), # http://wvut.org/ + (r'video\.weiu\.net', 'WEIU/Channel 51 (WEIU)'), # http://www.weiu.net + (r'video\.wqpt\.org', 'WQPT-TV (WQPT)'), # http://www.wqpt.org + (r'video\.wycc\.org', 'WYCC PBS Chicago (WYCC)'), # http://www.wycc.org + # (r'lakeshorepublicmedia\.org', 'Lakeshore Public Television (WYIN)'), # http://lakeshorepublicmedia.org/ + (r'video\.wipb\.org', 'WIPB-TV (WIPB)'), # http://wipb.org + (r'video\.indianapublicmedia\.org', 'WTIU (WTIU)'), # http://indianapublicmedia.org/tv/ + (r'watch\.cetconnect\.org', 'CET (WCET)'), # http://www.cetconnect.org + (r'video\.thinktv\.org', 'ThinkTVNetwork (WPTD)'), # http://www.thinktv.org + (r'video\.wbgu\.org', 'WBGU-TV (WBGU)'), # http://wbgu.org + (r'video\.wgvu\.org', 'WGVU TV (WGVU)'), # http://www.wgvu.org/ + (r'video\.netnebraska\.org', 'NET1 (KUON)'), # http://netnebraska.org + (r'video\.pioneer\.org', 'Pioneer Public Television (KWCM)'), # http://www.pioneer.org + (r'watch\.sdpb\.org', 'SDPB Television (KUSD)'), # http://www.sdpb.org + (r'video\.tpt\.org', 'TPT (KTCA)'), # http://www.tpt.org + (r'watch\.ksmq\.org', 'KSMQ (KSMQ)'), # http://www.ksmq.org/ + (r'watch\.kpts\.org', 'KPTS/Channel 8 (KPTS)'), # http://www.kpts.org/ + (r'watch\.ktwu\.org', 'KTWU/Channel 11 (KTWU)'), # http://ktwu.org + # (r'shptv\.org', 'Smoky Hills Public Television (KOOD)'), # http://www.shptv.org + # (r'kcpt\.org', 'KCPT Kansas City Public Television (KCPT)'), # http://kcpt.org/ + # (r'blueridgepbs\.org', 'Blue Ridge PBS (WBRA)'), # http://www.blueridgepbs.org/ + (r'watch\.easttennesseepbs\.org', 'East Tennessee PBS (WSJK)'), # http://easttennesseepbs.org + (r'video\.wcte\.tv', 'WCTE-TV (WCTE)'), # http://www.wcte.org + (r'video\.wljt\.org', 'WLJT, Channel 11 (WLJT)'), # http://wljt.org/ + (r'video\.wosu\.org', 'WOSU TV (WOSU)'), # http://wosu.org/ + (r'video\.woub\.org', 'WOUB/WOUC (WOUB)'), # http://woub.org/tv/index.php?section=5 + (r'video\.wvpublic\.org', 'WVPB (WVPB)'), # http://wvpublic.org/ + (r'video\.wkyupbs\.org', 'WKYU-PBS (WKYU)'), # http://www.wkyupbs.org + # (r'wyes\.org', 'WYES-TV/New Orleans (WYES)'), # http://www.wyes.org + (r'video\.kera\.org', 'KERA 13 (KERA)'), # http://www.kera.org/ + (r'video\.mpbn\.net', 'MPBN (WCBB)'), # http://www.mpbn.net/ + (r'video\.mountainlake\.org', 'Mountain Lake PBS (WCFE)'), # http://www.mountainlake.org/ + (r'video\.nhptv\.org', 'NHPTV (WENH)'), # http://nhptv.org/ + (r'video\.vpt\.org', 'Vermont PBS (WETK)'), # http://www.vpt.org + (r'video\.witf\.org', 'witf (WITF)'), # http://www.witf.org + (r'watch\.wqed\.org', 'WQED Multimedia (WQED)'), # http://www.wqed.org/ + (r'video\.wmht\.org', 'WMHT Educational Telecommunications (WMHT)'), # http://www.wmht.org/home/ + (r'video\.deltabroadcasting\.org', 'Q-TV (WDCQ)'), # http://www.deltabroadcasting.org + (r'video\.dptv\.org', 'WTVS Detroit Public TV (WTVS)'), # http://www.dptv.org/ + (r'video\.wcmu\.org', 'CMU Public Television (WCMU)'), # http://www.wcmu.org + (r'video\.wkar\.org', 'WKAR-TV (WKAR)'), # http://wkar.org/ + (r'wnmuvideo\.nmu\.edu', 'WNMU-TV Public TV 13 (WNMU)'), # http://wnmutv.nmu.edu + (r'video\.wdse\.org', 'WDSE - WRPT (WDSE)'), # http://www.wdse.org/ + (r'video\.wgte\.org', 'WGTE TV (WGTE)'), # http://www.wgte.org + (r'video\.lptv\.org', 'Lakeland Public Television (KAWE)'), # http://www.lakelandptv.org + # (r'prairiepublic\.org', 'PRAIRIE PUBLIC (KFME)'), # http://www.prairiepublic.org/ + (r'video\.kmos\.org', 'KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS)'), # http://www.kmos.org/ + (r'watch\.montanapbs\.org', 'MontanaPBS (KUSM)'), # http://montanapbs.org + (r'video\.krwg\.org', 'KRWG/Channel 22 (KRWG)'), # http://www.krwg.org + (r'video\.kacvtv\.org', 'KACV (KACV)'), # http://www.panhandlepbs.org/home/ + (r'video\.kcostv\.org', 'KCOS/Channel 13 (KCOS)'), # www.kcostv.org + (r'video\.wcny\.org', 'WCNY/Channel 24 (WCNY)'), # http://www.wcny.org + (r'video\.wned\.org', 'WNED (WNED)'), # http://www.wned.org/ + (r'watch\.wpbstv\.org', 'WPBS (WPBS)'), # http://www.wpbstv.org + (r'video\.wskg\.org', 'WSKG Public TV (WSKG)'), # http://wskg.org + (r'video\.wxxi\.org', 'WXXI (WXXI)'), # http://wxxi.org + (r'video\.wpsu\.org', 'WPSU (WPSU)'), # http://www.wpsu.org + # (r'wqln\.org', 'WQLN/Channel 54 (WQLN)'), # http://www.wqln.org + (r'on-demand\.wvia\.org', 'WVIA Public Media Studios (WVIA)'), # http://www.wvia.org/ + (r'video\.wtvi\.org', 'WTVI (WTVI)'), # http://www.wtvi.org/ + # (r'whro\.org', 'WHRO (WHRO)'), # http://whro.org + (r'video\.westernreservepublicmedia\.org', 'Western Reserve PBS (WNEO)'), # http://www.WesternReservePublicMedia.org/ + (r'video\.ideastream\.org', 'WVIZ/PBS ideastream (WVIZ)'), # http://www.wviz.org/ + (r'video\.kcts9\.org', 'KCTS 9 (KCTS)'), # http://kcts9.org/ + (r'video\.basinpbs\.org', 'Basin PBS (KPBT)'), # http://www.basinpbs.org + (r'video\.houstonpbs\.org', 'KUHT / Channel 8 (KUHT)'), # http://www.houstonpublicmedia.org/ + # (r'tamu\.edu', 'KAMU - TV (KAMU)'), # http://KAMU.tamu.edu + # (r'kedt\.org', 'KEDT/Channel 16 (KEDT)'), # http://www.kedt.org + (r'video\.klrn\.org', 'KLRN (KLRN)'), # http://www.klrn.org + (r'video\.klru\.tv', 'KLRU (KLRU)'), # http://www.klru.org + # (r'kmbh\.org', 'KMBH-TV (KMBH)'), # http://www.kmbh.org + # (r'knct\.org', 'KNCT (KNCT)'), # http://www.knct.org + # (r'ktxt\.org', 'KTTZ-TV (KTXT)'), # http://www.ktxt.org + (r'video\.wtjx\.org', 'WTJX Channel 12 (WTJX)'), # http://www.wtjx.org/ + (r'video\.ideastations\.org', 'WCVE PBS (WCVE)'), # http://ideastations.org/ + (r'video\.kbtc\.org', 'KBTC Public Television (KBTC)'), # http://kbtc.org + ) + + IE_NAME = 'pbs' + IE_DESC = 'Public Broadcasting Service (PBS) and member stations: %s' % ', '.join(list(zip(*_STATIONS))[1]) + _VALID_URL = r'''(?x)https?:// (?: # Direct video URL - video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? | + (?:%s)/(?:viralplayer|video)/(?P<id>[0-9]+)/? | # Article with embedded player (or direct video) (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) | # Player (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/ ) - ''' + ''' % '|'.join(list(zip(*_STATIONS))[0]) _TESTS = [ { @@ -174,6 +339,10 @@ class PBSIE(InfoExtractor): { 'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true', 'only_matching': True, + }, + { + 'url': 'http://watch.knpb.org/video/2365616055/', + 'only_matching': True, } ] _ERRORS = { @@ -204,6 +373,7 @@ class PBSIE(InfoExtractor): MEDIA_ID_REGEXES = [ r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed r'class="coveplayerid">([^<]+)<', # coveplayer + r'<section[^>]+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/ r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer ] diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 63cc764bb..514e9b433 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -31,9 +31,8 @@ class PeriscopeIE(InfoExtractor): }] def _call_api(self, method, value): - attribute = 'token' if len(value) > 13 else 'broadcast_id' return self._download_json( - 'https://api.periscope.tv/api/v2/%s?%s=%s' % (method, attribute, value), value) + 'https://api.periscope.tv/api/v2/%s?broadcast_id=%s' % (method, value), value) def _real_extract(self, url): token = self._match_id(url) diff --git a/youtube_dl/extractor/phoenix.py b/youtube_dl/extractor/phoenix.py index 46cebc0d7..6ce2ec19d 100644 --- a/youtube_dl/extractor/phoenix.py +++ b/youtube_dl/extractor/phoenix.py @@ -1,10 +1,9 @@ from __future__ import unicode_literals -from .common import InfoExtractor -from .zdf import extract_from_xml_url +from .zdf import ZDFIE -class PhoenixIE(InfoExtractor): +class PhoenixIE(ZDFIE): _VALID_URL = r'''(?x)https?://(?:www\.)?phoenix\.de/content/ (?: phoenix/die_sendungen/(?:[^/]+/)? @@ -41,5 +40,5 @@ class PhoenixIE(InfoExtractor): r'<div class="phx_vod" id="phx_vod_([0-9]+)"', webpage, 'internal video ID') - api_url = 'http://www.phoenix.de/php/zdfplayer-v1.3/data/beitragsDetails.php?ak=web&id=%s' % internal_id - return extract_from_xml_url(self, video_id, api_url) + api_url = 'http://www.phoenix.de/php/mediaplayer/data/beitrags_details.php?ak=web&id=%s' % internal_id + return self.extract_from_xml_url(video_id, api_url) diff --git a/youtube_dl/extractor/pladform.py b/youtube_dl/extractor/pladform.py index 551c8c9f0..bc559d1df 100644 --- a/youtube_dl/extractor/pladform.py +++ b/youtube_dl/extractor/pladform.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -44,6 +46,13 @@ class PladformIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src="(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)"', webpage) + if mobj: + return mobj.group('url') + def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 7ff1d06c4..278b1d2bf 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -8,20 +8,24 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( + ExtractorError, + determine_ext, parse_duration, unified_strdate, + int_or_none, + xpath_text, ) -class RaiIE(InfoExtractor): - _VALID_URL = r'(?P<url>(?P<host>http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it))/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html)' +class RaiTVIE(InfoExtractor): + _VALID_URL = r'http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' _TESTS = [ { 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html', - 'md5': 'c064c0b2d09c278fb293116ef5d0a32d', + 'md5': '96382709b61dd64a6b88e0f791e6df4c', 'info_dict': { 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Report del 07/04/2014', 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', 'upload_date': '20140407', @@ -30,16 +34,14 @@ class RaiIE(InfoExtractor): }, { 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', - 'md5': '8bb9c151924ce241b74dd52ef29ceafa', + 'md5': 'd9751b78eac9710d62c2447b224dea39', 'info_dict': { 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'TG PRIMO TEMPO', - 'description': '', 'upload_date': '20140612', 'duration': 1758, }, - 'skip': 'Error 404', }, { 'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html', @@ -55,110 +57,106 @@ class RaiIE(InfoExtractor): }, { 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-b4a49761-e0cc-4b14-8736-2729f6f73132-tg2.html', - 'md5': '35694f062977fe6619943f08ed935730', 'info_dict': { 'id': 'b4a49761-e0cc-4b14-8736-2729f6f73132', 'ext': 'mp4', 'title': 'Alluvione in Sardegna e dissesto idrogeologico', 'description': 'Edizione delle ore 20:30 ', - } + }, + 'skip': 'invalid urls', }, { 'url': 'http://www.ilcandidato.rai.it/dl/ray/media/Il-Candidato---Primo-episodio-Le-Primarie-28e5525a-b495-45e8-a7c3-bc48ba45d2b6.html', - 'md5': '02b64456f7cc09f96ff14e7dd489017e', + 'md5': '496ab63e420574447f70d02578333437', 'info_dict': { 'id': '28e5525a-b495-45e8-a7c3-bc48ba45d2b6', 'ext': 'flv', 'title': 'Il Candidato - Primo episodio: "Le Primarie"', - 'description': 'Primo appuntamento con "Il candidato" con Filippo Timi, alias Piero Zucca presidente!', - 'uploader': 'RaiTre', + 'description': 'md5:364b604f7db50594678f483353164fb8', + 'upload_date': '20140923', + 'duration': 386, } }, - { - 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', - 'md5': '037104d2c14132887e5e4cf114569214', - 'info_dict': { - 'id': '0c7a664b-d0f4-4b2c-8835-3f82e46f433e', - 'ext': 'flv', - 'title': 'Il pacco', - 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', - 'uploader': 'RaiTre', - 'upload_date': '20141221', - }, - } ] - def _extract_relinker_url(self, webpage): - return self._proto_relative_url(self._search_regex( - [r'name="videourl" content="([^"]+)"', r'var\s+videoURL(?:_MP4)?\s*=\s*"([^"]+)"'], - webpage, 'relinker url', default=None)) - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - host = mobj.group('host') + video_id = self._match_id(url) + media = self._download_json( + 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % video_id, + video_id, 'Downloading video JSON') - webpage = self._download_webpage(url, video_id) + thumbnails = [] + for image_type in ('image', 'image_medium', 'image_300'): + thumbnail_url = media.get(image_type) + if thumbnail_url: + thumbnails.append({ + 'url': thumbnail_url, + }) - relinker_url = self._extract_relinker_url(webpage) - - if not relinker_url: - iframe_url = self._search_regex( - [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"', - r'drawMediaRaiTV\(["\'](.+?)["\']'], - webpage, 'iframe') - if not iframe_url.startswith('http'): - iframe_url = compat_urlparse.urljoin(url, iframe_url) - webpage = self._download_webpage( - iframe_url, video_id) - relinker_url = self._extract_relinker_url(webpage) - - relinker = self._download_json( - '%s&output=47' % relinker_url, video_id) - - media_url = relinker['video'][0] - ct = relinker.get('ct') - if ct == 'f4m': - formats = self._extract_f4m_formats( - media_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id) - else: - formats = [{ - 'url': media_url, - 'format_id': ct, - }] + subtitles = [] + formats = [] + media_type = media['type'] + if 'Audio' in media_type: + formats.append({ + 'format_id': media.get('formatoAudio'), + 'url': media['audioUrl'], + 'ext': media.get('formatoAudio'), + }) + elif 'Video' in media_type: + def fix_xml(xml): + return xml.replace(' tag elementi', '').replace('>/', '</') + + relinker = self._download_xml( + media['mediaUri'] + '&output=43', video_id, transform_source=fix_xml) - json_link = self._html_search_meta( - 'jsonlink', webpage, 'JSON link', default=None) - if json_link: - media = self._download_json( - host + json_link, video_id, 'Downloading video JSON') - title = media.get('name') - description = media.get('desc') - thumbnail = media.get('image_300') or media.get('image_medium') or media.get('image') - duration = parse_duration(media.get('length')) - uploader = media.get('author') - upload_date = unified_strdate(media.get('date')) + has_subtitle = False + + for element in relinker.findall('element'): + media_url = xpath_text(element, 'url') + ext = determine_ext(media_url) + content_type = xpath_text(element, 'content-type') + if ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + media_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', + fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + elif ext == 'f4m': + f4m_formats = self._extract_f4m_formats( + media_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', + video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + elif ext == 'stl': + has_subtitle = True + elif content_type.startswith('video/'): + bitrate = int_or_none(xpath_text(element, 'bitrate')) + formats.append({ + 'url': media_url, + 'tbr': bitrate if bitrate > 0 else None, + 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', + }) + elif content_type.startswith('image/'): + thumbnails.append({ + 'url': media_url, + }) + + self._sort_formats(formats) + + if has_subtitle: + webpage = self._download_webpage(url, video_id) + subtitles = self._get_subtitles(video_id, webpage) else: - title = (self._search_regex( - r'var\s+videoTitolo\s*=\s*"(.+?)";', - webpage, 'title', default=None) or self._og_search_title(webpage)).replace('\\"', '"') - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - duration = None - uploader = self._html_search_meta('Editore', webpage, 'uploader') - upload_date = unified_strdate(self._html_search_meta( - 'item-date', webpage, 'upload date', default=None)) - - subtitles = self.extract_subtitles(video_id, webpage) + raise ExtractorError('not a media file') return { 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'upload_date': upload_date, - 'duration': duration, + 'title': media['name'], + 'description': media.get('desc'), + 'thumbnails': thumbnails, + 'uploader': media.get('author'), + 'upload_date': unified_strdate(media.get('date')), + 'duration': parse_duration(media.get('length')), 'formats': formats, 'subtitles': subtitles, } @@ -177,3 +175,36 @@ class RaiIE(InfoExtractor): 'url': 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions), }] return subtitles + + +class RaiIE(InfoExtractor): + _VALID_URL = r'http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' + _TESTS = [ + { + 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', + 'md5': 'e0e7a8a131e249d1aa0ebf270d1d8db7', + 'info_dict': { + 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', + 'ext': 'flv', + 'title': 'Il pacco', + 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', + 'upload_date': '20141221', + }, + } + ] + + @classmethod + def suitable(cls, url): + return False if RaiTVIE.suitable(url) else super(RaiIE, cls).suitable(url) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + iframe_url = self._search_regex( + [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"', + r'drawMediaRaiTV\(["\'](.+?)["\']'], + webpage, 'iframe') + if not iframe_url.startswith('http'): + iframe_url = compat_urlparse.urljoin(url, iframe_url) + return self.url_result(iframe_url) diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 6b09550b0..9db62adb1 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -17,9 +17,9 @@ from ..utils import ( class RutubeIE(InfoExtractor): IE_NAME = 'rutube' IE_DESC = 'Rutube videos' - _VALID_URL = r'https?://rutube\.ru/video/(?P<id>[\da-z]{32})' + _VALID_URL = r'https?://rutube\.ru/(?:video|play/embed)/(?P<id>[\da-z]{32})' - _TEST = { + _TESTS = [{ 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', 'info_dict': { 'id': '3eac3b4561676c17df9132a9a1e62e3e', @@ -36,7 +36,10 @@ class RutubeIE(InfoExtractor): # It requires ffmpeg (m3u8 download) 'skip_download': True, }, - } + }, { + 'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py index d9df06861..f7fe1fece 100644 --- a/youtube_dl/extractor/rutv.py +++ b/youtube_dl/extractor/rutv.py @@ -131,7 +131,7 @@ class RUTVIE(InfoExtractor): is_live = video_type == 'live' json_data = self._download_json( - 'http://player.rutv.ru/iframe/%splay/id/%s' % ('live-' if is_live else '', video_id), + 'http://player.rutv.ru/iframe/data%s/id/%s' % ('live' if is_live else 'video', video_id), video_id, 'Downloading JSON') if json_data['errors']: diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 919704261..7de7b7273 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -6,12 +6,12 @@ import re from .common import InfoExtractor from .brightcove import BrightcoveLegacyIE -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, sanitized_Request, smuggle_url, std_headers, + urlencode_postdata, ) @@ -57,7 +57,7 @@ class SafariBaseIE(InfoExtractor): } request = sanitized_Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(login_form), headers=headers) + self._LOGIN_URL, urlencode_postdata(login_form), headers=headers) login_page = self._download_webpage( request, None, 'Logging in as %s' % username) diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index daf6ad555..ea8fc258d 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -158,6 +158,7 @@ class SohuIE(InfoExtractor): 'file': clips_url[i], 'new': su[i], 'prod': 'flash', + 'rb': 1, } if cdnId is not None: diff --git a/youtube_dl/extractor/soompi.py b/youtube_dl/extractor/soompi.py deleted file mode 100644 index 5da66ca9e..000000000 --- a/youtube_dl/extractor/soompi.py +++ /dev/null @@ -1,146 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -import re - -from .crunchyroll import CrunchyrollIE - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - ExtractorError, - int_or_none, - remove_start, - xpath_text, -) - - -class SoompiBaseIE(InfoExtractor): - def _get_episodes(self, webpage, episode_filter=None): - episodes = self._parse_json( - self._search_regex( - r'VIDEOS\s*=\s*(\[.+?\]);', webpage, 'episodes JSON'), - None) - return list(filter(episode_filter, episodes)) - - -class SoompiIE(SoompiBaseIE, CrunchyrollIE): - IE_NAME = 'soompi' - _VALID_URL = r'https?://tv\.soompi\.com/(?:en/)?watch/(?P<id>[0-9]+)' - _TESTS = [{ - 'url': 'http://tv.soompi.com/en/watch/29235', - 'info_dict': { - 'id': '29235', - 'ext': 'mp4', - 'title': 'Episode 1096', - 'description': '2015-05-20' - }, - 'params': { - 'skip_download': True, - }, - }] - - def _get_episode(self, webpage, video_id): - return self._get_episodes(webpage, lambda x: x['id'] == video_id)[0] - - def _get_subtitles(self, config, video_id): - sub_langs = {} - for subtitle in config.findall('./{default}preload/subtitles/subtitle'): - sub_langs[subtitle.attrib['id']] = subtitle.attrib['title'] - - subtitles = {} - for s in config.findall('./{default}preload/subtitle'): - lang_code = sub_langs.get(s.attrib['id']) - if not lang_code: - continue - sub_id = s.get('id') - data = xpath_text(s, './data', 'data') - iv = xpath_text(s, './iv', 'iv') - if not id or not iv or not data: - continue - subtitle = self._decrypt_subtitles(data, iv, sub_id).decode('utf-8') - subtitles[lang_code] = self._extract_subtitles(subtitle) - return subtitles - - def _real_extract(self, url): - video_id = self._match_id(url) - - try: - webpage = self._download_webpage( - url, video_id, 'Downloading episode page') - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - webpage = ee.cause.read() - block_message = self._html_search_regex( - r'(?s)<div class="block-message">(.+?)</div>', webpage, - 'block message', default=None) - if block_message: - raise ExtractorError(block_message, expected=True) - raise - - formats = [] - config = None - for format_id in re.findall(r'\?quality=([0-9a-zA-Z]+)', webpage): - config = self._download_xml( - 'http://tv.soompi.com/en/show/_/%s-config.xml?mode=hls&quality=%s' % (video_id, format_id), - video_id, 'Downloading %s XML' % format_id) - m3u8_url = xpath_text( - config, './{default}preload/stream_info/file', - '%s m3u8 URL' % format_id) - if not m3u8_url: - continue - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', m3u8_id=format_id)) - self._sort_formats(formats) - - episode = self._get_episode(webpage, video_id) - - title = episode['name'] - description = episode.get('description') - duration = int_or_none(episode.get('duration')) - - thumbnails = [{ - 'id': thumbnail_id, - 'url': thumbnail_url, - } for thumbnail_id, thumbnail_url in episode.get('img_url', {}).items()] - - subtitles = self.extract_subtitles(config, video_id) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnails': thumbnails, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles - } - - -class SoompiShowIE(SoompiBaseIE): - IE_NAME = 'soompi:show' - _VALID_URL = r'https?://tv\.soompi\.com/en/shows/(?P<id>[0-9a-zA-Z\-_]+)' - _TESTS = [{ - 'url': 'http://tv.soompi.com/en/shows/liar-game', - 'info_dict': { - 'id': 'liar-game', - 'title': 'Liar Game', - 'description': 'md5:52c02bce0c1a622a95823591d0589b66', - }, - 'playlist_count': 14, - }] - - def _real_extract(self, url): - show_id = self._match_id(url) - - webpage = self._download_webpage( - url, show_id, 'Downloading show page') - - title = remove_start(self._og_search_title(webpage), 'SoompiTV | ') - description = self._og_search_description(webpage) - - entries = [ - self.url_result('http://tv.soompi.com/en/watch/%s' % episode['id'], 'Soompi') - for episode in self._get_episodes(webpage)] - - return self.playlist_result(entries, show_id, title, description) diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py index ebb75f059..a9927f6e2 100644 --- a/youtube_dl/extractor/sportdeutschland.py +++ b/youtube_dl/extractor/sportdeutschland.py @@ -70,10 +70,12 @@ class SportDeutschlandIE(InfoExtractor): smil_doc = self._download_xml( smil_url, video_id, note='Downloading SMIL metadata') - base_url = smil_doc.find('./head/meta').attrib['base'] + base_url_el = smil_doc.find('./head/meta') + if base_url_el: + base_url = base_url_el.attrib['base'] formats.extend([{ 'format_id': 'rmtp', - 'url': base_url, + 'url': base_url if base_url_el else n.attrib['src'], 'play_path': n.attrib['src'], 'ext': 'flv', 'preference': -100, diff --git a/youtube_dl/extractor/srmediathek.py b/youtube_dl/extractor/srmediathek.py index 5d583c720..74d01183f 100644 --- a/youtube_dl/extractor/srmediathek.py +++ b/youtube_dl/extractor/srmediathek.py @@ -1,17 +1,18 @@ # encoding: utf-8 from __future__ import unicode_literals -import json +from .ard import ARDMediathekIE +from ..utils import ( + ExtractorError, + get_element_by_attribute, +) -from .common import InfoExtractor -from ..utils import js_to_json - -class SRMediathekIE(InfoExtractor): +class SRMediathekIE(ARDMediathekIE): IE_DESC = 'Saarländischer Rundfunk' _VALID_URL = r'https?://sr-mediathek\.sr-online\.de/index\.php\?.*?&id=(?P<id>[0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=28455', 'info_dict': { 'id': '28455', @@ -20,24 +21,36 @@ class SRMediathekIE(InfoExtractor): 'description': 'Ringen: KSV Köllerbach gegen Aachen-Walheim; Frauen-Fußball: 1. FC Saarbrücken gegen Sindelfingen; Motorsport: Rallye in Losheim; dazu: Interview mit Timo Bernhard; Turnen: TG Saar; Reitsport: Deutscher Voltigier-Pokal; Badminton: Interview mit Michael Fuchs ', 'thumbnail': 're:^https?://.*\.jpg$', }, - } + 'skip': 'no longer available', + }, { + 'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=37682', + 'info_dict': { + 'id': '37682', + 'ext': 'mp4', + 'title': 'Love, Cakes and Rock\'n\'Roll', + 'description': 'md5:18bf9763631c7d326c22603681e1123d', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest'] + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - murls = json.loads(js_to_json(self._search_regex( - r'var mediaURLs\s*=\s*(.*?);\n', webpage, 'video URLs'))) - formats = [{'url': murl} for murl in murls] - self._sort_formats(formats) - - title = json.loads(js_to_json(self._search_regex( - r'var mediaTitles\s*=\s*(.*?);\n', webpage, 'title')))[0] + if '>Der gewünschte Beitrag ist leider nicht mehr verfügbar.<' in webpage: + raise ExtractorError('Video %s is no longer available' % video_id, expected=True) - return { + media_collection_url = self._search_regex( + r'data-mediacollection-ardplayer="([^"]+)"', webpage, 'media collection url') + info = self._extract_media_info(media_collection_url, webpage, video_id) + info.update({ 'id': video_id, - 'title': title, - 'formats': formats, + 'title': get_element_by_attribute('class', 'ardplayer-title', webpage), 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), - } + }) + return info diff --git a/youtube_dl/extractor/tele13.py b/youtube_dl/extractor/tele13.py new file mode 100644 index 000000000..a363b4d40 --- /dev/null +++ b/youtube_dl/extractor/tele13.py @@ -0,0 +1,81 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..utils import ( + js_to_json, + qualities, + determine_ext, +) + + +class Tele13IE(InfoExtractor): + _VALID_URL = r'^http://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P<id>[\w-]+)' + _TESTS = [ + { + 'url': 'http://www.t13.cl/videos/actualidad/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', + 'md5': '4cb1fa38adcad8fea88487a078831755', + 'info_dict': { + 'id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', + 'ext': 'mp4', + 'title': 'El círculo de hierro de Michelle Bachelet en su regreso a La Moneda', + }, + 'params': { + # HTTP Error 404: Not Found + 'skip_download': True, + }, + }, + { + 'url': 'http://www.t13.cl/videos/mundo/tendencias/video-captan-misteriosa-bola-fuego-cielos-bangkok', + 'md5': '867adf6a3b3fef932c68a71d70b70946', + 'info_dict': { + 'id': 'rOoKv2OMpOw', + 'ext': 'mp4', + 'title': 'Shooting star seen on 7-Sep-2015', + 'description': 'md5:7292ff2a34b2f673da77da222ae77e1e', + 'uploader': 'Porjai Jaturongkhakun', + 'upload_date': '20150906', + 'uploader_id': 'UCnLY_3ezwNcDSC_Wc6suZxw', + }, + 'add_ie': ['Youtube'], + } + ] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + setup_js = self._search_regex(r"(?s)jwplayer\('player-vivo'\).setup\((\{.*?\})\)", webpage, 'setup code') + sources = self._parse_json(self._search_regex(r'sources\s*:\s*(\[[^\]]+\])', setup_js, 'sources'), display_id, js_to_json) + + preference = qualities(['Móvil', 'SD', 'HD']) + formats = [] + urls = [] + for f in sources: + format_url = f['file'] + if format_url and format_url not in urls: + ext = determine_ext(format_url) + if ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats(format_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + elif YoutubeIE.suitable(format_url): + return self.url_result(format_url, 'Youtube') + else: + formats.append({ + 'url': format_url, + 'format_id': f.get('label'), + 'preference': preference(f.get('label')), + 'ext': ext, + }) + urls.append(format_url) + self._sort_formats(formats) + + return { + 'id': display_id, + 'title': self._search_regex(r'title\s*:\s*"([^"]+)"', setup_js, 'title'), + 'description': self._html_search_meta('description', webpage, 'description'), + 'thumbnail': self._search_regex(r'image\s*:\s*"([^"]+)"', setup_js, 'thumbnail', default=None), + 'formats': formats, + } diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 3a68eaa80..6890021cf 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class TF1IE(InfoExtractor): """TF1 uses the wat.tv player.""" - _VALID_URL = r'http://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html' + _VALID_URL = r'http://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/(?:[^/]+/)*(?P<id>.+?)\.html' _TESTS = [{ 'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', 'info_dict': { @@ -22,7 +22,7 @@ class TF1IE(InfoExtractor): }, { 'url': 'http://www.tfou.fr/chuggington/videos/le-grand-mysterioso-chuggington-7085291-739.html', 'info_dict': { - 'id': '12043945', + 'id': 'le-grand-mysterioso-chuggington-7085291-739', 'ext': 'mp4', 'title': 'Le grand Mystérioso - Chuggington', 'description': 'Le grand Mystérioso - Emery rêve qu\'un article lui soit consacré dans le journal.', @@ -32,22 +32,24 @@ class TF1IE(InfoExtractor): # Sometimes wat serves the whole file with the --test option 'skip_download': True, }, + 'skip': 'HTTP Error 410: Gone', }, { 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html', 'only_matching': True, }, { 'url': 'http://lci.tf1.fr/sept-a-huit/videos/sept-a-huit-du-24-mai-2015-8611550.html', 'only_matching': True, + }, { + 'url': 'http://www.tf1.fr/hd1/documentaire/videos/mylene-farmer-d-une-icone.html', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - embed_url = self._html_search_regex( - r'["\'](https?://www.wat.tv/embedframe/.*?)["\']', webpage, 'embed url') - embed_page = self._download_webpage(embed_url, video_id, - 'Downloading embed player page') - wat_id = self._search_regex(r'UVID=(.*?)&', embed_page, 'wat id') + wat_id = self._html_search_regex( + r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1', + webpage, 'wat id', group='id') wat_info = self._download_json( 'http://www.wat.tv/interface/contentv3/%s' % wat_id, video_id) return self.url_result(wat_info['media']['url'], 'Wat') diff --git a/youtube_dl/extractor/theintercept.py b/youtube_dl/extractor/theintercept.py new file mode 100644 index 000000000..8cb3c3669 --- /dev/null +++ b/youtube_dl/extractor/theintercept.py @@ -0,0 +1,49 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + parse_iso8601, + int_or_none, + ExtractorError, +) + + +class TheInterceptIE(InfoExtractor): + _VALID_URL = r'https://theintercept.com/fieldofvision/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://theintercept.com/fieldofvision/thisisacoup-episode-four-surrender-or-die/', + 'md5': '145f28b41d44aab2f87c0a4ac8ec95bd', + 'info_dict': { + 'id': '46214', + 'ext': 'mp4', + 'title': '#ThisIsACoup – Episode Four: Surrender or Die', + 'description': 'md5:74dd27f0e2fbd50817829f97eaa33140', + 'timestamp': 1450429239, + 'upload_date': '20151218', + 'comment_count': int, + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + json_data = self._parse_json(self._search_regex( + r'initialStoreTree\s*=\s*(?P<json_data>{.+})', webpage, + 'initialStoreTree'), display_id) + + for post in json_data['resources']['posts'].values(): + if post['slug'] == display_id: + return { + '_type': 'url_transparent', + 'url': 'jwplatform:%s' % post['fov_videoid'], + 'id': compat_str(post['ID']), + 'display_id': display_id, + 'title': post['title'], + 'description': post.get('excerpt'), + 'timestamp': parse_iso8601(post.get('date')), + 'comment_count': int_or_none(post.get('comments_number')), + } + raise ExtractorError('Unable to find the current post') diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 1555aa77c..0bf6726b5 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -16,11 +16,12 @@ from ..compat import ( from ..utils import ( determine_ext, ExtractorError, - xpath_with_ns, - unsmuggle_url, + float_or_none, int_or_none, + sanitized_Request, + unsmuggle_url, url_basename, - float_or_none, + xpath_with_ns, ) default_ns = 'http://www.w3.org/2005/SMIL21/Language' @@ -204,7 +205,12 @@ class ThePlatformIE(ThePlatformBaseIE): smil_url = url # Explicitly specified SMIL (see https://github.com/rg3/youtube-dl/issues/7385) elif '/guid/' in url: - webpage = self._download_webpage(url, video_id) + headers = {} + source_url = smuggled_data.get('source_url') + if source_url: + headers['Referer'] = source_url + request = sanitized_Request(url, headers=headers) + webpage = self._download_webpage(request, video_id) smil_url = self._search_regex( r'<link[^>]+href=(["\'])(?P<url>.+?)\1[^>]+type=["\']application/smil\+xml', webpage, 'smil url', group='url') diff --git a/youtube_dl/extractor/toggle.py b/youtube_dl/extractor/toggle.py new file mode 100644 index 000000000..a47239952 --- /dev/null +++ b/youtube_dl/extractor/toggle.py @@ -0,0 +1,194 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + float_or_none, + int_or_none, + parse_iso8601, + sanitized_Request, +) + + +class ToggleIE(InfoExtractor): + IE_NAME = 'toggle' + _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:series|clips|movies)/(?:[^/]+/)+(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://video.toggle.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115', + 'info_dict': { + 'id': '343115', + 'ext': 'mp4', + 'title': 'Lion Moms Premiere', + 'description': 'md5:aea1149404bff4d7f7b6da11fafd8e6b', + 'upload_date': '20150910', + 'timestamp': 1441858274, + }, + 'params': { + 'skip_download': 'm3u8 download', + } + }, { + 'note': 'DRM-protected video', + 'url': 'http://video.toggle.sg/en/movies/dug-s-special-mission/341413', + 'info_dict': { + 'id': '341413', + 'ext': 'wvm', + 'title': 'Dug\'s Special Mission', + 'description': 'md5:e86c6f4458214905c1772398fabc93e0', + 'upload_date': '20150827', + 'timestamp': 1440644006, + }, + 'params': { + 'skip_download': 'DRM-protected wvm download', + } + }, { + # this also tests correct video id extraction + 'note': 'm3u8 links are geo-restricted, but Android/mp4 is okay', + 'url': 'http://video.toggle.sg/en/series/28th-sea-games-5-show/28th-sea-games-5-show-ep11/332861', + 'info_dict': { + 'id': '332861', + 'ext': 'mp4', + 'title': '28th SEA Games (5 Show) - Episode 11', + 'description': 'md5:3cd4f5f56c7c3b1340c50a863f896faa', + 'upload_date': '20150605', + 'timestamp': 1433480166, + }, + 'params': { + 'skip_download': 'DRM-protected wvm download', + }, + 'skip': 'm3u8 links are geo-restricted' + }, { + 'url': 'http://video.toggle.sg/en/clips/seraph-sun-aloysius-will-suddenly-sing-some-old-songs-in-high-pitch-on-set/343331', + 'only_matching': True, + }, { + 'url': 'http://video.toggle.sg/zh/series/zero-calling-s2-hd/ep13/336367', + 'only_matching': True, + }, { + 'url': 'http://video.toggle.sg/en/series/vetri-s2/webisodes/jeeva-is-an-orphan-vetri-s2-webisode-7/342302', + 'only_matching': True, + }, { + 'url': 'http://video.toggle.sg/en/movies/seven-days/321936', + 'only_matching': True, + }] + + _FORMAT_PREFERENCES = { + 'wvm-STBMain': -10, + 'wvm-iPadMain': -20, + 'wvm-iPhoneMain': -30, + 'wvm-Android': -40, + } + _API_USER = 'tvpapi_147' + _API_PASS = '11111' + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + url, video_id, note='Downloading video page') + + api_user = self._search_regex( + r'apiUser\s*:\s*(["\'])(?P<user>.+?)\1', webpage, 'apiUser', + default=self._API_USER, group='user') + api_pass = self._search_regex( + r'apiPass\s*:\s*(["\'])(?P<pass>.+?)\1', webpage, 'apiPass', + default=self._API_PASS, group='pass') + + params = { + 'initObj': { + 'Locale': { + 'LocaleLanguage': '', + 'LocaleCountry': '', + 'LocaleDevice': '', + 'LocaleUserState': 0 + }, + 'Platform': 0, + 'SiteGuid': 0, + 'DomainID': '0', + 'UDID': '', + 'ApiUser': api_user, + 'ApiPass': api_pass + }, + 'MediaID': video_id, + 'mediaType': 0, + } + + req = sanitized_Request( + 'http://tvpapi.as.tvinci.com/v2_9/gateways/jsonpostgw.aspx?m=GetMediaInfo', + json.dumps(params).encode('utf-8')) + info = self._download_json(req, video_id, 'Downloading video info json') + + title = info['MediaName'] + + formats = [] + for video_file in info.get('Files', []): + video_url, vid_format = video_file.get('URL'), video_file.get('Format') + if not video_url or not vid_format: + continue + ext = determine_ext(video_url) + vid_format = vid_format.replace(' ', '') + # if geo-restricted, m3u8 is inaccessible, but mp4 is okay + if ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + video_url, video_id, ext='mp4', m3u8_id=vid_format, + note='Downloading %s m3u8 information' % vid_format, + errnote='Failed to download %s m3u8 information' % vid_format, + fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + elif ext in ('mp4', 'wvm'): + # wvm are drm-protected files + formats.append({ + 'ext': ext, + 'url': video_url, + 'format_id': vid_format, + 'preference': self._FORMAT_PREFERENCES.get(ext + '-' + vid_format) or -1, + 'format_note': 'DRM-protected video' if ext == 'wvm' else None + }) + if not formats: + # Most likely because geo-blocked + raise ExtractorError('No downloadable videos found', expected=True) + self._sort_formats(formats) + + duration = int_or_none(info.get('Duration')) + description = info.get('Description') + created_at = parse_iso8601(info.get('CreationDate') or None) + + average_rating = float_or_none(info.get('Rating')) + view_count = int_or_none(info.get('ViewCounter') or info.get('view_counter')) + like_count = int_or_none(info.get('LikeCounter') or info.get('like_counter')) + + thumbnails = [] + for picture in info.get('Pictures', []): + if not isinstance(picture, dict): + continue + pic_url = picture.get('URL') + if not pic_url: + continue + thumbnail = { + 'url': pic_url, + } + pic_size = picture.get('PicSize', '') + m = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', pic_size) + if m: + thumbnail.update({ + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) + thumbnails.append(thumbnail) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': created_at, + 'average_rating': average_rating, + 'view_count': view_count, + 'like_count': like_count, + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/tunein.py b/youtube_dl/extractor/tunein.py index b6b1f2568..8322cc14d 100644 --- a/youtube_dl/extractor/tunein.py +++ b/youtube_dl/extractor/tunein.py @@ -2,74 +2,33 @@ from __future__ import unicode_literals import json -import re from .common import InfoExtractor from ..utils import ExtractorError +from ..compat import compat_urlparse -class TuneInIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)? - (?: - tunein\.com/ - (?: - radio/.*?-s| - station/.*?StationId\= - )(?P<id>[0-9]+) - |tun\.in/(?P<redirect_id>[A-Za-z0-9]+) - ) - ''' - _API_URL_TEMPLATE = 'http://tunein.com/tuner/tune/?stationId={0:}&tuneType=Station' - - _INFO_DICT = { - 'id': '34682', - 'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2', - 'ext': 'aac', - 'thumbnail': 're:^https?://.*\.png$', - 'location': 'Tacoma, WA', - } - _TESTS = [ - { - 'url': 'http://tunein.com/radio/Jazz24-885-s34682/', - 'info_dict': _INFO_DICT, - 'params': { - 'skip_download': True, # live stream - }, - }, - { # test redirection - 'url': 'http://tun.in/ser7s', - 'info_dict': _INFO_DICT, - 'params': { - 'skip_download': True, # live stream - }, - }, - ] +class TuneInBaseIE(InfoExtractor): + _API_BASE_URL = 'http://tunein.com/tuner/tune/' def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - redirect_id = mobj.group('redirect_id') - if redirect_id: - # The server doesn't support HEAD requests - urlh = self._request_webpage( - url, redirect_id, note='Downloading redirect page') - url = urlh.geturl() - self.to_screen('Following redirect: %s' % url) - mobj = re.match(self._VALID_URL, url) - station_id = mobj.group('id') - - station_info = self._download_json( - self._API_URL_TEMPLATE.format(station_id), - station_id, note='Downloading station JSON') - - title = station_info['Title'] - thumbnail = station_info.get('Logo') - location = station_info.get('Location') - streams_url = station_info.get('StreamUrl') + content_id = self._match_id(url) + + content_info = self._download_json( + self._API_BASE_URL + self._API_URL_QUERY % content_id, + content_id, note='Downloading JSON metadata') + + title = content_info['Title'] + thumbnail = content_info.get('Logo') + location = content_info.get('Location') + streams_url = content_info.get('StreamUrl') if not streams_url: - raise ExtractorError('No downloadable streams found', - expected=True) + raise ExtractorError('No downloadable streams found', expected=True) + if not streams_url.startswith('http://'): + streams_url = compat_urlparse.urljoin(url, streams_url) + stream_data = self._download_webpage( - streams_url, station_id, note='Downloading stream data') + streams_url, content_id, note='Downloading stream data') streams = json.loads(self._search_regex( r'\((.*)\);', stream_data, 'stream info'))['Streams'] @@ -97,10 +56,122 @@ class TuneInIE(InfoExtractor): self._sort_formats(formats) return { - 'id': station_id, + 'id': content_id, 'title': title, 'formats': formats, 'thumbnail': thumbnail, 'location': location, 'is_live': is_live, } + + +class TuneInClipIE(TuneInBaseIE): + IE_NAME = 'tunein:clip' + _VALID_URL = r'https?://(?:www\.)?tunein\.com/station/.*?audioClipId\=(?P<id>\d+)' + _API_URL_QUERY = '?tuneType=AudioClip&audioclipId=%s' + + _TESTS = [ + { + 'url': 'http://tunein.com/station/?stationId=246119&audioClipId=816', + 'md5': '99f00d772db70efc804385c6b47f4e77', + 'info_dict': { + 'id': '816', + 'title': '32m', + 'ext': 'mp3', + }, + }, + ] + + +class TuneInStationIE(TuneInBaseIE): + IE_NAME = 'tunein:station' + _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-s|station/.*?StationId\=)(?P<id>\d+)' + _API_URL_QUERY = '?tuneType=Station&stationId=%s' + + @classmethod + def suitable(cls, url): + return False if TuneInClipIE.suitable(url) else super(TuneInStationIE, cls).suitable(url) + + _TESTS = [ + { + 'url': 'http://tunein.com/radio/Jazz24-885-s34682/', + 'info_dict': { + 'id': '34682', + 'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2', + 'ext': 'mp3', + 'location': 'Tacoma, WA', + }, + 'params': { + 'skip_download': True, # live stream + }, + }, + ] + + +class TuneInProgramIE(TuneInBaseIE): + IE_NAME = 'tunein:program' + _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-p|program/.*?ProgramId\=)(?P<id>\d+)' + _API_URL_QUERY = '?tuneType=Program&programId=%s' + + _TESTS = [ + { + 'url': 'http://tunein.com/radio/Jazz-24-p2506/', + 'info_dict': { + 'id': '2506', + 'title': 'Jazz 24 on 91.3 WUKY-HD3', + 'ext': 'mp3', + 'location': 'Lexington, KY', + }, + 'params': { + 'skip_download': True, # live stream + }, + }, + ] + + +class TuneInTopicIE(TuneInBaseIE): + IE_NAME = 'tunein:topic' + _VALID_URL = r'https?://(?:www\.)?tunein\.com/topic/.*?TopicId\=(?P<id>\d+)' + _API_URL_QUERY = '?tuneType=Topic&topicId=%s' + + _TESTS = [ + { + 'url': 'http://tunein.com/topic/?TopicId=101830576', + 'md5': 'c31a39e6f988d188252eae7af0ef09c9', + 'info_dict': { + 'id': '101830576', + 'title': 'Votez pour moi du 29 octobre 2015 (29/10/15)', + 'ext': 'mp3', + 'location': 'Belgium', + }, + }, + ] + + +class TuneInShortenerIE(InfoExtractor): + IE_NAME = 'tunein:shortener' + IE_DESC = False # Do not list + _VALID_URL = r'https?://tun\.in/(?P<id>[A-Za-z0-9]+)' + + _TEST = { + # test redirection + 'url': 'http://tun.in/ser7s', + 'info_dict': { + 'id': '34682', + 'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2', + 'ext': 'mp3', + 'location': 'Tacoma, WA', + }, + 'params': { + 'skip_download': True, # live stream + }, + } + + def _real_extract(self, url): + redirect_id = self._match_id(url) + # The server doesn't support HEAD requests + urlh = self._request_webpage( + url, redirect_id, note='Downloading redirect page') + url = urlh.geturl() + self.to_screen('Following redirect: %s' % url) + return self.url_result(url) diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index c1ee1decc..e03e2dbaa 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -5,6 +5,8 @@ from .common import InfoExtractor from ..utils import ( parse_iso8601, int_or_none, + xpath_attr, + xpath_element, ) @@ -15,7 +17,7 @@ class TwentyFourVideoIE(InfoExtractor): _TESTS = [ { 'url': 'http://www.24video.net/video/view/1044982', - 'md5': 'd041af8b5b4246ea466226a0d6693345', + 'md5': 'e09fc0901d9eaeedac872f154931deeb', 'info_dict': { 'id': '1044982', 'ext': 'mp4', @@ -64,33 +66,24 @@ class TwentyFourVideoIE(InfoExtractor): r'<div class="comments-title" id="comments-count">(\d+) комментари', webpage, 'comment count', fatal=False)) - formats = [] + # Sets some cookies + self._download_xml( + r'http://www.24video.net/video/xml/%s?mode=init' % video_id, + video_id, 'Downloading init XML') - pc_video = self._download_xml( + video_xml = self._download_xml( 'http://www.24video.net/video/xml/%s?mode=play' % video_id, - video_id, 'Downloading PC video URL').find('.//video') + video_id, 'Downloading video XML') - formats.append({ - 'url': pc_video.attrib['url'], - 'format_id': 'pc', - 'quality': 1, - }) + video = xpath_element(video_xml, './/video', 'video', fatal=True) - like_count = int_or_none(pc_video.get('ratingPlus')) - dislike_count = int_or_none(pc_video.get('ratingMinus')) - age_limit = 18 if pc_video.get('adult') == 'true' else 0 + formats = [{ + 'url': xpath_attr(video, '', 'url', 'video URL', fatal=True), + }] - mobile_video = self._download_xml( - 'http://www.24video.net/video/xml/%s' % video_id, - video_id, 'Downloading mobile video URL').find('.//video') - - formats.append({ - 'url': mobile_video.attrib['url'], - 'format_id': 'mobile', - 'quality': 0, - }) - - self._sort_formats(formats) + like_count = int_or_none(video.get('ratingPlus')) + dislike_count = int_or_none(video.get('ratingMinus')) + age_limit = 18 if video.get('adult') == 'true' else 0 return { 'id': video_id, diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 571289421..02dfd36f4 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -3,7 +3,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_etree_fromstring +from ..compat import ( + compat_etree_fromstring, + compat_urlparse, +) from ..utils import ( ExtractorError, int_or_none, @@ -67,6 +70,17 @@ class VevoIE(InfoExtractor): 'params': { 'skip_download': 'true', } + }, { + 'note': 'No video_info', + 'url': 'http://www.vevo.com/watch/k-camp-1/Till-I-Die/USUV71503000', + 'md5': '8b83cc492d72fc9cf74a02acee7dc1b0', + 'info_dict': { + 'id': 'USUV71503000', + 'ext': 'mp4', + 'title': 'Till I Die - K Camp ft. T.I.', + 'duration': 193, + }, + 'expected_warnings': ['Unable to download SMIL file'], }] _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/' @@ -81,11 +95,17 @@ class VevoIE(InfoExtractor): if webpage is False: self._oauth_token = None else: + if 'THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION' in webpage: + raise ExtractorError('%s said: This page is currently unavailable in your region.' % self.IE_NAME, expected=True) + self._oauth_token = self._search_regex( r'access_token":\s*"([^"]+)"', webpage, 'access token', fatal=False) def _formats_from_json(self, video_info): + if not video_info: + return [] + last_version = {'version': -1} for version in video_info['videoVersions']: # These are the HTTP downloads, other types are for different manifests @@ -110,9 +130,8 @@ class VevoIE(InfoExtractor): }) return formats - def _formats_from_smil(self, smil_xml): + def _formats_from_smil(self, smil_doc): formats = [] - smil_doc = compat_etree_fromstring(smil_xml.encode('utf-8')) els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video') for el in els: src = el.attrib['src'] @@ -145,14 +164,14 @@ class VevoIE(InfoExtractor): }) return formats - def _download_api_formats(self, video_id): + def _download_api_formats(self, video_id, video_url): if not self._oauth_token: self._downloader.report_warning( 'No oauth token available, skipping API HLS download') return [] - api_url = 'https://apiv2.vevo.com/video/%s/streams/hls?token=%s' % ( - video_id, self._oauth_token) + api_url = compat_urlparse.urljoin(video_url, '//apiv2.vevo.com/video/%s/streams/hls?token=%s' % ( + video_id, self._oauth_token)) api_data = self._download_json( api_url, video_id, note='Downloading HLS formats', @@ -166,18 +185,26 @@ class VevoIE(InfoExtractor): preference=0) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) + + webpage = None json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id response = self._download_json(json_url, video_id) - video_info = response['video'] + video_info = response['video'] or {} - if not video_info: + if not video_info and response.get('statusCode') != 909: if 'statusMessage' in response: raise ExtractorError('%s said: %s' % (self.IE_NAME, response['statusMessage']), expected=True) raise ExtractorError('Unable to extract videos') + if not video_info: + if url.startswith('vevo:'): + raise ExtractorError('Please specify full Vevo URL for downloading', expected=True) + webpage = self._download_webpage(url, video_id) + + title = video_info.get('title') or self._og_search_title(webpage) + formats = self._formats_from_json(video_info) is_explicit = video_info.get('isExplicit') @@ -189,11 +216,11 @@ class VevoIE(InfoExtractor): age_limit = None # Download via HLS API - formats.extend(self._download_api_formats(video_id)) + formats.extend(self._download_api_formats(video_id, url)) # Download SMIL smil_blocks = sorted(( - f for f in video_info['videoVersions'] + f for f in video_info.get('videoVersions', []) if f['sourceType'] == 13), key=lambda f: f['version']) smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % ( @@ -205,23 +232,26 @@ class VevoIE(InfoExtractor): if smil_url_m is not None: smil_url = smil_url_m if smil_url: - smil_xml = self._download_webpage( - smil_url, video_id, 'Downloading SMIL info', fatal=False) - if smil_xml: - formats.extend(self._formats_from_smil(smil_xml)) + smil_doc = self._download_smil(smil_url, video_id, fatal=False) + if smil_doc: + formats.extend(self._formats_from_smil(smil_doc)) self._sort_formats(formats) - timestamp_ms = int_or_none(self._search_regex( + timestamp = int_or_none(self._search_regex( r'/Date\((\d+)\)/', - video_info['launchDate'], 'launch date', fatal=False)) + video_info['launchDate'], 'launch date', fatal=False), + scale=1000) if video_info else None + + duration = video_info.get('duration') or int_or_none( + self._html_search_meta('video:duration', webpage)) return { 'id': video_id, - 'title': video_info['title'], + 'title': title, 'formats': formats, - 'thumbnail': video_info['imageUrl'], - 'timestamp': timestamp_ms // 1000, - 'uploader': video_info['mainArtists'][0]['artistName'], - 'duration': video_info['duration'], + 'thumbnail': video_info.get('imageUrl'), + 'timestamp': timestamp, + 'uploader': video_info['mainArtists'][0]['artistName'] if video_info else None, + 'duration': duration, 'age_limit': age_limit, } diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index f38a72fde..129668a99 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -4,26 +4,48 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from .xstream import XstreamIE from ..utils import ( ExtractorError, float_or_none, ) -class VGTVIE(InfoExtractor): - IE_DESC = 'VGTV and BTTV' +class VGTVIE(XstreamIE): + IE_DESC = 'VGTV, BTTV, FTV, Aftenposten and Aftonbladet' + + _HOST_TO_APPNAME = { + 'vgtv.no': 'vgtv', + 'bt.no/tv': 'bttv', + 'aftenbladet.no/tv': 'satv', + 'fvn.no/fvntv': 'fvntv', + 'aftenposten.no/webtv': 'aptv', + } + + _APP_NAME_TO_VENDOR = { + 'vgtv': 'vgtv', + 'bttv': 'bt', + 'satv': 'sa', + 'fvntv': 'fvn', + 'aptv': 'ap', + } + _VALID_URL = r'''(?x) - (?: - vgtv:| - http://(?:www\.)? + (?:https?://(?:www\.)? + (?P<host> + %s ) - (?P<host>vgtv|bt) + / (?: - :| - \.no/(?:tv/)?\#!/(?:video|live)/ - ) - (?P<id>[0-9]+) - ''' + \#!/(?:video|live)/| + embed?.*id= + )| + (?P<appname> + %s + ):) + (?P<id>\d+) + ''' % ('|'.join(_HOST_TO_APPNAME.keys()), '|'.join(_APP_NAME_TO_VENDOR.keys())) + _TESTS = [ { # streamType: vod @@ -59,25 +81,37 @@ class VGTVIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + 'skip': 'Video is no longer available', }, { - # streamType: live + # streamType: wasLive 'url': 'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla', + 'md5': '458f4841239dab414343b50e5af8869c', 'info_dict': { 'id': '113063', 'ext': 'flv', - 'title': 're:^DIREKTE: V75 fra Solvalla [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 'V75 fra Solvalla 30.05.15', 'description': 'md5:b3743425765355855f88e096acc93231', 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 0, + 'duration': 25966, 'timestamp': 1432975582, 'upload_date': '20150530', 'view_count': int, }, - 'params': { - # m3u8 download - 'skip_download': True, - }, + }, + { + 'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', + 'md5': 'fd828cd29774a729bf4d4425fe192972', + 'info_dict': { + 'id': '21039', + 'ext': 'mov', + 'title': 'TRAILER: «SWEATSHOP» - I can´t take any more', + 'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238', + 'duration': 66, + 'timestamp': 1417002452, + 'upload_date': '20141126', + 'view_count': int, + } }, { 'url': 'http://www.bt.no/tv/#!/video/100250/norling-dette-er-forskjellen-paa-1-divisjon-og-eliteserien', @@ -89,21 +123,27 @@ class VGTVIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') host = mobj.group('host') - - HOST_WEBSITES = { - 'vgtv': 'vgtv', - 'bt': 'bttv', - } + appname = self._HOST_TO_APPNAME[host] if host else mobj.group('appname') + vendor = self._APP_NAME_TO_VENDOR[appname] data = self._download_json( 'http://svp.vg.no/svp/api/v1/%s/assets/%s?appName=%s-website' - % (host, video_id, HOST_WEBSITES[host]), + % (vendor, video_id, appname), video_id, 'Downloading media JSON') if data.get('status') == 'inactive': raise ExtractorError( 'Video %s is no longer available' % video_id, expected=True) + info = { + 'formats': [], + } + if len(video_id) == 5: + if appname == 'bttv': + info = self._extract_video_info('btno', video_id) + elif appname == 'aptv': + info = self._extract_video_info('ap', video_id) + streams = data['streamUrls'] stream_type = data.get('streamType') @@ -111,48 +151,56 @@ class VGTVIE(InfoExtractor): hls_url = streams.get('hls') if hls_url: - formats.extend(self._extract_m3u8_formats( - hls_url, video_id, 'mp4', m3u8_id='hls')) + m3u8_formats = self._extract_m3u8_formats( + hls_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) hds_url = streams.get('hds') - # wasLive hds are always 404 - if hds_url and stream_type != 'wasLive': - formats.extend(self._extract_f4m_formats( - hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', - video_id, f4m_id='hds')) + if hds_url: + hdcore_sign = 'hdcore=3.7.0' + f4m_formats = self._extract_f4m_formats( + hds_url + '?%s' % hdcore_sign, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + for entry in f4m_formats: + # URLs without the extra param induce an 404 error + entry.update({'extra_param_to_segment_url': hdcore_sign}) + formats.append(entry) + mp4_urls = streams.get('pseudostreaming') or [] mp4_url = streams.get('mp4') if mp4_url: - _url = hls_url or hds_url - MP4_URL_TEMPLATE = '%s/%%s.%s' % (mp4_url.rpartition('/')[0], mp4_url.rpartition('.')[-1]) - for mp4_format in _url.split(','): - m = re.search('(?P<width>\d+)_(?P<height>\d+)_(?P<vbr>\d+)', mp4_format) - if not m: - continue - width = int(m.group('width')) - height = int(m.group('height')) - vbr = int(m.group('vbr')) - formats.append({ - 'url': MP4_URL_TEMPLATE % mp4_format, - 'format_id': 'mp4-%s' % vbr, - 'width': width, - 'height': height, - 'vbr': vbr, - 'preference': 1, + mp4_urls.append(mp4_url) + for mp4_url in mp4_urls: + format_info = { + 'url': mp4_url, + } + mobj = re.search('(\d+)_(\d+)_(\d+)', mp4_url) + if mobj: + tbr = int(mobj.group(3)) + format_info.update({ + 'width': int(mobj.group(1)), + 'height': int(mobj.group(2)), + 'tbr': tbr, + 'format_id': 'mp4-%s' % tbr, }) - self._sort_formats(formats) + formats.append(format_info) + + info['formats'].extend(formats) - return { + self._sort_formats(info['formats']) + + info.update({ 'id': video_id, - 'title': self._live_title(data['title']), + 'title': self._live_title(data['title']) if stream_type == 'live' else data['title'], 'description': data['description'], 'thumbnail': data['images']['main'] + '?t[]=900x506q80', 'timestamp': data['published'], 'duration': float_or_none(data['duration'], 1000), 'view_count': data['displays'], - 'formats': formats, 'is_live': True if stream_type == 'live' else False, - } + }) + return info class BTArticleIE(InfoExtractor): @@ -161,7 +209,7 @@ class BTArticleIE(InfoExtractor): _VALID_URL = 'http://(?:www\.)?bt\.no/(?:[^/]+/)+(?P<id>[^/]+)-\d+\.html' _TEST = { 'url': 'http://www.bt.no/nyheter/lokalt/Kjemper-for-internatet-1788214.html', - 'md5': 'd055e8ee918ef2844745fcfd1a4175fb', + 'md5': '2acbe8ad129b3469d5ae51b1158878df', 'info_dict': { 'id': '23199', 'ext': 'mp4', @@ -178,15 +226,15 @@ class BTArticleIE(InfoExtractor): def _real_extract(self, url): webpage = self._download_webpage(url, self._match_id(url)) video_id = self._search_regex( - r'SVP\.Player\.load\(\s*(\d+)', webpage, 'video id') - return self.url_result('vgtv:bt:%s' % video_id, 'VGTV') + r'<video[^>]+data-id="(\d+)"', webpage, 'video id') + return self.url_result('bttv:%s' % video_id, 'VGTV') class BTVestlendingenIE(InfoExtractor): IE_NAME = 'bt:vestlendingen' IE_DESC = 'Bergens Tidende - Vestlendingen' _VALID_URL = 'http://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.bt.no/spesial/vestlendingen/#!/86588', 'md5': 'd7d17e3337dc80de6d3a540aefbe441b', 'info_dict': { @@ -197,7 +245,19 @@ class BTVestlendingenIE(InfoExtractor): 'timestamp': 1430473209, 'upload_date': '20150501', }, - } + 'skip': '404 Error', + }, { + 'url': 'http://www.bt.no/spesial/vestlendingen/#!/86255', + 'md5': 'a2893f8632e96389f4bdf36aa9463ceb', + 'info_dict': { + 'id': '86255', + 'ext': 'mov', + 'title': 'Du må tåle å fryse og være sulten', + 'description': 'md5:b8046f4d022d5830ddab04865791d063', + 'upload_date': '20150321', + 'timestamp': 1426942023, + }, + }] def _real_extract(self, url): - return self.url_result('xstream:btno:%s' % self._match_id(url), 'Xstream') + return self.url_result('bttv:%s' % self._match_id(url), 'VGTV') diff --git a/youtube_dl/extractor/videoweed.py b/youtube_dl/extractor/videoweed.py deleted file mode 100644 index ca2e50935..000000000 --- a/youtube_dl/extractor/videoweed.py +++ /dev/null @@ -1,26 +0,0 @@ -from __future__ import unicode_literals - -from .novamov import NovaMovIE - - -class VideoWeedIE(NovaMovIE): - IE_NAME = 'videoweed' - IE_DESC = 'VideoWeed' - - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'videoweed\.(?:es|com)'} - - _HOST = 'www.videoweed.es' - - _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' - _TITLE_REGEX = r'<h1 class="text_shadow">([^<]+)</h1>' - - _TEST = { - 'url': 'http://www.videoweed.es/file/b42178afbea14', - 'md5': 'abd31a2132947262c50429e1d16c1bfd', - 'info_dict': { - 'id': 'b42178afbea14', - 'ext': 'flv', - 'title': 'optical illusion dissapeared image magic illusion', - 'description': '' - }, - } diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index a63c23617..9a1c377a4 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -30,6 +30,12 @@ class VikiBaseIE(InfoExtractor): _token = None + _ERRORS = { + 'geo': 'Sorry, this content is not available in your region.', + 'upcoming': 'Sorry, this content is not yet available.', + # 'paywall': 'paywall', + } + def _prepare_call(self, path, timestamp=None, post_data=None): path += '?' if '?' not in path else '&' if not timestamp: @@ -67,6 +73,12 @@ class VikiBaseIE(InfoExtractor): '%s returned error: %s' % (self.IE_NAME, error), expected=True) + def _check_errors(self, data): + for reason, status in data.get('blocking', {}).items(): + if status and reason in self._ERRORS: + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, self._ERRORS[reason]), expected=True) + def _real_initialize(self): self._login() @@ -193,6 +205,7 @@ class VikiIE(VikiBaseIE): 'timestamp': 1321985454, 'description': 'md5:44b1e46619df3a072294645c770cef36', 'title': 'Love In Magic', + 'age_limit': 13, }, }] @@ -202,6 +215,8 @@ class VikiIE(VikiBaseIE): video = self._call_api( 'videos/%s.json' % video_id, video_id, 'Downloading video JSON') + self._check_errors(video) + title = self.dict_selection(video.get('titles', {}), 'en') if not title: title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id @@ -262,8 +277,11 @@ class VikiIE(VikiBaseIE): r'^(\d+)[pP]$', format_id, 'height', default=None)) for protocol, format_dict in stream_dict.items(): if format_id == 'm3u8': - formats = self._extract_m3u8_formats( - format_dict['url'], video_id, 'mp4', m3u8_id='m3u8-%s' % protocol) + m3u8_formats = self._extract_m3u8_formats( + format_dict['url'], video_id, 'mp4', 'm3u8_native', + m3u8_id='m3u8-%s' % protocol, fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) else: formats.append({ 'url': format_dict['url'], @@ -315,6 +333,8 @@ class VikiChannelIE(VikiBaseIE): 'containers/%s.json' % channel_id, channel_id, 'Downloading channel JSON') + self._check_errors(channel) + title = self.dict_selection(channel['titles'], 'en') description = self.dict_selection(channel['descriptions'], 'en') diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index f392ccf1c..ce08e6955 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -23,6 +23,7 @@ from ..utils import ( unsmuggle_url, urlencode_postdata, unescapeHTML, + parse_filesize, ) @@ -185,6 +186,20 @@ class VimeoIE(VimeoBaseInfoExtractor): }, }, { + # contains original format + 'url': 'https://vimeo.com/33951933', + 'md5': '53c688fa95a55bf4b7293d37a89c5c53', + 'info_dict': { + 'id': '33951933', + 'ext': 'mp4', + 'title': 'FOX CLASSICS - Forever Classic ID - A Full Minute', + 'uploader': 'The DMCI', + 'uploader_id': 'dmci', + 'upload_date': '20111220', + 'description': 'md5:ae23671e82d05415868f7ad1aec21147', + }, + }, + { 'url': 'https://vimeo.com/109815029', 'note': 'Video not completely processed, "failed" seed status', 'only_matching': True, @@ -392,6 +407,21 @@ class VimeoIE(VimeoBaseInfoExtractor): comment_count = None formats = [] + download_request = sanitized_Request('https://vimeo.com/%s?action=load_download_config' % video_id, headers={ + 'X-Requested-With': 'XMLHttpRequest'}) + download_data = self._download_json(download_request, video_id, fatal=False) + if download_data: + source_file = download_data.get('source_file') + if source_file and not source_file.get('is_cold') and not source_file.get('is_defrosting'): + formats.append({ + 'url': source_file['download_url'], + 'ext': source_file['extension'].lower(), + 'width': int_or_none(source_file.get('width')), + 'height': int_or_none(source_file.get('height')), + 'filesize': parse_filesize(source_file.get('size')), + 'format_id': source_file.get('public_name', 'Original'), + 'preference': 1, + }) config_files = config['video'].get('files') or config['request'].get('files', {}) for f in config_files.get('progressive', []): video_url = f.get('url') @@ -408,12 +438,12 @@ class VimeoIE(VimeoBaseInfoExtractor): m3u8_url = config_files.get('hls', {}).get('url') if m3u8_url: m3u8_formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', 0, 'hls', fatal=False) + m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) if m3u8_formats: formats.extend(m3u8_formats) # Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps # at the same time without actual units specified. This lead to wrong sorting. - self._sort_formats(formats, field_preference=('height', 'width', 'fps', 'format_id')) + self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'format_id')) subtitles = {} text_tracks = config['request'].get('text_tracks') diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index d99a42a9f..90557fa61 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -18,6 +18,7 @@ from ..utils import ( unified_strdate, ) from .vimeo import VimeoIE +from .pladform import PladformIE class VKIE(InfoExtractor): @@ -164,6 +165,11 @@ class VKIE(InfoExtractor): # vk wrapper 'url': 'http://www.biqle.ru/watch/847655_160197695', 'only_matching': True, + }, + { + # pladform embed + 'url': 'https://vk.com/video-76116461_171554880', + 'only_matching': True, } ] @@ -254,10 +260,13 @@ class VKIE(InfoExtractor): if vimeo_url is not None: return self.url_result(vimeo_url) + pladform_url = PladformIE._extract_url(info_page) + if pladform_url: + return self.url_result(pladform_url) + m_rutube = re.search( r'\ssrc="((?:https?:)?//rutube\.ru\\?/video\\?/embed(?:.*?))\\?"', info_page) if m_rutube is not None: - self.to_screen('rutube video detected') rutube_url = self._proto_relative_url( m_rutube.group(1).replace('\\', '')) return self.url_result(rutube_url) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index b46802306..ef096cbd2 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -10,8 +10,8 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( - determine_ext, unified_strdate, + qualities, ) @@ -33,6 +33,7 @@ class WDRIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Page Not Found', }, { 'url': 'http://www1.wdr.de/themen/av/videomargaspiegelisttot101-videoplayer.html', @@ -47,6 +48,7 @@ class WDRIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Page Not Found', }, { 'url': 'http://www1.wdr.de/themen/kultur/audioerlebtegeschichtenmargaspiegel100-audioplayer.html', @@ -71,6 +73,7 @@ class WDRIE(InfoExtractor): 'upload_date': '20140717', 'is_live': False }, + 'skip': 'Page Not Found', }, { 'url': 'http://www1.wdr.de/mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100.html', @@ -83,10 +86,10 @@ class WDRIE(InfoExtractor): 'url': 'http://www1.wdr.de/mediathek/video/livestream/index.html', 'info_dict': { 'id': 'mdb-103364', - 'title': 're:^WDR Fernsehen [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^WDR Fernsehen Live [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9', 'ext': 'flv', - 'upload_date': '20150212', + 'upload_date': '20150101', 'is_live': True }, 'params': { @@ -150,25 +153,52 @@ class WDRIE(InfoExtractor): if upload_date: upload_date = unified_strdate(upload_date) + formats = [] + preference = qualities(['S', 'M', 'L', 'XL']) + if video_url.endswith('.f4m'): - video_url += '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18' - ext = 'flv' + f4m_formats = self._extract_f4m_formats(video_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', page_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) elif video_url.endswith('.smil'): - fmt = self._extract_smil_formats(video_url, page_id)[0] - video_url = fmt['url'] - sep = '&' if '?' in video_url else '?' - video_url += sep - video_url += 'hdcore=3.3.0&plugin=aasp-3.3.0.99.43' - ext = fmt['ext'] + smil_formats = self._extract_smil_formats(video_url, page_id, False, { + 'hdcore': '3.3.0', + 'plugin': 'aasp-3.3.0.99.43', + }) + if smil_formats: + formats.extend(smil_formats) else: - ext = determine_ext(video_url) + formats.append({ + 'url': video_url, + 'http_headers': { + 'User-Agent': 'mobile', + }, + }) + + m3u8_url = self._search_regex(r'rel="adaptiv"[^>]+href="([^"]+)"', webpage, 'm3u8 url', default=None) + if m3u8_url: + m3u8_formats = self._extract_m3u8_formats(m3u8_url, page_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + + direct_urls = re.findall(r'rel="web(S|M|L|XL)"[^>]+href="([^"]+)"', webpage) + if direct_urls: + for quality, video_url in direct_urls: + formats.append({ + 'url': video_url, + 'preference': preference(quality), + 'http_headers': { + 'User-Agent': 'mobile', + }, + }) + + self._sort_formats(formats) description = self._html_search_meta('Description', webpage, 'description') return { 'id': page_id, - 'url': video_url, - 'ext': ext, + 'formats': formats, 'title': title, 'description': description, 'thumbnail': thumbnail, diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index e4f50e64c..041ff6c55 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -5,7 +5,7 @@ from .youtube import YoutubeIE class WimpIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?wimp\.com/(?P<id>[^/]+)/' + _VALID_URL = r'http://(?:www\.)?wimp\.com/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.wimp.com/maruexhausted/', 'md5': 'ee21217ffd66d058e8b16be340b74883', @@ -28,18 +28,23 @@ class WimpIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - video_url = self._search_regex( - [r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", r"videoId\s*:\s*[\"']([^\"']+)[\"']"], - webpage, 'video URL') - if YoutubeIE.suitable(video_url): - self.to_screen('Found YouTube video') + + youtube_id = self._search_regex( + r"videoId\s*:\s*[\"']([0-9A-Za-z_-]{11})[\"']", + webpage, 'video URL', default=None) + if youtube_id: return { '_type': 'url', - 'url': video_url, + 'url': youtube_id, 'ie_key': YoutubeIE.ie_key(), } + video_url = self._search_regex( + r'<video[^>]+>\s*<source[^>]+src=(["\'])(?P<url>.+?)\1', + webpage, 'video URL', group='url') + return { 'id': video_id, 'url': video_url, diff --git a/youtube_dl/extractor/xstream.py b/youtube_dl/extractor/xstream.py index 71584c291..76c91bd92 100644 --- a/youtube_dl/extractor/xstream.py +++ b/youtube_dl/extractor/xstream.py @@ -42,11 +42,7 @@ class XstreamIE(InfoExtractor): 'only_matching': True, }] - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - partner_id = mobj.group('partner_id') - video_id = mobj.group('id') - + def _extract_video_info(self, partner_id, video_id): data = self._download_xml( 'http://frontend.xstream.dk/%s/feed/video/?platform=web&id=%s' % (partner_id, video_id), @@ -97,6 +93,7 @@ class XstreamIE(InfoExtractor): formats.append({ 'url': link.get('href'), 'format_id': link.get('rel'), + 'preference': 1, }) thumbnails = [{ @@ -113,3 +110,10 @@ class XstreamIE(InfoExtractor): 'formats': formats, 'thumbnails': thumbnails, } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + partner_id = mobj.group('partner_id') + video_id = mobj.group('id') + + return self._extract_video_info(partner_id, video_id) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 69ecc837a..3a3432be8 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -25,8 +25,8 @@ class YoukuIE(InfoExtractor): ''' _TESTS = [{ + # MD5 is unstable 'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html', - 'md5': '5f3af4192eabacc4501508d54a8cabd7', 'info_dict': { 'id': 'XMTc1ODE5Njcy_part1', 'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.', @@ -42,6 +42,7 @@ class YoukuIE(InfoExtractor): 'title': '武媚娘传奇 85', }, 'playlist_count': 11, + 'skip': 'Available in China only', }, { 'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html', 'info_dict': { @@ -49,7 +50,6 @@ class YoukuIE(InfoExtractor): 'title': '花千骨 04', }, 'playlist_count': 13, - 'skip': 'Available in China only', }, { 'url': 'http://v.youku.com/v_show/id_XNjA1NzA2Njgw.html', 'note': 'Video protected with password', @@ -63,7 +63,7 @@ class YoukuIE(InfoExtractor): }, }] - def construct_video_urls(self, data1, data2): + def construct_video_urls(self, data): # get sid, token def yk_t(s1, s2): ls = list(range(256)) @@ -81,34 +81,24 @@ class YoukuIE(InfoExtractor): return bytes(s) sid, token = yk_t( - b'becaf9be', base64.b64decode(data2['ep'].encode('ascii')) + b'becaf9be', base64.b64decode(data['security']['encrypt_string'].encode('ascii')) ).decode('ascii').split('_') # get oip - oip = data2['ip'] - - # get fileid - string_ls = list( - 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890') - shuffled_string_ls = [] - seed = data1['seed'] - N = len(string_ls) - for ii in range(N): - seed = (seed * 0xd3 + 0x754f) % 0x10000 - idx = seed * len(string_ls) // 0x10000 - shuffled_string_ls.append(string_ls[idx]) - del string_ls[idx] + oip = data['security']['ip'] fileid_dict = {} - for format in data1['streamtypes']: - streamfileid = [ - int(i) for i in data1['streamfileids'][format].strip('*').split('*')] - fileid = ''.join( - [shuffled_string_ls[i] for i in streamfileid]) - fileid_dict[format] = fileid[:8] + '%s' + fileid[10:] + for stream in data['stream']: + format = stream.get('stream_type') + fileid = stream['stream_fileid'] + fileid_dict[format] = fileid def get_fileid(format, n): - fileid = fileid_dict[format] % hex(int(n))[2:].upper().zfill(2) + number = hex(int(str(n), 10))[2:].upper() + if len(number) == 1: + number = '0' + number + streamfileids = fileid_dict[format] + fileid = streamfileids[0:8] + number + streamfileids[10:] return fileid # get ep @@ -123,15 +113,15 @@ class YoukuIE(InfoExtractor): # generate video_urls video_urls_dict = {} - for format in data1['streamtypes']: + for stream in data['stream']: + format = stream.get('stream_type') video_urls = [] - for dt in data1['segs'][format]: - n = str(int(dt['no'])) + for dt in stream['segs']: + n = str(stream['segs'].index(dt)) param = { - 'K': dt['k'], + 'K': dt['key'], 'hd': self.get_hd(format), 'myp': 0, - 'ts': dt['seconds'], 'ypp': 0, 'ctype': 12, 'ev': 1, @@ -142,7 +132,7 @@ class YoukuIE(InfoExtractor): video_url = \ 'http://k.youku.com/player/getFlvPath/' + \ 'sid/' + sid + \ - '_' + str(int(n) + 1).zfill(2) + \ + '_00' + \ '/st/' + self.parse_ext_l(format) + \ '/fileid/' + get_fileid(format, n) + '?' + \ compat_urllib_parse.urlencode(param) @@ -153,23 +143,31 @@ class YoukuIE(InfoExtractor): def get_hd(self, fm): hd_id_dict = { + '3gp': '0', + '3gphd': '1', 'flv': '0', + 'flvhd': '0', 'mp4': '1', + 'mp4hd': '1', + 'mp4hd2': '1', + 'mp4hd3': '1', 'hd2': '2', 'hd3': '3', - '3gp': '0', - '3gphd': '1' } return hd_id_dict[fm] def parse_ext_l(self, fm): ext_dict = { + '3gp': 'flv', + '3gphd': 'mp4', 'flv': 'flv', + 'flvhd': 'flv', 'mp4': 'mp4', + 'mp4hd': 'mp4', + 'mp4hd2': 'flv', + 'mp4hd3': 'flv', 'hd2': 'flv', 'hd3': 'flv', - '3gp': 'flv', - '3gphd': 'mp4' } return ext_dict[fm] @@ -178,9 +176,13 @@ class YoukuIE(InfoExtractor): '3gp': 'h6', '3gphd': 'h5', 'flv': 'h4', + 'flvhd': 'h4', 'mp4': 'h3', + 'mp4hd': 'h3', + 'mp4hd2': 'h4', + 'mp4hd3': 'h4', 'hd2': 'h2', - 'hd3': 'h1' + 'hd3': 'h1', } return _dict[fm] @@ -188,45 +190,46 @@ class YoukuIE(InfoExtractor): video_id = self._match_id(url) def retrieve_data(req_url, note): - req = sanitized_Request(req_url) + headers = { + 'Referer': req_url, + } + self._set_cookie('youku.com', 'xreferrer', 'http://www.youku.com') + req = sanitized_Request(req_url, headers=headers) cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') if cn_verification_proxy: req.add_header('Ytdl-request-proxy', cn_verification_proxy) raw_data = self._download_json(req, video_id, note=note) - return raw_data['data'][0] + + return raw_data['data'] video_password = self._downloader.params.get('videopassword', None) # request basic data - basic_data_url = 'http://v.youku.com/player/getPlayList/VideoIDS/%s' % video_id + basic_data_url = "http://play.youku.com/play/get.json?vid=%s&ct=12" % video_id if video_password: - basic_data_url += '?password=%s' % video_password - - data1 = retrieve_data( - basic_data_url, - 'Downloading JSON metadata 1') - data2 = retrieve_data( - 'http://v.youku.com/player/getPlayList/VideoIDS/%s/Pf/4/ctype/12/ev/1' % video_id, - 'Downloading JSON metadata 2') - - error_code = data1.get('error_code') - if error_code: - error = data1.get('error') - if error is not None and '因版权原因无法观看此视频' in error: + basic_data_url += '&pwd=%s' % video_password + + data = retrieve_data(basic_data_url, 'Downloading JSON metadata') + + error = data.get('error') + if error: + error_note = error.get('note') + if error_note is not None and '因版权原因无法观看此视频' in error_note: raise ExtractorError( 'Youku said: Sorry, this video is available in China only', expected=True) else: - msg = 'Youku server reported error %i' % error_code - if error is not None: - msg += ': ' + error + msg = 'Youku server reported error %i' % error.get('code') + if error_note is not None: + msg += ': ' + error_note raise ExtractorError(msg) - title = data1['title'] + # get video title + title = data['video']['title'] # generate video_urls_dict - video_urls_dict = self.construct_video_urls(data1, data2) + video_urls_dict = self.construct_video_urls(data) # construct info entries = [{ @@ -235,10 +238,11 @@ class YoukuIE(InfoExtractor): 'formats': [], # some formats are not available for all parts, we have to detect # which one has all - } for i in range(max(len(v) for v in data1['segs'].values()))] - for fm in data1['streamtypes']: + } for i in range(max(len(v.get('segs')) for v in data['stream']))] + for stream in data['stream']: + fm = stream.get('stream_type') video_urls = video_urls_dict[fm] - for video_url, seg, entry in zip(video_urls, data1['segs'][fm], entries): + for video_url, seg, entry in zip(video_urls, stream['segs'], entries): entry['formats'].append({ 'url': video_url, 'format_id': self.get_format_name(fm), diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9b39505ba..4aac2cc03 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -26,6 +26,7 @@ from ..compat import ( from ..utils import ( clean_html, encode_dict, + error_to_compat_str, ExtractorError, float_or_none, get_element_by_attribute, @@ -33,6 +34,7 @@ from ..utils import ( int_or_none, orderedSet, parse_duration, + remove_quotes, remove_start, sanitized_Request, smuggle_url, @@ -395,12 +397,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'upload_date': '20120506', 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', + 'alt_title': 'I Love It (feat. Charli XCX)', 'description': 'md5:782e8651347686cba06e58f71ab51773', 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', 'iconic ep', 'iconic', 'love', 'it'], 'uploader': 'Icona Pop', 'uploader_id': 'IconaPop', + 'creator': 'Icona Pop', } }, { @@ -411,9 +415,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'upload_date': '20130703', 'title': 'Justin Timberlake - Tunnel Vision (Explicit)', + 'alt_title': 'Tunnel Vision', 'description': 'md5:64249768eec3bc4276236606ea996373', 'uploader': 'justintimberlakeVEVO', 'uploader_id': 'justintimberlakeVEVO', + 'creator': 'Justin Timberlake', 'age_limit': 18, } }, @@ -492,10 +498,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'nfWlot6h_JM', 'ext': 'm4a', 'title': 'Taylor Swift - Shake It Off', + 'alt_title': 'Shake It Off', 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3', 'uploader': 'TaylorSwiftVEVO', 'uploader_id': 'TaylorSwiftVEVO', 'upload_date': '20140818', + 'creator': 'Taylor Swift', }, 'params': { 'youtube_include_dash_manifest': True, @@ -551,9 +559,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'upload_date': '20100430', 'uploader_id': 'deadmau5', + 'creator': 'deadmau5', 'description': 'md5:12c56784b8032162bb936a5f76d55360', 'uploader': 'deadmau5', 'title': 'Deadmau5 - Some Chords (HD)', + 'alt_title': 'Some Chords', }, 'expected_warnings': [ 'DASH manifest missing', @@ -701,10 +711,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'lsguqyKfVQg', 'ext': 'mp4', 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', + 'alt_title': 'Dark Walk', 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', 'upload_date': '20151119', 'uploader_id': 'IronSoulElf', 'uploader': 'IronSoulElf', + 'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan', }, 'params': { 'skip_download': True, @@ -892,7 +904,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, video_id, note=False) except ExtractorError as err: - self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err)) + self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err)) return {} sub_lang_list = {} @@ -1308,6 +1320,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) upload_date = unified_strdate(upload_date) + m_music = re.search( + r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) by (?P<creator>.+?)(?:\(.+?\))?</li', + video_webpage) + if m_music: + video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) + video_creator = clean_html(m_music.group('creator')) + else: + video_alt_title = video_creator = None + m_cat_container = self._search_regex( r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', video_webpage, 'categories', default=None) @@ -1537,7 +1558,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': video_uploader, 'uploader_id': video_uploader_id, 'upload_date': upload_date, + 'creator': video_creator, 'title': video_title, + 'alt_title': video_alt_title, 'thumbnail': video_thumbnail, 'description': video_description, 'categories': video_categories, @@ -1752,6 +1775,10 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): }, }] + @classmethod + def suitable(cls, url): + return False if YoutubePlaylistsIE.suitable(url) else super(YoutubeChannelIE, cls).suitable(url) + def _real_extract(self, url): channel_id = self._match_id(url) @@ -1825,10 +1852,10 @@ class YoutubeUserIE(YoutubeChannelIE): return super(YoutubeUserIE, cls).suitable(url) -class YoutubeUserPlaylistsIE(YoutubePlaylistsBaseInfoExtractor): - IE_DESC = 'YouTube.com user playlists' - _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/user/(?P<id>[^/]+)/playlists' - IE_NAME = 'youtube:user:playlists' +class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): + IE_DESC = 'YouTube.com user/channel playlists' + _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists' + IE_NAME = 'youtube:playlists' _TESTS = [{ 'url': 'http://www.youtube.com/user/ThirstForScience/playlists', @@ -1845,6 +1872,13 @@ class YoutubeUserPlaylistsIE(YoutubePlaylistsBaseInfoExtractor): 'id': 'igorkle1', 'title': 'Игорь Клейнер', }, + }, { + 'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists', + 'playlist_mincount': 17, + 'info_dict': { + 'id': 'UCiU1dHvZObB2iP6xkJ__Icw', + 'title': 'Chem Player', + }, }] diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index a795f56b3..92c12bac6 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -10,106 +10,16 @@ from ..utils import ( unified_strdate, OnDemandPagedList, xpath_text, + determine_ext, + qualities, + float_or_none, ) -def extract_from_xml_url(ie, video_id, xml_url): - doc = ie._download_xml( - xml_url, video_id, - note='Downloading video info', - errnote='Failed to download video info') - - title = doc.find('.//information/title').text - description = xpath_text(doc, './/information/detail', 'description') - duration = int_or_none(xpath_text(doc, './/details/lengthSec', 'duration')) - uploader = xpath_text(doc, './/details/originChannelTitle', 'uploader') - uploader_id = xpath_text(doc, './/details/originChannelId', 'uploader id') - upload_date = unified_strdate(xpath_text(doc, './/details/airtime', 'upload date')) - - def xml_to_format(fnode): - video_url = fnode.find('url').text - is_available = 'http://www.metafilegenerator' not in video_url - - format_id = fnode.attrib['basetype'] - format_m = re.match(r'''(?x) - (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_ - (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+) - ''', format_id) - - ext = format_m.group('container') - proto = format_m.group('proto').lower() - - quality = xpath_text(fnode, './quality', 'quality') - abr = int_or_none(xpath_text(fnode, './audioBitrate', 'abr'), 1000) - vbr = int_or_none(xpath_text(fnode, './videoBitrate', 'vbr'), 1000) - - width = int_or_none(xpath_text(fnode, './width', 'width')) - height = int_or_none(xpath_text(fnode, './height', 'height')) - - filesize = int_or_none(xpath_text(fnode, './filesize', 'filesize')) - - format_note = '' - if not format_note: - format_note = None - - return { - 'format_id': format_id + '-' + quality, - 'url': video_url, - 'ext': ext, - 'acodec': format_m.group('acodec'), - 'vcodec': format_m.group('vcodec'), - 'abr': abr, - 'vbr': vbr, - 'width': width, - 'height': height, - 'filesize': filesize, - 'format_note': format_note, - 'protocol': proto, - '_available': is_available, - } - - def xml_to_thumbnails(fnode): - thumbnails = [] - for node in fnode: - thumbnail_url = node.text - if not thumbnail_url: - continue - thumbnail = { - 'url': thumbnail_url, - } - if 'key' in node.attrib: - m = re.match('^([0-9]+)x([0-9]+)$', node.attrib['key']) - if m: - thumbnail['width'] = int(m.group(1)) - thumbnail['height'] = int(m.group(2)) - thumbnails.append(thumbnail) - return thumbnails - - thumbnails = xml_to_thumbnails(doc.findall('.//teaserimages/teaserimage')) - - format_nodes = doc.findall('.//formitaeten/formitaet') - formats = list(filter( - lambda f: f['_available'], - map(xml_to_format, format_nodes))) - ie._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'thumbnails': thumbnails, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'upload_date': upload_date, - 'formats': formats, - } - - class ZDFIE(InfoExtractor): _VALID_URL = r'(?:zdf:|zdf:video:|https?://www\.zdf\.de/ZDFmediathek(?:#)?/(.*beitrag/(?:video/)?))(?P<id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?' - _TEST = { + _TESTS = [{ 'url': 'http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt', 'info_dict': { 'id': '2037704', @@ -122,23 +32,183 @@ class ZDFIE(InfoExtractor): 'upload_date': '20131127', }, 'skip': 'Videos on ZDF.de are depublicised in short order', - } + }] + + def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): + param_groups = {} + for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)): + group_id = param_group.attrib.get(self._xpath_ns('id', 'http://www.w3.org/XML/1998/namespace')) + params = {} + for param in param_group: + params[param.get('name')] = param.get('value') + param_groups[group_id] = params + + formats = [] + for video in smil.findall(self._xpath_ns('.//video', namespace)): + src = video.get('src') + if not src: + continue + bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) + group_id = video.get('paramGroup') + param_group = param_groups[group_id] + for proto in param_group['protocols'].split(','): + formats.append({ + 'url': '%s://%s' % (proto, param_group['host']), + 'app': param_group['app'], + 'play_path': src, + 'ext': 'flv', + 'format_id': '%s-%d' % (proto, bitrate), + 'tbr': bitrate, + 'protocol': proto, + }) + self._sort_formats(formats) + return formats + + def extract_from_xml_url(self, video_id, xml_url): + doc = self._download_xml( + xml_url, video_id, + note='Downloading video info', + errnote='Failed to download video info') + + title = doc.find('.//information/title').text + description = xpath_text(doc, './/information/detail', 'description') + duration = int_or_none(xpath_text(doc, './/details/lengthSec', 'duration')) + uploader = xpath_text(doc, './/details/originChannelTitle', 'uploader') + uploader_id = xpath_text(doc, './/details/originChannelId', 'uploader id') + upload_date = unified_strdate(xpath_text(doc, './/details/airtime', 'upload date')) + + def xml_to_thumbnails(fnode): + thumbnails = [] + for node in fnode: + thumbnail_url = node.text + if not thumbnail_url: + continue + thumbnail = { + 'url': thumbnail_url, + } + if 'key' in node.attrib: + m = re.match('^([0-9]+)x([0-9]+)$', node.attrib['key']) + if m: + thumbnail['width'] = int(m.group(1)) + thumbnail['height'] = int(m.group(2)) + thumbnails.append(thumbnail) + return thumbnails + + thumbnails = xml_to_thumbnails(doc.findall('.//teaserimages/teaserimage')) + + format_nodes = doc.findall('.//formitaeten/formitaet') + quality = qualities(['veryhigh', 'high', 'med', 'low']) + + def get_quality(elem): + return quality(xpath_text(elem, 'quality')) + format_nodes.sort(key=get_quality) + format_ids = [] + formats = [] + for fnode in format_nodes: + video_url = fnode.find('url').text + is_available = 'http://www.metafilegenerator' not in video_url + if not is_available: + continue + format_id = fnode.attrib['basetype'] + quality = xpath_text(fnode, './quality', 'quality') + format_m = re.match(r'''(?x) + (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_ + (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+) + ''', format_id) + + ext = determine_ext(video_url, None) or format_m.group('container') + if ext not in ('smil', 'f4m', 'm3u8'): + format_id = format_id + '-' + quality + if format_id in format_ids: + continue + + if ext == 'meta': + continue + elif ext == 'smil': + smil_formats = self._extract_smil_formats( + video_url, video_id, fatal=False) + if smil_formats: + formats.extend(smil_formats) + elif ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + elif ext == 'f4m': + f4m_formats = self._extract_f4m_formats( + video_url, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + else: + proto = format_m.group('proto').lower() + + abr = int_or_none(xpath_text(fnode, './audioBitrate', 'abr'), 1000) + vbr = int_or_none(xpath_text(fnode, './videoBitrate', 'vbr'), 1000) + + width = int_or_none(xpath_text(fnode, './width', 'width')) + height = int_or_none(xpath_text(fnode, './height', 'height')) + + filesize = int_or_none(xpath_text(fnode, './filesize', 'filesize')) + + format_note = '' + if not format_note: + format_note = None + + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'ext': ext, + 'acodec': format_m.group('acodec'), + 'vcodec': format_m.group('vcodec'), + 'abr': abr, + 'vbr': vbr, + 'width': width, + 'height': height, + 'filesize': filesize, + 'format_note': format_note, + 'protocol': proto, + '_available': is_available, + }) + format_ids.append(format_id) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'thumbnails': thumbnails, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'upload_date': upload_date, + 'formats': formats, + } def _real_extract(self, url): video_id = self._match_id(url) xml_url = 'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id - return extract_from_xml_url(self, video_id, xml_url) + return self.extract_from_xml_url(video_id, xml_url) class ZDFChannelIE(InfoExtractor): - _VALID_URL = r'(?:zdf:topic:|https?://www\.zdf\.de/ZDFmediathek(?:#)?/.*kanaluebersicht/)(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'(?:zdf:topic:|https?://www\.zdf\.de/ZDFmediathek(?:#)?/.*kanaluebersicht/(?:[^/]+/)?)(?P<id>[0-9]+)' + _TESTS = [{ 'url': 'http://www.zdf.de/ZDFmediathek#/kanaluebersicht/1586442/sendung/Titanic', 'info_dict': { 'id': '1586442', }, 'playlist_count': 3, - } + }, { + 'url': 'http://www.zdf.de/ZDFmediathek/kanaluebersicht/aktuellste/332', + 'only_matching': True, + }, { + 'url': 'http://www.zdf.de/ZDFmediathek/kanaluebersicht/meist-gesehen/332', + 'only_matching': True, + }, { + 'url': 'http://www.zdf.de/ZDFmediathek/kanaluebersicht/_/1798716?bc=nrt;nrm?flash=off', + 'only_matching': True, + }] _PAGE_SIZE = 50 def _fetch_page(self, channel_id, page): |