diff options
Diffstat (limited to 'youtube_dl')
85 files changed, 1925 insertions, 556 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 3b2be3159..09d2b18f2 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -46,6 +46,7 @@ from .utils import ( DateRange, DEFAULT_OUTTMPL, determine_ext, + determine_protocol, DownloadError, encode_compat_str, encodeFilename, @@ -898,6 +899,9 @@ class YoutubeDL(object): STR_OPERATORS = { '=': operator.eq, '!=': operator.ne, + '^=': lambda attr, value: attr.startswith(value), + '$=': lambda attr, value: attr.endswith(value), + '*=': lambda attr, value: value in attr, } str_operator_rex = re.compile(r'''(?x) \s*(?P<key>ext|acodec|vcodec|container|protocol) @@ -1244,6 +1248,12 @@ class YoutubeDL(object): except (ValueError, OverflowError, OSError): pass + # Auto generate title fields corresponding to the *_number fields when missing + # in order to always have clean titles. This is very common for TV series. + for field in ('chapter', 'season', 'episode'): + if info_dict.get('%s_number' % field) is not None and not info_dict.get(field): + info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field]) + subtitles = info_dict.get('subtitles') if subtitles: for _, subtitle in subtitles.items(): @@ -1300,6 +1310,10 @@ class YoutubeDL(object): # Automatically determine file extension if missing if 'ext' not in format: format['ext'] = determine_ext(format['url']).lower() + # Automatically determine protocol if missing (useful for format + # selection purposes) + if 'protocol' not in format: + format['protocol'] = determine_protocol(format) # Add HTTP headers, so that external programs can use them from the # json output full_format_info = info_dict.copy() @@ -1312,7 +1326,7 @@ class YoutubeDL(object): # only set the 'formats' fields if the original info_dict list them # otherwise we end up with a circular reference, the first (and unique) # element in the 'formats' field in info_dict is info_dict itself, - # wich can't be exported to json + # which can't be exported to json info_dict['formats'] = formats if self.params.get('listformats'): self.list_formats(info_dict) @@ -1986,8 +2000,19 @@ class YoutubeDL(object): https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel) ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel) data_handler = compat_urllib_request_DataHandler() + + # When passing our own FileHandler instance, build_opener won't add the + # default FileHandler and allows us to disable the file protocol, which + # can be used for malicious purposes (see + # https://github.com/rg3/youtube-dl/issues/8227) + file_handler = compat_urllib_request.FileHandler() + + def file_open(*args, **kwargs): + raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons') + file_handler.file_open = file_open + opener = compat_urllib_request.build_opener( - proxy_handler, https_handler, cookie_processor, ydlh, data_handler) + proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler) # Delete the default user-agent header, which would otherwise apply in # cases where our custom HTTP handler doesn't come into play diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index a3e85264a..8ab688001 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -433,7 +433,7 @@ if sys.version_info < (3, 0) and sys.platform == 'win32': else: compat_getpass = getpass.getpass -# Old 2.6 and 2.7 releases require kwargs to be bytes +# Python < 2.6.5 require kwargs to be bytes try: def _testfunc(x): pass diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index beae8c4d0..fc7521598 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -295,7 +295,7 @@ class FileDownloader(object): def report_retry(self, count, retries): """Report retry in case of HTTP error 5xx""" - self.to_screen('[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)) + self.to_screen('[download] Got server HTTP error. Retrying (attempt %d of %.0f)...' % (count, retries)) def report_file_already_downloaded(self, file_name): """Report file has already been fully downloaded.""" diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index 5a64b29ee..0c9113d0f 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -59,37 +59,43 @@ class FragmentFD(FileDownloader): 'filename': ctx['filename'], 'tmpfilename': ctx['tmpfilename'], } + start = time.time() - ctx['started'] = start + ctx.update({ + 'started': start, + # Total complete fragments downloaded so far in bytes + 'complete_frags_downloaded_bytes': 0, + # Amount of fragment's bytes downloaded by the time of the previous + # frag progress hook invocation + 'prev_frag_downloaded_bytes': 0, + }) def frag_progress_hook(s): if s['status'] not in ('downloading', 'finished'): return - frag_total_bytes = s.get('total_bytes', 0) - if s['status'] == 'finished': - state['downloaded_bytes'] += frag_total_bytes - state['frag_index'] += 1 + frag_total_bytes = s.get('total_bytes') or 0 estimated_size = ( - (state['downloaded_bytes'] + frag_total_bytes) / + (ctx['complete_frags_downloaded_bytes'] + frag_total_bytes) / (state['frag_index'] + 1) * total_frags) time_now = time.time() state['total_bytes_estimate'] = estimated_size state['elapsed'] = time_now - start if s['status'] == 'finished': - progress = self.calc_percent(state['frag_index'], total_frags) + state['frag_index'] += 1 + state['downloaded_bytes'] += frag_total_bytes - ctx['prev_frag_downloaded_bytes'] + ctx['complete_frags_downloaded_bytes'] = state['downloaded_bytes'] + ctx['prev_frag_downloaded_bytes'] = 0 else: frag_downloaded_bytes = s['downloaded_bytes'] - frag_progress = self.calc_percent(frag_downloaded_bytes, - frag_total_bytes) - progress = self.calc_percent(state['frag_index'], total_frags) - progress += frag_progress / float(total_frags) - + state['downloaded_bytes'] += frag_downloaded_bytes - ctx['prev_frag_downloaded_bytes'] state['eta'] = self.calc_eta( - start, time_now, estimated_size, state['downloaded_bytes'] + frag_downloaded_bytes) + start, time_now, estimated_size, + state['downloaded_bytes']) state['speed'] = s.get('speed') + ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes self._hook_progress(state) ctx['dl'].add_progress_hook(frag_progress_hook) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index b5a3e1167..10b83c6b2 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -46,7 +46,16 @@ class HlsFD(FileDownloader): self._debug_cmd(args) - retval = subprocess.call(args) + proc = subprocess.Popen(args, stdin=subprocess.PIPE) + try: + retval = proc.wait() + except KeyboardInterrupt: + # subprocces.run would send the SIGKILL signal to ffmpeg and the + # mp4 file couldn't be played, but if we ask ffmpeg to quit it + # produces a file that is playable (this is mostly useful for live + # streams) + proc.communicate(b'q') + raise if retval == 0: fsize = os.path.getsize(encodeFilename(tmpfilename)) self.to_screen('\r[%s] %s bytes' % (args[0], fsize)) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 4c7e5223d..245e4d044 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -15,6 +15,7 @@ from .adobetv import ( AdobeTVVideoIE, ) from .adultswim import AdultSwimIE +from .aenetworks import AENetworksIE from .aftonbladet import AftonbladetIE from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE @@ -41,6 +42,7 @@ from .arte import ( ArteTVCreativeIE, ArteTVConcertIE, ArteTVFutureIE, + ArteTVCinemaIE, ArteTVDDCIE, ArteTVEmbedIE, ) @@ -61,6 +63,7 @@ from .beeg import BeegIE from .behindkink import BehindKinkIE from .beatportpro import BeatportProIE from .bet import BetIE +from .bigflix import BigflixIE from .bild import BildIE from .bilibili import BiliBiliIE from .bleacherreport import ( @@ -85,6 +88,7 @@ from .camdemy import ( ) from .canalplus import CanalplusIE from .canalc2 import Canalc2IE +from .canvas import CanvasIE from .cbs import CBSIE from .cbsnews import CBSNewsIE from .cbssports import CBSSportsIE @@ -127,6 +131,8 @@ from .crunchyroll import ( ) from .cspan import CSpanIE from .ctsnews import CtsNewsIE +from .cultureunplugged import CultureUnpluggedIE +from .cwtv import CWTVIE from .dailymotion import ( DailymotionIE, DailymotionPlaylistIE, @@ -261,7 +267,6 @@ from .hellporno import HellPornoIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE from .historicfilms import HistoricFilmsIE -from .history import HistoryIE from .hitbox import HitboxIE, HitboxLiveIE from .hornbunny import HornBunnyIE from .hotnewhiphop import HotNewHipHopIE @@ -299,6 +304,7 @@ from .ivi import ( IviIE, IviCompilationIE ) +from .ivideon import IvideonIE from .izlesene import IzleseneIE from .jadorecettepub import JadoreCettePubIE from .jeuxvideo import JeuxVideoIE @@ -328,10 +334,12 @@ from .kuwo import ( from .la7 import LA7IE from .laola1tv import Laola1TvIE from .lecture2go import Lecture2GoIE +from .lemonde import LemondeIE from .letv import ( LetvIE, LetvTvIE, - LetvPlaylistIE + LetvPlaylistIE, + LetvCloudIE, ) from .libsyn import LibsynIE from .lifenews import ( @@ -350,6 +358,7 @@ from .livestream import ( LivestreamShortenerIE, ) from .lnkgo import LnkGoIE +from .lovehomeporn import LoveHomePornIE from .lrt import LRTIE from .lynda import ( LyndaIE, @@ -473,6 +482,7 @@ from .npo import ( VPROIE, WNLIE ) +from .npr import NprIE from .nrk import ( NRKIE, NRKPlaylistIE, @@ -563,7 +573,7 @@ from .ro220 import Ro220IE from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE from .rtbf import RTBFIE -from .rte import RteIE +from .rte import RteIE, RteRadioIE from .rtlnl import RtlNlIE from .rtl2 import RTL2IE from .rtp import RTPIE @@ -571,6 +581,7 @@ from .rts import RTSIE from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE from .rtvnh import RTVNHIE from .ruhd import RUHDIE +from .ruleporn import RulePornIE from .rutube import ( RutubeIE, RutubeChannelIE, @@ -717,10 +728,15 @@ from .toutv import TouTvIE from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE +from .trollvids import TrollvidsIE from .trutube import TruTubeIE from .tube8 import Tube8IE from .tubitv import TubiTvIE -from .tudou import TudouIE +from .tudou import ( + TudouIE, + TudouPlaylistIE, + TudouAlbumIE, +) from .tumblr import TumblrIE from .tunein import ( TuneInClipIE, @@ -746,6 +762,7 @@ from .tvp import TvpIE, TvpSeriesIE from .tvplay import TVPlayIE from .tweakers import TweakersIE from .twentyfourvideo import TwentyFourVideoIE +from .twentymin import TwentyMinutenIE from .twentytwotracks import ( TwentyTwoTracksIE, TwentyTwoTracksGenreIE @@ -766,7 +783,7 @@ from .udemy import ( UdemyCourseIE ) from .udn import UDNEmbedIE -from .ultimedia import UltimediaIE +from .digiteka import DigitekaIE from .unistra import UnistraIE from .urort import UrortIE from .ustream import UstreamIE, UstreamChannelIE @@ -845,6 +862,7 @@ from .webofstories import ( WebOfStoriesPlaylistIE, ) from .weibo import WeiboIE +from .weiqitv import WeiqiTVIE from .wimp import WimpIE from .wistia import WistiaIE from .worldstarhiphop import WorldStarHipHopIE @@ -905,6 +923,7 @@ from .zingmp3 import ( ZingMp3SongIE, ZingMp3AlbumIE, ) +from .zippcast import ZippCastIE _ALL_CLASSES = [ klass diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index bf21a6887..8157da2cb 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -187,7 +187,8 @@ class AdultSwimIE(InfoExtractor): media_url = file_el.text if determine_ext(media_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( - media_url, segment_title, 'mp4', preference=0, m3u8_id='hls')) + media_url, segment_title, 'mp4', preference=0, + m3u8_id='hls', fatal=False)) else: formats.append({ 'format_id': '%s_%s' % (bitrate, ftype), diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py new file mode 100644 index 000000000..43d7b0523 --- /dev/null +++ b/youtube_dl/extractor/aenetworks.py @@ -0,0 +1,66 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import smuggle_url + + +class AENetworksIE(InfoExtractor): + IE_NAME = 'aenetworks' + IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network' + _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?:[^/]+/)+(?P<id>[^/]+?)(?:$|[?#])' + + _TESTS = [{ + 'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false', + 'info_dict': { + 'id': 'g12m5Gyt3fdR', + 'ext': 'mp4', + 'title': "Bet You Didn't Know: Valentine's Day", + 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['ThePlatform'], + 'expected_warnings': ['JSON-LD'], + }, { + 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', + 'info_dict': { + 'id': 'eg47EERs_JsZ', + 'ext': 'mp4', + 'title': "Winter Is Coming", + 'description': 'md5:641f424b7a19d8e24f26dea22cf59d74', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['ThePlatform'], + }, { + 'url': 'http://www.aetv.com/shows/duck-dynasty/video/inlawful-entry', + 'only_matching': True + }, { + 'url': 'http://www.fyi.tv/shows/tiny-house-nation/videos/207-sq-ft-minnesota-prairie-cottage', + 'only_matching': True + }, { + 'url': 'http://www.mylifetime.com/shows/project-runway-junior/video/season-1/episode-6/superstar-clients', + 'only_matching': True + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_url_re = [ + r'data-href="[^"]*/%s"[^>]+data-release-url="([^"]+)"' % video_id, + r"media_url\s*=\s*'([^']+)'" + ] + video_url = self._search_regex(video_url_re, webpage, 'video url') + + info = self._search_json_ld(webpage, video_id, fatal=False) + info.update({ + '_type': 'url_transparent', + 'url': smuggle_url(video_url, {'sig': {'key': 'crazyjava', 'secret': 's3cr3t'}}), + }) + return info diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index 1035d1c48..69e6baff7 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -76,5 +76,6 @@ class AMPIE(InfoExtractor): 'thumbnails': thumbnails, 'timestamp': parse_iso8601(item.get('pubDate'), ' '), 'duration': int_or_none(media_content[0].get('@attributes', {}).get('duration')), + 'subtitles': subtitles, 'formats': formats, } diff --git a/youtube_dl/extractor/anitube.py b/youtube_dl/extractor/anitube.py index 23f942ae2..2fd912da4 100644 --- a/youtube_dl/extractor/anitube.py +++ b/youtube_dl/extractor/anitube.py @@ -1,11 +1,9 @@ from __future__ import unicode_literals -import re +from .nuevo import NuevoBaseIE -from .common import InfoExtractor - -class AnitubeIE(InfoExtractor): +class AnitubeIE(NuevoBaseIE): IE_NAME = 'anitube.se' _VALID_URL = r'https?://(?:www\.)?anitube\.se/video/(?P<id>\d+)' @@ -22,38 +20,11 @@ class AnitubeIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) key = self._search_regex( r'src=["\']https?://[^/]+/embed/([A-Za-z0-9_-]+)', webpage, 'key') - config_xml = self._download_xml( - 'http://www.anitube.se/nuevo/econfig.php?key=%s' % key, key) - - video_title = config_xml.find('title').text - thumbnail = config_xml.find('image').text - duration = float(config_xml.find('duration').text) - - formats = [] - video_url = config_xml.find('file') - if video_url is not None: - formats.append({ - 'format_id': 'sd', - 'url': video_url.text, - }) - video_url = config_xml.find('filehd') - if video_url is not None: - formats.append({ - 'format_id': 'hd', - 'url': video_url.text, - }) - - return { - 'id': video_id, - 'title': video_title, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats - } + return self._extract_nuevo( + 'http://www.anitube.se/nuevo/econfig.php?key=%s' % key, video_id) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 10301a8ea..b9e07f0ef 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -199,25 +199,19 @@ class ArteTVCreativeIE(ArteTVPlus7IE): class ArteTVFutureIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:future' - _VALID_URL = r'https?://future\.arte\.tv/(?P<lang>fr|de)/(thema|sujet)/.*?#article-anchor-(?P<id>\d+)' + _VALID_URL = r'https?://future\.arte\.tv/(?P<lang>fr|de)/(?P<id>.+)' - _TEST = { - 'url': 'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081', + _TESTS = [{ + 'url': 'http://future.arte.tv/fr/info-sciences/les-ecrevisses-aussi-sont-anxieuses', 'info_dict': { - 'id': '5201', + 'id': '050940-028-A', 'ext': 'mp4', - 'title': 'Les champignons au secours de la planète', - 'upload_date': '20131101', + 'title': 'Les écrevisses aussi peuvent être anxieuses', }, - } - - def _real_extract(self, url): - anchor_id, lang = self._extract_url_info(url) - webpage = self._download_webpage(url, anchor_id) - row = self._search_regex( - r'(?s)id="%s"[^>]*>.+?(<div[^>]*arte_vp_url[^>]*>)' % anchor_id, - webpage, 'row') - return self._extract_from_webpage(row, anchor_id, lang) + }, { + 'url': 'http://future.arte.tv/fr/la-science-est-elle-responsable', + 'only_matching': True, + }] class ArteTVDDCIE(ArteTVPlus7IE): @@ -255,6 +249,23 @@ class ArteTVConcertIE(ArteTVPlus7IE): } +class ArteTVCinemaIE(ArteTVPlus7IE): + IE_NAME = 'arte.tv:cinema' + _VALID_URL = r'https?://cinema\.arte\.tv/(?P<lang>de|fr)/(?P<id>.+)' + + _TEST = { + 'url': 'http://cinema.arte.tv/de/node/38291', + 'md5': '6b275511a5107c60bacbeeda368c3aa1', + 'info_dict': { + 'id': '055876-000_PWA12025-D', + 'ext': 'mp4', + 'title': 'Tod auf dem Nil', + 'upload_date': '20160122', + 'description': 'md5:7f749bbb77d800ef2be11d54529b96bc', + }, + } + + class ArteTVEmbedIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:embed' _VALID_URL = r'''(?x) diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index 3fb042cea..b8f9ae005 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -132,11 +132,6 @@ class AtresPlayerIE(InfoExtractor): }) formats.append(format_info) - m3u8_url = player.get('urlVideoHls') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, episode_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - timestamp = int_or_none(self._download_webpage( self._TIME_API_URL, video_id, 'Downloading timestamp', fatal=False), 1000, time.time()) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 7b169881a..1c493b72d 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -124,14 +124,14 @@ class BBCCoUkIE(InfoExtractor): }, 'skip': 'Episode is no longer available on BBC iPlayer Radio', }, { - 'url': 'http://www.bbc.co.uk/music/clips/p02frcc3', + 'url': 'http://www.bbc.co.uk/music/clips/p022h44b', 'note': 'Audio', 'info_dict': { - 'id': 'p02frcch', + 'id': 'p022h44j', 'ext': 'flv', - 'title': 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix', - 'description': 'French house superstar Madeon takes us out of the club and onto the after party.', - 'duration': 3507, + 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances', + 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.", + 'duration': 227, }, 'params': { # rtmp download @@ -182,13 +182,12 @@ class BBCCoUkIE(InfoExtractor): }, { # iptv-all mediaset fails with geolocation however there is no geo restriction # for this programme at all - 'url': 'http://www.bbc.co.uk/programmes/b06bp7lf', + 'url': 'http://www.bbc.co.uk/programmes/b06rkn85', 'info_dict': { - 'id': 'b06bp7kf', + 'id': 'b06rkms3', 'ext': 'flv', - 'title': "Annie Mac's Friday Night, B.Traits sits in for Annie", - 'description': 'B.Traits sits in for Annie Mac with a Mini-Mix from Disclosure.', - 'duration': 10800, + 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1", + 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!", }, 'params': { # rtmp download @@ -719,19 +718,10 @@ class BBCIE(BBCCoUkIE): webpage = self._download_webpage(url, playlist_id) - timestamp = None - playlist_title = None - playlist_description = None - - ld = self._parse_json( - self._search_regex( - r'(?s)<script type="application/ld\+json">(.+?)</script>', - webpage, 'ld json', default='{}'), - playlist_id, fatal=False) - if ld: - timestamp = parse_iso8601(ld.get('datePublished')) - playlist_title = ld.get('headline') - playlist_description = ld.get('articleBody') + json_ld_info = self._search_json_ld(webpage, playlist_id, default=None) + timestamp = json_ld_info.get('timestamp') + playlist_title = json_ld_info.get('title') + playlist_description = json_ld_info.get('description') if not timestamp: timestamp = parse_iso8601(self._search_regex( diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index c8d921daf..34c2a756f 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -34,7 +34,7 @@ class BeegIE(InfoExtractor): video_id = self._match_id(url) video = self._download_json( - 'http://beeg.com/api/v5/video/%s' % video_id, video_id) + 'https://api.beeg.com/api/v5/video/%s' % video_id, video_id) def split(o, e): def cut(s, x): @@ -60,7 +60,7 @@ class BeegIE(InfoExtractor): def decrypt_url(encrypted_url): encrypted_url = self._proto_relative_url( - encrypted_url.replace('{DATA_MARKERS}', ''), 'http:') + encrypted_url.replace('{DATA_MARKERS}', ''), 'https:') key = self._search_regex( r'/key=(.*?)%2Cend=', encrypted_url, 'key', default=None) if not key: diff --git a/youtube_dl/extractor/bigflix.py b/youtube_dl/extractor/bigflix.py new file mode 100644 index 000000000..33762ad93 --- /dev/null +++ b/youtube_dl/extractor/bigflix.py @@ -0,0 +1,85 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import re + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote + + +class BigflixIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bigflix\.com/.+/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.bigflix.com/Hindi-movies/Action-movies/Singham-Returns/16537', + 'md5': 'ec76aa9b1129e2e5b301a474e54fab74', + 'info_dict': { + 'id': '16537', + 'ext': 'mp4', + 'title': 'Singham Returns', + 'description': 'md5:3d2ba5815f14911d5cc6a501ae0cf65d', + } + }, { + # 2 formats + 'url': 'http://www.bigflix.com/Tamil-movies/Drama-movies/Madarasapatinam/16070', + 'info_dict': { + 'id': '16070', + 'ext': 'mp4', + 'title': 'Madarasapatinam', + 'description': 'md5:63b9b8ed79189c6f0418c26d9a3452ca', + 'formats': 'mincount:2', + }, + 'params': { + 'skip_download': True, + } + }, { + # multiple formats + 'url': 'http://www.bigflix.com/Malayalam-movies/Drama-movies/Indian-Rupee/15967', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex( + r'<div[^>]+class=["\']pagetitle["\'][^>]*>(.+?)</div>', + webpage, 'title') + + def decode_url(quoted_b64_url): + return base64.b64decode(compat_urllib_parse_unquote( + quoted_b64_url).encode('ascii')).decode('utf-8') + + formats = [] + for height, encoded_url in re.findall( + r'ContentURL_(\d{3,4})[pP][^=]+=([^&]+)', webpage): + video_url = decode_url(encoded_url) + f = { + 'url': video_url, + 'format_id': '%sp' % height, + 'height': int(height), + } + if video_url.startswith('rtmp'): + f['ext'] = 'flv' + formats.append(f) + + file_url = self._search_regex( + r'file=([^&]+)', webpage, 'video url', default=None) + if file_url: + video_url = decode_url(file_url) + if all(f['url'] != video_url for f in formats): + formats.append({ + 'url': decode_url(file_url), + }) + + self._sort_formats(formats) + + description = self._html_search_meta('description', webpage) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats + } diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index f6a1ff381..f1f128c45 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -9,9 +9,9 @@ from ..utils import parse_duration class Canalc2IE(InfoExtractor): IE_NAME = 'canalc2.tv' - _VALID_URL = r'https?://(?:www\.)?canalc2\.tv/video/(?P<id>\d+)' + _VALID_URL = r'https?://(?:(?:www\.)?canalc2\.tv/video/|archives-canalc2\.u-strasbg\.fr/video\.asp\?.*\bidVideo=)(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.canalc2.tv/video/12163', 'md5': '060158428b650f896c542dfbb3d6487f', 'info_dict': { @@ -23,24 +23,36 @@ class Canalc2IE(InfoExtractor): 'params': { 'skip_download': True, # Requires rtmpdump } - } + }, { + 'url': 'http://archives-canalc2.u-strasbg.fr/video.asp?idVideo=11427&voir=oui', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_url = self._search_regex( - r'jwplayer\((["\'])Player\1\)\.setup\({[^}]*file\s*:\s*(["\'])(?P<file>.+?)\2', - webpage, 'video_url', group='file') - formats = [{'url': video_url}] - if video_url.startswith('rtmp://'): - rtmp = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+/))(?P<play_path>mp4:.+)$', video_url) - formats[0].update({ - 'url': rtmp.group('url'), - 'ext': 'flv', - 'app': rtmp.group('app'), - 'play_path': rtmp.group('play_path'), - 'page_url': url, - }) + + webpage = self._download_webpage( + 'http://www.canalc2.tv/video/%s' % video_id, video_id) + + formats = [] + for _, video_url in re.findall(r'file\s*=\s*(["\'])(.+?)\1', webpage): + if video_url.startswith('rtmp://'): + rtmp = re.search( + r'^(?P<url>rtmp://[^/]+/(?P<app>.+/))(?P<play_path>mp4:.+)$', video_url) + formats.append({ + 'url': rtmp.group('url'), + 'format_id': 'rtmp', + 'ext': 'flv', + 'app': rtmp.group('app'), + 'play_path': rtmp.group('play_path'), + 'page_url': url, + }) + else: + formats.append({ + 'url': video_url, + 'format_id': 'http', + }) + self._sort_formats(formats) title = self._html_search_regex( r'(?s)class="[^"]*col_description[^"]*">.*?<h3>(.*?)</h3>', webpage, 'title') diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py new file mode 100644 index 000000000..ee19ff836 --- /dev/null +++ b/youtube_dl/extractor/canvas.py @@ -0,0 +1,65 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import float_or_none + + +class CanvasIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?canvas\.be/video/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week', + 'md5': 'ea838375a547ac787d4064d8c7860a6c', + 'info_dict': { + 'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', + 'display_id': 'de-afspraak-veilt-voor-de-warmste-week', + 'ext': 'mp4', + 'title': 'De afspraak veilt voor de Warmste Week', + 'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 49.02, + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + title = self._search_regex( + r'<h1[^>]+class="video__body__header__title"[^>]*>(.+?)</h1>', + webpage, 'title', default=None) or self._og_search_title(webpage) + + video_id = self._html_search_regex( + r'data-video=(["\'])(?P<id>.+?)\1', webpage, 'video id', group='id') + + data = self._download_json( + 'https://mediazone.vrt.be/api/v1/canvas/assets/%s' % video_id, display_id) + + formats = [] + for target in data['targetUrls']: + format_url, format_type = target.get('url'), target.get('type') + if not format_url or not format_type: + continue + if format_type == 'HLS': + formats.extend(self._extract_m3u8_formats( + format_url, display_id, entry_protocol='m3u8_native', + ext='mp4', preference=0, fatal=False, m3u8_id=format_type)) + elif format_type == 'HDS': + formats.extend(self._extract_f4m_formats( + format_url, display_id, f4m_id=format_type, fatal=False)) + else: + formats.append({ + 'format_id': format_type, + 'url': format_url, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'formats': formats, + 'duration': float_or_none(data.get('duration'), 1000), + 'thumbnail': data.get('posterImageUrl'), + } diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index d211ec23b..480435e26 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -35,6 +35,11 @@ class CBSNewsIE(InfoExtractor): 'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 205, + 'subtitles': { + 'en': [{ + 'ext': 'ttml', + }], + }, }, 'params': { # rtmp download @@ -85,10 +90,18 @@ class CBSNewsIE(InfoExtractor): fmt['ext'] = 'mp4' formats.append(fmt) + subtitles = {} + if 'mpxRefId' in video_info: + subtitles['en'] = [{ + 'ext': 'ttml', + 'url': 'http://www.cbsnews.com/videos/captions/%s.adb_xml' % video_info['mpxRefId'], + }] + return { 'id': video_id, 'title': title, 'thumbnail': thumbnail, 'duration': duration, 'formats': formats, + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 0719c7bcd..2f574054d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -34,6 +34,7 @@ from ..utils import ( fix_xml_ampersands, float_or_none, int_or_none, + parse_iso8601, RegexNotFoundError, sanitize_filename, sanitized_Request, @@ -313,9 +314,9 @@ class InfoExtractor(object): except ExtractorError: raise except compat_http_client.IncompleteRead as e: - raise ExtractorError('A network error has occured.', cause=e, expected=True) + raise ExtractorError('A network error has occurred.', cause=e, expected=True) except (KeyError, StopIteration) as e: - raise ExtractorError('An extractor error has occured.', cause=e) + raise ExtractorError('An extractor error has occurred.', cause=e) def set_downloader(self, downloader): """Sets the downloader for this IE.""" @@ -762,6 +763,42 @@ class InfoExtractor(object): return self._html_search_meta('twitter:player', html, 'twitter card player') + def _search_json_ld(self, html, video_id, **kwargs): + json_ld = self._search_regex( + r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>', + html, 'JSON-LD', group='json_ld', **kwargs) + if not json_ld: + return {} + return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True)) + + def _json_ld(self, json_ld, video_id, fatal=True): + if isinstance(json_ld, compat_str): + json_ld = self._parse_json(json_ld, video_id, fatal=fatal) + if not json_ld: + return {} + info = {} + if json_ld.get('@context') == 'http://schema.org': + item_type = json_ld.get('@type') + if item_type == 'TVEpisode': + info.update({ + 'episode': unescapeHTML(json_ld.get('name')), + 'episode_number': int_or_none(json_ld.get('episodeNumber')), + 'description': unescapeHTML(json_ld.get('description')), + }) + part_of_season = json_ld.get('partOfSeason') + if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason': + info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) + part_of_series = json_ld.get('partOfSeries') + if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries': + info['series'] = unescapeHTML(part_of_series.get('name')) + elif item_type == 'Article': + info.update({ + 'timestamp': parse_iso8601(json_ld.get('datePublished')), + 'title': unescapeHTML(json_ld.get('headline')), + 'description': unescapeHTML(json_ld.get('articleBody')), + }) + return dict((k, v) for k, v in info.items() if v is not None) + @staticmethod def _hidden_inputs(html): html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html) @@ -1021,9 +1058,9 @@ class InfoExtractor(object): # TODO: looks like video codec is not always necessarily goes first va_codecs = codecs.split(',') if va_codecs[0]: - f['vcodec'] = va_codecs[0].partition('.')[0] + f['vcodec'] = va_codecs[0] if len(va_codecs) > 1 and va_codecs[1]: - f['acodec'] = va_codecs[1].partition('.')[0] + f['acodec'] = va_codecs[1] resolution = last_info.get('RESOLUTION') if resolution: width_str, height_str = resolution.split('x') diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 00d943f77..785594df8 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -329,8 +329,10 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text streamdata_req, video_id, note='Downloading media info for %s' % video_format) stream_info = streamdata.find('./{default}preload/stream_info') - video_url = stream_info.find('./host').text - video_play_path = stream_info.find('./file').text + video_url = xpath_text(stream_info, './host') + video_play_path = xpath_text(stream_info, './file') + if not video_url or not video_play_path: + continue metadata = stream_info.find('./metadata') format_info = { 'format': video_format, diff --git a/youtube_dl/extractor/cultureunplugged.py b/youtube_dl/extractor/cultureunplugged.py new file mode 100644 index 000000000..9c764fe68 --- /dev/null +++ b/youtube_dl/extractor/cultureunplugged.py @@ -0,0 +1,63 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class CultureUnpluggedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cultureunplugged\.com/documentary/watch-online/play/(?P<id>\d+)(?:/(?P<display_id>[^/]+))?' + _TESTS = [{ + 'url': 'http://www.cultureunplugged.com/documentary/watch-online/play/53662/The-Next--Best-West', + 'md5': 'ac6c093b089f7d05e79934dcb3d228fc', + 'info_dict': { + 'id': '53662', + 'display_id': 'The-Next--Best-West', + 'ext': 'mp4', + 'title': 'The Next, Best West', + 'description': 'md5:0423cd00833dea1519cf014e9d0903b1', + 'thumbnail': 're:^https?://.*\.jpg$', + 'creator': 'Coldstream Creative', + 'duration': 2203, + 'view_count': int, + } + }, { + 'url': 'http://www.cultureunplugged.com/documentary/watch-online/play/53662', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + movie_data = self._download_json( + 'http://www.cultureunplugged.com/movie-data/cu-%s.json' % video_id, display_id) + + video_url = movie_data['url'] + title = movie_data['title'] + + description = movie_data.get('synopsis') + creator = movie_data.get('producer') + duration = int_or_none(movie_data.get('duration')) + view_count = int_or_none(movie_data.get('views')) + + thumbnails = [{ + 'url': movie_data['%s_thumb' % size], + 'id': size, + 'preference': preference, + } for preference, size in enumerate(( + 'small', 'large')) if movie_data.get('%s_thumb' % size)] + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'title': title, + 'description': description, + 'creator': creator, + 'duration': duration, + 'view_count': view_count, + 'thumbnails': thumbnails, + } diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py new file mode 100644 index 000000000..36af67013 --- /dev/null +++ b/youtube_dl/extractor/cwtv.py @@ -0,0 +1,88 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) + + +class CWTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cw(?:tv|seed)\.com/shows/(?:[^/]+/){2}\?play=(?P<id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})' + _TESTS = [{ + 'url': 'http://cwtv.com/shows/arrow/legends-of-yesterday/?play=6b15e985-9345-4f60-baf8-56e96be57c63', + 'info_dict': { + 'id': '6b15e985-9345-4f60-baf8-56e96be57c63', + 'ext': 'mp4', + 'title': 'Legends of Yesterday', + 'description': 'Oliver and Barry Allen take Kendra Saunders and Carter Hall to a remote location to keep them hidden from Vandal Savage while they figure out how to defeat him.', + 'duration': 2665, + 'series': 'Arrow', + 'season_number': 4, + 'season': '4', + 'episode_number': 8, + 'upload_date': '20151203', + 'timestamp': 1449122100, + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'http://www.cwseed.com/shows/whose-line-is-it-anyway/jeff-davis-4/?play=24282b12-ead2-42f2-95ad-26770c2c6088', + 'info_dict': { + 'id': '24282b12-ead2-42f2-95ad-26770c2c6088', + 'ext': 'mp4', + 'title': 'Jeff Davis 4', + 'description': 'Jeff Davis is back to make you laugh.', + 'duration': 1263, + 'series': 'Whose Line Is It Anyway?', + 'season_number': 11, + 'season': '11', + 'episode_number': 20, + 'upload_date': '20151006', + 'timestamp': 1444107300, + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json( + 'http://metaframe.digitalsmiths.tv/v2/CWtv/assets/%s/partner/132?format=json' % video_id, video_id) + + formats = self._extract_m3u8_formats( + video_data['videos']['variantplaylist']['uri'], video_id, 'mp4') + + thumbnails = [{ + 'url': image['uri'], + 'width': image.get('width'), + 'height': image.get('height'), + } for image_id, image in video_data['images'].items() if image.get('uri')] if video_data.get('images') else None + + video_metadata = video_data['assetFields'] + + subtitles = { + 'en': [{ + 'url': video_metadata['UnicornCcUrl'], + }], + } if video_metadata.get('UnicornCcUrl') else None + + return { + 'id': video_id, + 'title': video_metadata['title'], + 'description': video_metadata.get('description'), + 'duration': int_or_none(video_metadata.get('duration')), + 'series': video_metadata.get('seriesName'), + 'season_number': int_or_none(video_metadata.get('seasonNumber')), + 'season': video_metadata.get('seasonName'), + 'episode_number': int_or_none(video_metadata.get('episodeNumber')), + 'timestamp': parse_iso8601(video_data.get('startTime')), + 'thumbnails': thumbnails, + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 439fd42e8..6e462af69 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -37,7 +37,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor): class DailymotionIE(DailymotionBaseInfoExtractor): - _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P<id>[^/?_]+)' + _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:embed|swf|#)/)?video/(?P<id>[^/?_]+)' IE_NAME = 'dailymotion' _FORMATS = [ @@ -104,6 +104,10 @@ class DailymotionIE(DailymotionBaseInfoExtractor): { 'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news', 'only_matching': True, + }, + { + 'url': 'http://www.dailymotion.com/swf/video/x3n92nf', + 'only_matching': True, } ] @@ -149,14 +153,15 @@ class DailymotionIE(DailymotionBaseInfoExtractor): ext = determine_ext(media_url) if type_ == 'application/x-mpegURL' or ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + media_url, video_id, 'mp4', preference=-1, + m3u8_id='hls', fatal=False)) elif type_ == 'application/f4m' or ext == 'f4m': formats.extend(self._extract_f4m_formats( media_url, video_id, preference=-1, f4m_id='hds', fatal=False)) else: f = { 'url': media_url, - 'format_id': quality, + 'format_id': 'http-%s' % quality, } m = re.search(r'H264-(?P<width>\d+)x(?P<height>\d+)', media_url) if m: @@ -335,7 +340,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): class DailymotionUserIE(DailymotionPlaylistIE): IE_NAME = 'dailymotion:user' - _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)' _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s' _TESTS = [{ 'url': 'https://www.dailymotion.com/user/nqtv', diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index 8f48571de..15a1c40f7 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -5,7 +5,10 @@ import re import base64 from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import ( + compat_urllib_parse, + compat_str, +) from ..utils import ( int_or_none, parse_iso8601, @@ -186,7 +189,8 @@ class DCNSeasonIE(InfoExtractor): entries = [] for video in show['videos']: + video_id = compat_str(video['id']) entries.append(self.url_result( - 'http://www.dcndigital.ae/media/%s' % video['id'], 'DCNVideo')) + 'http://www.dcndigital.ae/media/%s' % video_id, 'DCNVideo', video_id)) return self.playlist_result(entries, season_id, title) diff --git a/youtube_dl/extractor/ultimedia.py b/youtube_dl/extractor/digiteka.py index 60328123c..7bb79ffda 100644 --- a/youtube_dl/extractor/ultimedia.py +++ b/youtube_dl/extractor/digiteka.py @@ -7,9 +7,9 @@ from .common import InfoExtractor from ..utils import int_or_none -class UltimediaIE(InfoExtractor): +class DigitekaIE(InfoExtractor): _VALID_URL = r'''(?x) - https?://(?:www\.)?ultimedia\.com/ + https?://(?:www\.)?(?:digiteka\.net|ultimedia\.com)/ (?: deliver/ (?P<embed_type> @@ -56,6 +56,9 @@ class UltimediaIE(InfoExtractor): 'timestamp': 1424760500, 'uploader_id': '3rfzk', }, + }, { + 'url': 'https://www.digiteka.net/deliver/generic/iframe/mdtk/01637594/src/lqm3kl/zone/1/showtitle/1/autoplay/yes', + 'only_matching': True, }] @staticmethod diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index b3b21d65f..d35e88881 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -12,6 +12,7 @@ from ..compat import ( from ..utils import ( ExtractorError, clean_html, + int_or_none, sanitized_Request, ) @@ -66,13 +67,15 @@ class DramaFeverBaseIE(AMPIE): class DramaFeverIE(DramaFeverBaseIE): IE_NAME = 'dramafever' _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+/[0-9]+)(?:/|$)' - _TEST = { + _TESTS = [{ 'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/', 'info_dict': { 'id': '4512.1', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Cooking with Shin 4512.1', 'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0', + 'episode': 'Episode 1', + 'episode_number': 1, 'thumbnail': 're:^https?://.*\.jpg', 'timestamp': 1404336058, 'upload_date': '20140702', @@ -82,7 +85,25 @@ class DramaFeverIE(DramaFeverBaseIE): # m3u8 download 'skip_download': True, }, - } + }, { + 'url': 'http://www.dramafever.com/drama/4826/4/Mnet_Asian_Music_Awards_2015/?ap=1', + 'info_dict': { + 'id': '4826.4', + 'ext': 'mp4', + 'title': 'Mnet Asian Music Awards 2015 4826.4', + 'description': 'md5:3ff2ee8fedaef86e076791c909cf2e91', + 'episode': 'Mnet Asian Music Awards 2015 - Part 3', + 'episode_number': 4, + 'thumbnail': 're:^https?://.*\.jpg', + 'timestamp': 1450213200, + 'upload_date': '20151215', + 'duration': 5602, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] def _real_extract(self, url): video_id = self._match_id(url).replace('/', '.') @@ -105,13 +126,22 @@ class DramaFeverIE(DramaFeverBaseIE): video_id, 'Downloading episode info JSON', fatal=False) if episode_info: value = episode_info.get('value') - if value: - subfile = value[0].get('subfile') or value[0].get('new_subfile') - if subfile and subfile != 'http://www.dramafever.com/st/': - info.setdefault('subtitles', {}).setdefault('English', []).append({ - 'ext': 'srt', - 'url': subfile, - }) + if isinstance(value, list): + for v in value: + if v.get('type') == 'Episode': + subfile = v.get('subfile') or v.get('new_subfile') + if subfile and subfile != 'http://www.dramafever.com/st/': + info.setdefault('subtitles', {}).setdefault('English', []).append({ + 'ext': 'srt', + 'url': subfile, + }) + episode_number = int_or_none(v.get('number')) + episode_fallback = 'Episode' + if episode_number: + episode_fallback += ' %d' % episode_number + info['episode'] = v.get('title') or episode_fallback + info['episode_number'] = episode_number + break return info diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index baa24c6d1..2d74ff855 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -91,7 +91,7 @@ class DRTVIE(InfoExtractor): subtitles_list = asset.get('SubtitlesList') if isinstance(subtitles_list, list): LANGS = { - 'Danish': 'dk', + 'Danish': 'da', } for subs in subtitles_list: lang = subs['Language'] diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 5e43f2359..ec699ba54 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -105,7 +105,7 @@ class FacebookIE(InfoExtractor): login_results, 'login error', default=None, group='error') if error: raise ExtractorError('Unable to login: %s' % error, expected=True) - self._downloader.report_warning('unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.') + self._downloader.report_warning('unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.') return fb_dtsg = self._search_regex( @@ -126,7 +126,7 @@ class FacebookIE(InfoExtractor): check_response = self._download_webpage(check_req, None, note='Confirming login') if re.search(r'id="checkpointSubmitButton"', check_response) is not None: - self._downloader.report_warning('Unable to confirm login, you have to login in your brower and authorize the login.') + self._downloader.report_warning('Unable to confirm login, you have to login in your browser and authorize the login.') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self._downloader.report_warning('unable to log in: %s' % error_to_compat_str(err)) return diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d79e1adc9..26d3698c8 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -57,7 +57,7 @@ from .pladform import PladformIE from .videomore import VideomoreIE from .googledrive import GoogleDriveIE from .jwplatform import JWPlatformIE -from .ultimedia import UltimediaIE +from .digiteka import DigitekaIE class GenericIE(InfoExtractor): @@ -487,7 +487,7 @@ class GenericIE(InfoExtractor): 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9', } }, - # Embeded Ustream video + # Embedded Ustream video { 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm', 'md5': '27b99cdb639c9b12a79bca876a073417', @@ -1402,7 +1402,7 @@ class GenericIE(InfoExtractor): # Look for embedded Dailymotion player matches = re.findall( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage) + r'<(?:embed|iframe)[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage) if matches: return _playlist_from_matches( matches, lambda m: unescapeHTML(m[1])) @@ -1644,7 +1644,7 @@ class GenericIE(InfoExtractor): if myvi_url: return self.url_result(myvi_url) - # Look for embeded soundcloud player + # Look for embedded soundcloud player mobj = re.search( r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"', webpage) @@ -1814,10 +1814,10 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(unescapeHTML(mobj.group('url')), 'ScreenwaveMedia') - # Look for Ulltimedia embeds - ultimedia_url = UltimediaIE._extract_url(webpage) - if ultimedia_url: - return self.url_result(self._proto_relative_url(ultimedia_url), 'Ultimedia') + # Look for Digiteka embeds + digiteka_url = DigitekaIE._extract_url(webpage) + if digiteka_url: + return self.url_result(self._proto_relative_url(digiteka_url), DigitekaIE.ie_key()) # Look for AdobeTVVideo embeds mobj = re.search( diff --git a/youtube_dl/extractor/history.py b/youtube_dl/extractor/history.py deleted file mode 100644 index f86164afe..000000000 --- a/youtube_dl/extractor/history.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import smuggle_url - - -class HistoryIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?history\.com/(?:[^/]+/)+(?P<id>[^/]+?)(?:$|[?#])' - - _TESTS = [{ - 'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false', - 'md5': '6fe632d033c92aa10b8d4a9be047a7c5', - 'info_dict': { - 'id': 'bLx5Dv5Aka1G', - 'ext': 'mp4', - 'title': "Bet You Didn't Know: Valentine's Day", - 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', - }, - 'add_ie': ['ThePlatform'], - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - video_url = self._search_regex( - r'data-href="[^"]*/%s"[^>]+data-release-url="([^"]+)"' % video_id, - webpage, 'video url') - - return self.url_result(smuggle_url(video_url, {'sig': {'key': 'crazyjava', 'secret': 's3cr3t'}})) diff --git a/youtube_dl/extractor/hitbox.py b/youtube_dl/extractor/hitbox.py index 421f55bbe..ff797438d 100644 --- a/youtube_dl/extractor/hitbox.py +++ b/youtube_dl/extractor/hitbox.py @@ -159,6 +159,9 @@ class HitboxLiveIE(HitboxIE): cdns = player_config.get('cdns') servers = [] for cdn in cdns: + # Subscribe URLs are not playable + if cdn.get('rtmpSubscribe') is True: + continue base_url = cdn.get('netConnectionUrl') host = re.search('.+\.([^\.]+\.[^\./]+)/.+', base_url).group(1) if base_url not in servers: diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index 36baf3245..073777f34 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -14,6 +14,7 @@ from ..utils import ( class IPrimaIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://play\.iprima\.cz/(?:[^/]+/)*(?P<id>[^?#]+)' _TESTS = [{ diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 66a70a181..691cb66d6 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -214,8 +214,8 @@ class IqiyiIE(InfoExtractor): def get_enc_key(self, swf_url, video_id): # TODO: automatic key extraction - # last update at 2015-12-18 for Zombie::bite - enc_key = '8b6b683780897eb8d9a48a02ccc4817d'[::-1] + # last update at 2016-01-22 for Zombie::bite + enc_key = '6ab6d0280511493ba85594779759d4ed' return enc_key def _real_extract(self, url): diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 029878d24..472d72b4c 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -7,6 +7,7 @@ import json from .common import InfoExtractor from ..utils import ( ExtractorError, + int_or_none, sanitized_Request, ) @@ -27,44 +28,36 @@ class IviIE(InfoExtractor): 'title': 'Иван Васильевич меняет профессию', 'description': 'md5:b924063ea1677c8fe343d8a72ac2195f', 'duration': 5498, - 'thumbnail': 'http://thumbs.ivi.ru/f20.vcp.digitalaccess.ru/contents/d/1/c3c885163a082c29bceeb7b5a267a6.jpg', + 'thumbnail': 're:^https?://.*\.jpg$', }, 'skip': 'Only works from Russia', }, - # Serial's serie + # Serial's series { 'url': 'http://www.ivi.ru/watch/dvoe_iz_lartsa/9549', 'md5': '221f56b35e3ed815fde2df71032f4b3e', 'info_dict': { 'id': '9549', 'ext': 'mp4', - 'title': 'Двое из ларца - Серия 1', + 'title': 'Двое из ларца - Дело Гольдберга (1 часть)', + 'series': 'Двое из ларца', + 'season': 'Сезон 1', + 'season_number': 1, + 'episode': 'Дело Гольдберга (1 часть)', + 'episode_number': 1, 'duration': 2655, - 'thumbnail': 'http://thumbs.ivi.ru/f15.vcp.digitalaccess.ru/contents/8/4/0068dc0677041f3336b7c2baad8fc0.jpg', + 'thumbnail': 're:^https?://.*\.jpg$', }, 'skip': 'Only works from Russia', } ] # Sorted by quality - _known_formats = ['MP4-low-mobile', 'MP4-mobile', 'FLV-lo', 'MP4-lo', 'FLV-hi', 'MP4-hi', 'MP4-SHQ'] - - # Sorted by size - _known_thumbnails = ['Thumb-120x90', 'Thumb-160', 'Thumb-640x480'] - - def _extract_description(self, html): - m = re.search(r'<meta name="description" content="(?P<description>[^"]+)"/>', html) - return m.group('description') if m is not None else None - - def _extract_comment_count(self, html): - m = re.search('(?s)<a href="#" id="view-comments" class="action-button dim gradient">\s*Комментарии:\s*(?P<commentcount>\d+)\s*</a>', html) - return int(m.group('commentcount')) if m is not None else 0 + _KNOWN_FORMATS = ['MP4-low-mobile', 'MP4-mobile', 'FLV-lo', 'MP4-lo', 'FLV-hi', 'MP4-hi', 'MP4-SHQ'] def _real_extract(self, url): video_id = self._match_id(url) - api_url = 'http://api.digitalaccess.ru/api/json/' - data = { 'method': 'da.content.get', 'params': [ @@ -76,11 +69,10 @@ class IviIE(InfoExtractor): ] } - request = sanitized_Request(api_url, json.dumps(data)) - - video_json_page = self._download_webpage( + request = sanitized_Request( + 'http://api.digitalaccess.ru/api/json/', json.dumps(data)) + video_json = self._download_json( request, video_id, 'Downloading video JSON') - video_json = json.loads(video_json_page) if 'error' in video_json: error = video_json['error'] @@ -95,35 +87,51 @@ class IviIE(InfoExtractor): formats = [{ 'url': x['url'], 'format_id': x['content_format'], - 'preference': self._known_formats.index(x['content_format']), - } for x in result['files'] if x['content_format'] in self._known_formats] + 'preference': self._KNOWN_FORMATS.index(x['content_format']), + } for x in result['files'] if x['content_format'] in self._KNOWN_FORMATS] self._sort_formats(formats) - if not formats: - raise ExtractorError('No media links available for %s' % video_id) - - duration = result['duration'] - compilation = result['compilation'] title = result['title'] + duration = int_or_none(result.get('duration')) + compilation = result.get('compilation') + episode = title if compilation else None + title = '%s - %s' % (compilation, title) if compilation is not None else title - previews = result['preview'] - previews.sort(key=lambda fmt: self._known_thumbnails.index(fmt['content_format'])) - thumbnail = previews[-1]['url'] if len(previews) > 0 else None + thumbnails = [{ + 'url': preview['url'], + 'id': preview.get('content_format'), + } for preview in result.get('preview', []) if preview.get('url')] + + webpage = self._download_webpage(url, video_id) + + season = self._search_regex( + r'<li[^>]+class="season active"[^>]*><a[^>]+>([^<]+)', + webpage, 'season', default=None) + season_number = int_or_none(self._search_regex( + r'<li[^>]+class="season active"[^>]*><a[^>]+data-season(?:-index)?="(\d+)"', + webpage, 'season number', default=None)) + + episode_number = int_or_none(self._search_regex( + r'<meta[^>]+itemprop="episode"[^>]*>\s*<meta[^>]+itemprop="episodeNumber"[^>]+content="(\d+)', + webpage, 'episode number', default=None)) - video_page = self._download_webpage(url, video_id, 'Downloading video page') - description = self._extract_description(video_page) - comment_count = self._extract_comment_count(video_page) + description = self._og_search_description(webpage, default=None) or self._html_search_meta( + 'description', webpage, 'description', default=None) return { 'id': video_id, 'title': title, - 'thumbnail': thumbnail, + 'series': compilation, + 'season': season, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + 'thumbnails': thumbnails, 'description': description, 'duration': duration, - 'comment_count': comment_count, 'formats': formats, } @@ -149,8 +157,11 @@ class IviCompilationIE(InfoExtractor): }] def _extract_entries(self, html, compilation_id): - return [self.url_result('http://www.ivi.ru/watch/%s/%s' % (compilation_id, serie), 'Ivi') - for serie in re.findall(r'<strong><a href="/watch/%s/(\d+)">(?:[^<]+)</a></strong>' % compilation_id, html)] + return [ + self.url_result( + 'http://www.ivi.ru/watch/%s/%s' % (compilation_id, serie), IviIE.ie_key()) + for serie in re.findall( + r'<a href="/watch/%s/(\d+)"[^>]+data-id="\1"' % compilation_id, html)] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -158,7 +169,8 @@ class IviCompilationIE(InfoExtractor): season_id = mobj.group('seasonid') if season_id is not None: # Season link - season_page = self._download_webpage(url, compilation_id, 'Downloading season %s web page' % season_id) + season_page = self._download_webpage( + url, compilation_id, 'Downloading season %s web page' % season_id) playlist_id = '%s/season%s' % (compilation_id, season_id) playlist_title = self._html_search_meta('title', season_page, 'title') entries = self._extract_entries(season_page, compilation_id) @@ -166,8 +178,9 @@ class IviCompilationIE(InfoExtractor): compilation_page = self._download_webpage(url, compilation_id, 'Downloading compilation web page') playlist_id = compilation_id playlist_title = self._html_search_meta('title', compilation_page, 'title') - seasons = re.findall(r'<a href="/watch/%s/season(\d+)">[^<]+</a>' % compilation_id, compilation_page) - if len(seasons) == 0: # No seasons in this compilation + seasons = re.findall( + r'<a href="/watch/%s/season(\d+)' % compilation_id, compilation_page) + if not seasons: # No seasons in this compilation entries = self._extract_entries(compilation_page, compilation_id) else: entries = [] diff --git a/youtube_dl/extractor/ivideon.py b/youtube_dl/extractor/ivideon.py new file mode 100644 index 000000000..617dc8c07 --- /dev/null +++ b/youtube_dl/extractor/ivideon.py @@ -0,0 +1,83 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse, + compat_urlparse, +) +from ..utils import qualities + + +class IvideonIE(InfoExtractor): + IE_NAME = 'ivideon' + IE_DESC = 'Ivideon TV' + _VALID_URL = r'https?://(?:www\.)?ivideon\.com/tv/(?:[^/]+/)*camera/(?P<id>\d+-[\da-f]+)/(?P<camera_id>\d+)' + _TESTS = [{ + 'url': 'https://www.ivideon.com/tv/camera/100-916ca13b5c4ad9f564266424a026386d/0/', + 'info_dict': { + 'id': '100-916ca13b5c4ad9f564266424a026386d', + 'ext': 'flv', + 'title': 're:^Касса [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'Основное предназначение - запись действий кассиров. Плюс общий вид.', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'https://www.ivideon.com/tv/camera/100-c4ee4cb9ede885cf62dfbe93d7b53783/589824/?lang=ru', + 'only_matching': True, + }, { + 'url': 'https://www.ivideon.com/tv/map/22.917923/-31.816406/16/camera/100-e7bc16c7d4b5bbd633fd5350b66dfa9a/0', + 'only_matching': True, + }] + + _QUALITIES = ('low', 'mid', 'hi') + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + server_id, camera_id = mobj.group('id'), mobj.group('camera_id') + camera_name, description = None, None + camera_url = compat_urlparse.urljoin( + url, '/tv/camera/%s/%s/' % (server_id, camera_id)) + + webpage = self._download_webpage(camera_url, server_id, fatal=False) + if webpage: + config_string = self._search_regex( + r'var\s+config\s*=\s*({.+?});', webpage, 'config', default=None) + if config_string: + config = self._parse_json(config_string, server_id, fatal=False) + camera_info = config.get('ivTvAppOptions', {}).get('currentCameraInfo') + if camera_info: + camera_name = camera_info.get('camera_name') + description = camera_info.get('misc', {}).get('description') + if not camera_name: + camera_name = self._html_search_meta( + 'name', webpage, 'camera name', default=None) or self._search_regex( + r'<h1[^>]+class="b-video-title"[^>]*>([^<]+)', webpage, 'camera name', default=None) + + quality = qualities(self._QUALITIES) + + formats = [{ + 'url': 'https://streaming.ivideon.com/flv/live?%s' % compat_urllib_parse.urlencode({ + 'server': server_id, + 'camera': camera_id, + 'sessionId': 'demo', + 'q': quality(format_id), + }), + 'format_id': format_id, + 'ext': 'flv', + 'quality': quality(format_id), + } for format_id in self._QUALITIES] + self._sort_formats(formats) + + return { + 'id': server_id, + 'title': self._live_title(camera_name or server_id), + 'description': description, + 'is_live': True, + 'formats': formats, + } diff --git a/youtube_dl/extractor/kanalplay.py b/youtube_dl/extractor/kanalplay.py index 4597d1b96..6c3498c67 100644 --- a/youtube_dl/extractor/kanalplay.py +++ b/youtube_dl/extractor/kanalplay.py @@ -49,7 +49,7 @@ class KanalPlayIE(InfoExtractor): subs = self._download_json( 'http://www.kanal%splay.se/api/subtitles/%s' % (channel_id, video_id), video_id, 'Downloading subtitles JSON', fatal=False) - return {'se': [{'ext': 'srt', 'data': self._fix_subtitles(subs)}]} if subs else {} + return {'sv': [{'ext': 'srt', 'data': self._fix_subtitles(subs)}]} if subs else {} def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/lemonde.py b/youtube_dl/extractor/lemonde.py new file mode 100644 index 000000000..be66fff03 --- /dev/null +++ b/youtube_dl/extractor/lemonde.py @@ -0,0 +1,34 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class LemondeIE(InfoExtractor): + _VALID_URL = r'https?://(?:.+?\.)?lemonde\.fr/(?:[^/]+/)*(?P<id>[^/]+)\.html' + _TESTS = [{ + 'url': 'http://www.lemonde.fr/police-justice/video/2016/01/19/comprendre-l-affaire-bygmalion-en-cinq-minutes_4849702_1653578.html', + 'md5': '01fb3c92de4c12c573343d63e163d302', + 'info_dict': { + 'id': 'lqm3kl', + 'ext': 'mp4', + 'title': "Comprendre l'affaire Bygmalion en 5 minutes", + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 320, + 'upload_date': '20160119', + 'timestamp': 1453194778, + 'uploader_id': '3pmkp', + }, + }, { + 'url': 'http://redaction.actu.lemonde.fr/societe/video/2016/01/18/calais-debut-des-travaux-de-defrichement-dans-la-jungle_4849233_3224.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + digiteka_url = self._proto_relative_url(self._search_regex( + r'url\s*:\s*(["\'])(?P<url>(?:https?://)?//(?:www\.)?(?:digiteka\.net|ultimedia\.com)/deliver/.+?)\1', + webpage, 'digiteka url', group='url')) + return self.url_result(digiteka_url, 'Digiteka') diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index be648000e..08bdae8a2 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import datetime import re import time +import base64 from .common import InfoExtractor from ..compat import ( @@ -16,7 +17,9 @@ from ..utils import ( parse_iso8601, sanitized_Request, int_or_none, + str_or_none, encode_data_uri, + url_basename, ) @@ -239,3 +242,80 @@ class LetvPlaylistIE(LetvTvIE): }, 'playlist_mincount': 7 }] + + +class LetvCloudIE(InfoExtractor): + IE_DESC = '乐视云' + _VALID_URL = r'https?://yuntv\.letv\.com/bcloud.html\?.+' + + _TESTS = [{ + 'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=467623dedf', + 'md5': '26450599afd64c513bc77030ad15db44', + 'info_dict': { + 'id': 'p7jnfw5hw9_467623dedf', + 'ext': 'mp4', + 'title': 'Video p7jnfw5hw9_467623dedf', + }, + }, { + 'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=ec93197892&pu=2c7cd40209&auto_play=1&gpcflag=1&width=640&height=360', + 'info_dict': { + 'id': 'p7jnfw5hw9_ec93197892', + 'ext': 'mp4', + 'title': 'Video p7jnfw5hw9_ec93197892', + }, + }, { + 'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=187060b6fd', + 'info_dict': { + 'id': 'p7jnfw5hw9_187060b6fd', + 'ext': 'mp4', + 'title': 'Video p7jnfw5hw9_187060b6fd', + }, + }] + + def _real_extract(self, url): + uu_mobj = re.search('uu=([\w]+)', url) + vu_mobj = re.search('vu=([\w]+)', url) + + if not uu_mobj or not vu_mobj: + raise ExtractorError('Invalid URL: %s' % url, expected=True) + + uu = uu_mobj.group(1) + vu = vu_mobj.group(1) + media_id = uu + '_' + vu + + play_json_req = sanitized_Request( + 'http://api.letvcloud.com/gpc.php?cf=html5&sign=signxxxxx&ver=2.2&format=json&' + + 'uu=' + uu + '&vu=' + vu) + play_json = self._download_json(play_json_req, media_id, 'Downloading playJson data') + + if not play_json.get('data'): + if play_json.get('message'): + raise ExtractorError('Letv cloud said: %s' % play_json['message'], expected=True) + elif play_json.get('code'): + raise ExtractorError('Letv cloud returned error %d' % play_json['code'], expected=True) + else: + raise ExtractorError('Letv cloud returned an unknwon error') + + def b64decode(s): + return base64.b64decode(s.encode('utf-8')).decode('utf-8') + + formats = [] + for media in play_json['data']['video_info']['media'].values(): + play_url = media['play_url'] + url = b64decode(play_url['main_url']) + decoded_url = b64decode(url_basename(url)) + formats.append({ + 'url': url, + 'ext': determine_ext(decoded_url), + 'format_id': int_or_none(play_url.get('vtype')), + 'format_note': str_or_none(play_url.get('definition')), + 'width': int_or_none(play_url.get('vwidth')), + 'height': int_or_none(play_url.get('vheight')), + }) + self._sort_formats(formats) + + return { + 'id': media_id, + 'title': 'Video %s' % media_id, + 'formats': formats, + } diff --git a/youtube_dl/extractor/lovehomeporn.py b/youtube_dl/extractor/lovehomeporn.py new file mode 100644 index 000000000..8f65a3c03 --- /dev/null +++ b/youtube_dl/extractor/lovehomeporn.py @@ -0,0 +1,37 @@ +from __future__ import unicode_literals + +import re + +from .nuevo import NuevoBaseIE + + +class LoveHomePornIE(NuevoBaseIE): + _VALID_URL = r'https?://(?:www\.)?lovehomeporn\.com/video/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?' + _TEST = { + 'url': 'http://lovehomeporn.com/video/48483/stunning-busty-brunette-girlfriend-sucking-and-riding-a-big-dick#menu', + 'info_dict': { + 'id': '48483', + 'display_id': 'stunning-busty-brunette-girlfriend-sucking-and-riding-a-big-dick', + 'ext': 'mp4', + 'title': 'Stunning busty brunette girlfriend sucking and riding a big dick', + 'age_limit': 18, + 'duration': 238.47, + }, + 'params': { + 'skip_download': True, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + info = self._extract_nuevo( + 'http://lovehomeporn.com/media/nuevo/config.php?key=%s' % video_id, + video_id) + info.update({ + 'display_id': display_id, + 'age_limit': 18 + }) + return info diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index 88334889e..425fc9e2a 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -17,7 +17,7 @@ class MDRIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z]+(?P<id>\d+)(?:_.+?)?\.html' _TESTS = [{ - # MDR regularily deletes its videos + # MDR regularly deletes its videos 'url': 'http://www.mdr.de/fakt/video189002.html', 'only_matching': True, }, { diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 340c922bd..1dd54c2f1 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -100,7 +100,7 @@ class NBCSportsVPlayerIE(InfoExtractor): class NBCSportsIE(InfoExtractor): - # Does not include https becuase its certificate is invalid + # Does not include https because its certificate is invalid _VALID_URL = r'http://www\.nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)' _TEST = { diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py index 15eca825a..7830616f8 100644 --- a/youtube_dl/extractor/neteasemusic.py +++ b/youtube_dl/extractor/neteasemusic.py @@ -12,7 +12,10 @@ from ..compat import ( compat_str, compat_itertools_count, ) -from ..utils import sanitized_Request +from ..utils import ( + sanitized_Request, + float_or_none, +) class NetEaseMusicBaseIE(InfoExtractor): @@ -32,23 +35,32 @@ class NetEaseMusicBaseIE(InfoExtractor): result = b64encode(m.digest()).decode('ascii') return result.replace('/', '_').replace('+', '-') - @classmethod - def extract_formats(cls, info): + def extract_formats(self, info): formats = [] - for song_format in cls._FORMATS: + for song_format in self._FORMATS: details = info.get(song_format) if not details: continue - formats.append({ - 'url': 'http://m5.music.126.net/%s/%s.%s' % - (cls._encrypt(details['dfsId']), details['dfsId'], - details['extension']), - 'ext': details.get('extension'), - 'abr': details.get('bitrate', 0) / 1000, - 'format_id': song_format, - 'filesize': details.get('size'), - 'asr': details.get('sr') - }) + song_file_path = '/%s/%s.%s' % ( + self._encrypt(details['dfsId']), details['dfsId'], details['extension']) + + # 203.130.59.9, 124.40.233.182, 115.231.74.139, etc is a reverse proxy-like feature + # from NetEase's CDN provider that can be used if m5.music.126.net does not + # work, especially for users outside of Mainland China + # via: https://github.com/JixunMoe/unblock-163/issues/3#issuecomment-163115880 + for host in ('http://m5.music.126.net', 'http://115.231.74.139/m1.music.126.net', + 'http://124.40.233.182/m1.music.126.net', 'http://203.130.59.9/m1.music.126.net'): + song_url = host + song_file_path + if self._is_valid_url(song_url, info['id'], 'song'): + formats.append({ + 'url': song_url, + 'ext': details.get('extension'), + 'abr': float_or_none(details.get('bitrate'), scale=1000), + 'format_id': song_format, + 'filesize': details.get('size'), + 'asr': details.get('sr') + }) + break return formats @classmethod diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py index e98a5ef89..8d5ce46ad 100644 --- a/youtube_dl/extractor/nhl.py +++ b/youtube_dl/extractor/nhl.py @@ -223,7 +223,7 @@ class NHLVideocenterIE(NHLBaseInfoExtractor): response = self._download_webpage(request_url, playlist_title) response = self._fix_json(response) if not response.strip(): - self._downloader.report_warning('Got an empty reponse, trying ' + self._downloader.report_warning('Got an empty response, trying ' 'adding the "newvideos" parameter') response = self._download_webpage(request_url + '&newvideos=true', playlist_title) diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index fd107aca2..916a102bf 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -71,6 +71,7 @@ class NowTVBaseIE(InfoExtractor): class NowTVIE(NowTVBaseIE): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<show_id>[^/]+)/(?:(?:list/[^/]+|jahr/\d{4}/\d{1,2})/)?(?P<id>[^/]+)/(?:player|preview)' _TESTS = [{ diff --git a/youtube_dl/extractor/npr.py b/youtube_dl/extractor/npr.py new file mode 100644 index 000000000..125c7010b --- /dev/null +++ b/youtube_dl/extractor/npr.py @@ -0,0 +1,82 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urllib_parse +from ..utils import ( + int_or_none, + qualities, +) + + +class NprIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?npr\.org/player/v2/mediaPlayer\.html\?.*\bid=(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.npr.org/player/v2/mediaPlayer.html?id=449974205', + 'info_dict': { + 'id': '449974205', + 'title': 'New Music From Beach House, Chairlift, CMJ Discoveries And More' + }, + 'playlist_count': 7, + }, { + 'url': 'http://www.npr.org/player/v2/mediaPlayer.html?action=1&t=1&islist=false&id=446928052&m=446929930&live=1', + 'info_dict': { + 'id': '446928052', + 'title': "Songs We Love: Tigran Hamasyan, 'Your Mercy is Boundless'" + }, + 'playlist': [{ + 'md5': '12fa60cb2d3ed932f53609d4aeceabf1', + 'info_dict': { + 'id': '446929930', + 'ext': 'mp3', + 'title': 'Your Mercy is Boundless (Bazum en Qo gtutyunqd)', + 'duration': 402, + }, + }], + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + config = self._download_json( + 'http://api.npr.org/query?%s' % compat_urllib_parse.urlencode({ + 'id': playlist_id, + 'fields': 'titles,audio,show', + 'format': 'json', + 'apiKey': 'MDAzMzQ2MjAyMDEyMzk4MTU1MDg3ZmM3MQ010', + }), playlist_id) + + story = config['list']['story'][0] + + KNOWN_FORMATS = ('threegp', 'mp4', 'mp3') + quality = qualities(KNOWN_FORMATS) + + entries = [] + for audio in story.get('audio', []): + title = audio.get('title', {}).get('$text') + duration = int_or_none(audio.get('duration', {}).get('$text')) + formats = [] + for format_id, formats_entry in audio.get('format', {}).items(): + if not formats_entry: + continue + if isinstance(formats_entry, list): + formats_entry = formats_entry[0] + format_url = formats_entry.get('$text') + if not format_url: + continue + if format_id in KNOWN_FORMATS: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'ext': formats_entry.get('type'), + 'quality': quality(format_id), + }) + self._sort_formats(formats) + entries.append({ + 'id': audio['id'], + 'title': title, + 'duration': duration, + 'formats': formats, + }) + + playlist_title = story.get('title', {}).get('$text') + return self.playlist_result(entries, playlist_id, playlist_title) diff --git a/youtube_dl/extractor/ntvde.py b/youtube_dl/extractor/ntvde.py index d2cfe0961..a83e85cb8 100644 --- a/youtube_dl/extractor/ntvde.py +++ b/youtube_dl/extractor/ntvde.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( int_or_none, js_to_json, @@ -34,7 +35,7 @@ class NTVDeIE(InfoExtractor): webpage = self._download_webpage(url, video_id) info = self._parse_json(self._search_regex( - r'(?s)ntv.pageInfo.article =\s(\{.*?\});', webpage, 'info'), + r'(?s)ntv\.pageInfo\.article\s*=\s*(\{.*?\});', webpage, 'info'), video_id, transform_source=js_to_json) timestamp = int_or_none(info.get('publishedDateAsUnixTimeStamp')) vdata = self._parse_json(self._search_regex( @@ -42,18 +43,24 @@ class NTVDeIE(InfoExtractor): webpage, 'player data'), video_id, transform_source=js_to_json) duration = parse_duration(vdata.get('duration')) - formats = [{ - 'format_id': 'flash', - 'url': 'rtmp://fms.n-tv.de/' + vdata['video'], - }, { - 'format_id': 'mobile', - 'url': 'http://video.n-tv.de' + vdata['videoMp4'], - 'tbr': 400, # estimation - }] - m3u8_url = 'http://video.n-tv.de' + vdata['videoM3u8'] - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', - entry_protocol='m3u8_native', preference=0)) + + formats = [] + if vdata.get('video'): + formats.append({ + 'format_id': 'flash', + 'url': 'rtmp://fms.n-tv.de/%s' % vdata['video'], + }) + if vdata.get('videoMp4'): + formats.append({ + 'format_id': 'mobile', + 'url': compat_urlparse.urljoin('http://video.n-tv.de', vdata['videoMp4']), + 'tbr': 400, # estimation + }) + if vdata.get('videoM3u8'): + m3u8_url = compat_urlparse.urljoin('http://video.n-tv.de', vdata['videoM3u8']) + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', + preference=0, m3u8_id='hls', fatal=False)) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/nuevo.py b/youtube_dl/extractor/nuevo.py new file mode 100644 index 000000000..ef093dec2 --- /dev/null +++ b/youtube_dl/extractor/nuevo.py @@ -0,0 +1,38 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..utils import ( + float_or_none, + xpath_text +) + + +class NuevoBaseIE(InfoExtractor): + def _extract_nuevo(self, config_url, video_id): + config = self._download_xml( + config_url, video_id, transform_source=lambda s: s.strip()) + + title = xpath_text(config, './title', 'title', fatal=True).strip() + video_id = xpath_text(config, './mediaid', default=video_id) + thumbnail = xpath_text(config, ['./image', './thumb']) + duration = float_or_none(xpath_text(config, './duration')) + + formats = [] + for element_name, format_id in (('file', 'sd'), ('filehd', 'hd')): + video_url = xpath_text(config, element_name) + if video_url: + formats.append({ + 'url': video_url, + 'format_id': format_id, + }) + self._check_formats(formats, video_id) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats + } diff --git a/youtube_dl/extractor/ora.py b/youtube_dl/extractor/ora.py index 9c4255a2d..8545fb1b8 100644 --- a/youtube_dl/extractor/ora.py +++ b/youtube_dl/extractor/ora.py @@ -21,7 +21,6 @@ class OraTVIE(InfoExtractor): 'ext': 'mp4', 'title': 'Vine & YouTube Stars Zach King & King Bach On Their Viral Videos!', 'description': 'md5:ebbc5b1424dd5dba7be7538148287ac1', - 'duration': 1477, } } @@ -30,14 +29,14 @@ class OraTVIE(InfoExtractor): webpage = self._download_webpage(url, display_id) video_data = self._search_regex( - r'"current"\s*:\s*({[^}]+?})', webpage, 'current video') + r'"(?:video|current)"\s*:\s*({[^}]+?})', webpage, 'current video') m3u8_url = self._search_regex( - r'"hls_stream"\s*:\s*"([^"]+)', video_data, 'm3u8 url', None) + r'hls_stream"?\s*:\s*"([^"]+)', video_data, 'm3u8 url', None) if m3u8_url: formats = self._extract_m3u8_formats( m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - # simular to GameSpotIE + # similar to GameSpotIE m3u8_path = compat_urlparse.urlparse(m3u8_url).path QUALITIES_RE = r'((,[a-z]+\d+)+,?)' available_qualities = self._search_regex( @@ -62,14 +61,12 @@ class OraTVIE(InfoExtractor): return { 'id': self._search_regex( - r'"video_id"\s*:\s*(\d+)', video_data, 'video id'), + r'"id"\s*:\s*(\d+)', video_data, 'video id', default=display_id), 'display_id': display_id, 'title': unescapeHTML(self._og_search_title(webpage)), 'description': get_element_by_attribute( 'class', 'video_txt_decription', webpage), 'thumbnail': self._proto_relative_url(self._search_regex( r'"thumb"\s*:\s*"([^"]+)', video_data, 'thumbnail', None)), - 'duration': int(self._search_regex( - r'"duration"\s*:\s*(\d+)', video_data, 'duration')), 'formats': formats, } diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 2e6c9872b..c54775d54 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -170,7 +170,21 @@ class ORFOE1IE(InfoExtractor): class ORFFM4IE(InfoExtractor): IE_NAME = 'orf:fm4' IE_DESC = 'radio FM4' - _VALID_URL = r'http://fm4\.orf\.at/7tage/?#(?P<date>[0-9]+)/(?P<show>\w+)' + _VALID_URL = r'http://fm4\.orf\.at/(?:7tage/?#|player/)(?P<date>[0-9]+)/(?P<show>\w+)' + + _TEST = { + 'url': 'http://fm4.orf.at/player/20160110/IS/', + 'md5': '01e736e8f1cef7e13246e880a59ad298', + 'info_dict': { + 'id': '2016-01-10_2100_tl_54_7DaysSun13_11244', + 'ext': 'mp3', + 'title': 'Im Sumpf', + 'description': 'md5:384c543f866c4e422a55f66a62d669cd', + 'duration': 7173, + 'timestamp': 1452456073, + 'upload_date': '20160110', + }, + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index 55c11b3bf..12e1c2862 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -232,7 +232,7 @@ class PluralsightIE(PluralsightBaseIE): # { a = author, cn = clip_id, lc = end, m = name } return { - 'id': clip['clipName'], + 'id': clip.get('clipName') or clip['name'], 'title': '%s - %s' % (module['title'], clip['title']), 'duration': int_or_none(clip.get('duration')) or parse_duration(clip.get('formattedDuration')), 'creator': author, diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index baa54a3af..670e6950f 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -20,7 +20,7 @@ from ..utils import ( class ProSiebenSat1IE(InfoExtractor): IE_NAME = 'prosiebensat1' IE_DESC = 'ProSiebenSat.1 Digital' - _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at|ch)|ran\.de|fem\.com)/(?P<id>.+)' + _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany|7tv)\.(?:de|at|ch)|ran\.de|fem\.com)/(?P<id>.+)' _TESTS = [ { @@ -32,7 +32,7 @@ class ProSiebenSat1IE(InfoExtractor): 'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge', 'info_dict': { 'id': '2104602', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Episode 18 - Staffel 2', 'description': 'md5:8733c81b702ea472e069bc48bb658fc1', 'upload_date': '20131231', @@ -138,14 +138,13 @@ class ProSiebenSat1IE(InfoExtractor): 'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip', 'info_dict': { 'id': '2572814', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Andreas Kümmert: Rocket Man', 'description': 'md5:6ddb02b0781c6adf778afea606652e38', 'upload_date': '20131017', 'duration': 469.88, }, 'params': { - # rtmp download 'skip_download': True, }, }, @@ -153,13 +152,12 @@ class ProSiebenSat1IE(InfoExtractor): 'url': 'http://www.fem.com/wellness/videos/wellness-video-clip-kurztripps-zum-valentinstag.html', 'info_dict': { 'id': '2156342', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Kurztrips zum Valentinstag', - 'description': 'Romantischer Kurztrip zum Valentinstag? Wir verraten, was sich hier wirklich lohnt.', + 'description': 'Romantischer Kurztrip zum Valentinstag? Nina Heinemann verrät, was sich hier wirklich lohnt.', 'duration': 307.24, }, 'params': { - # rtmp download 'skip_download': True, }, }, @@ -172,12 +170,26 @@ class ProSiebenSat1IE(InfoExtractor): }, 'playlist_count': 2, }, + { + 'url': 'http://www.7tv.de/circus-halligalli/615-best-of-circus-halligalli-ganze-folge', + 'info_dict': { + 'id': '4187506', + 'ext': 'flv', + 'title': 'Best of Circus HalliGalli', + 'description': 'md5:8849752efd90b9772c9db6fdf87fb9e9', + 'upload_date': '20151229', + }, + 'params': { + 'skip_download': True, + }, + }, ] _CLIPID_REGEXES = [ r'"clip_id"\s*:\s+"(\d+)"', r'clipid: "(\d+)"', r'clip[iI]d=(\d+)', + r'clip[iI]d\s*=\s*["\'](\d+)', r"'itemImageUrl'\s*:\s*'/dynamic/thumbnails/full/\d+/(\d+)", ] _TITLE_REGEXES = [ @@ -186,12 +198,16 @@ class ProSiebenSat1IE(InfoExtractor): r'<!-- start video -->\s*<h1>(.+?)</h1>', r'<h1 class="att-name">\s*(.+?)</h1>', r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>', + r'<h2 class="video-title" itemprop="name">\s*(.+?)</h2>', + r'<div[^>]+id="veeseoTitle"[^>]*>(.+?)</div>', ] _DESCRIPTION_REGEXES = [ r'<p itemprop="description">\s*(.+?)</p>', r'<div class="videoDecription">\s*<p><strong>Beschreibung</strong>: (.+?)</p>', r'<div class="g-plusone" data-size="medium"></div>\s*</div>\s*</header>\s*(.+?)\s*<footer>', r'<p class="att-description">\s*(.+?)\s*</p>', + r'<p class="video-description" itemprop="description">\s*(.+?)</p>', + r'<div[^>]+id="veeseoDescription"[^>]*>(.+?)</div>', ] _UPLOAD_DATE_REGEXES = [ r'<meta property="og:published_time" content="(.+?)">', diff --git a/youtube_dl/extractor/rte.py b/youtube_dl/extractor/rte.py index d9cfbf180..47c8331fe 100644 --- a/youtube_dl/extractor/rte.py +++ b/youtube_dl/extractor/rte.py @@ -2,19 +2,22 @@ from __future__ import unicode_literals from .common import InfoExtractor - from ..utils import ( float_or_none, + parse_iso8601, + unescapeHTML, ) class RteIE(InfoExtractor): + IE_NAME = 'rte' + IE_DESC = 'Raidió Teilifís Éireann TV' _VALID_URL = r'https?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/[^/]+/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.rte.ie/player/ie/show/iwitness-862/10478715/', 'info_dict': { 'id': '10478715', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Watch iWitness online', 'thumbnail': 're:^https?://.*\.jpg$', 'description': 'iWitness : The spirit of Ireland, one voice and one minute at a time.', @@ -44,13 +47,6 @@ class RteIE(InfoExtractor): # f4m_url = server + relative_url f4m_url = json_string['shows'][0]['media:group'][0]['rte:server'] + json_string['shows'][0]['media:group'][0]['url'] f4m_formats = self._extract_f4m_formats(f4m_url, video_id) - f4m_formats = [{ - 'format_id': f['format_id'], - 'url': f['url'], - 'ext': 'mp4', - 'width': f['width'], - 'height': f['height'], - } for f in f4m_formats] return { 'id': video_id, @@ -60,3 +56,73 @@ class RteIE(InfoExtractor): 'thumbnail': thumbnail, 'duration': duration, } + + +class RteRadioIE(InfoExtractor): + IE_NAME = 'rte:radio' + IE_DESC = 'Raidió Teilifís Éireann radio' + # Radioplayer URLs have the specifier #!rii=<channel_id>:<id>:<playable_item_id>:<date>: + # where the IDs are int/empty, the date is DD-MM-YYYY, and the specifier may be truncated. + # An <id> uniquely defines an individual recording, and is the only part we require. + _VALID_URL = r'https?://(?:www\.)?rte\.ie/radio/utils/radioplayer/rteradioweb\.html#!rii=(?:[0-9]*)(?:%3A|:)(?P<id>[0-9]+)' + + _TEST = { + 'url': 'http://www.rte.ie/radio/utils/radioplayer/rteradioweb.html#!rii=16:10507902:2414:27-12-2015:', + 'info_dict': { + 'id': '10507902', + 'ext': 'mp4', + 'title': 'Gloria', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'md5:9ce124a7fb41559ec68f06387cabddf0', + 'timestamp': 1451203200, + 'upload_date': '20151227', + 'duration': 7230.0, + }, + 'params': { + 'skip_download': 'f4m fails with --test atm' + } + } + + def _real_extract(self, url): + item_id = self._match_id(url) + + json_string = self._download_json( + 'http://www.rte.ie/rteavgen/getplaylist/?type=web&format=json&id=' + item_id, + item_id) + + # NB the string values in the JSON are stored using XML escaping(!) + show = json_string['shows'][0] + title = unescapeHTML(show['title']) + description = unescapeHTML(show.get('description')) + thumbnail = show.get('thumbnail') + duration = float_or_none(show.get('duration'), 1000) + timestamp = parse_iso8601(show.get('published')) + + mg = show['media:group'][0] + + formats = [] + + if mg.get('url') and not mg['url'].startswith('rtmpe:'): + formats.append({'url': mg['url']}) + + if mg.get('hls_server') and mg.get('hls_url'): + formats.extend(self._extract_m3u8_formats( + mg['hls_server'] + mg['hls_url'], item_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + + if mg.get('hds_server') and mg.get('hds_url'): + formats.extend(self._extract_f4m_formats( + mg['hds_server'] + mg['hds_url'], item_id, + f4m_id='hds', fatal=False)) + + self._sort_formats(formats) + + return { + 'id': item_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/ruleporn.py b/youtube_dl/extractor/ruleporn.py new file mode 100644 index 000000000..ebf9808d5 --- /dev/null +++ b/youtube_dl/extractor/ruleporn.py @@ -0,0 +1,44 @@ +from __future__ import unicode_literals + +from .nuevo import NuevoBaseIE + + +class RulePornIE(NuevoBaseIE): + _VALID_URL = r'https?://(?:www\.)?ruleporn\.com/(?:[^/?#&]+/)*(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'http://ruleporn.com/brunette-nympho-chick-takes-her-boyfriend-in-every-angle/', + 'md5': '86861ebc624a1097c7c10eaf06d7d505', + 'info_dict': { + 'id': '48212', + 'display_id': 'brunette-nympho-chick-takes-her-boyfriend-in-every-angle', + 'ext': 'mp4', + 'title': 'Brunette Nympho Chick Takes Her Boyfriend In Every Angle', + 'description': 'md5:6d28be231b981fff1981deaaa03a04d5', + 'age_limit': 18, + 'duration': 635.1, + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + r'lovehomeporn\.com/embed/(\d+)', webpage, 'video id') + + title = self._search_regex( + r'<h2[^>]+title=(["\'])(?P<url>.+?)\1', + webpage, 'title', group='url') + description = self._html_search_meta('description', webpage) + + info = self._extract_nuevo( + 'http://lovehomeporn.com/media/nuevo/econfig.php?key=%s&rp=true' % video_id, + video_id) + info.update({ + 'display_id': display_id, + 'title': title, + 'description': description, + 'age_limit': 18 + }) + return info diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py index f76fb12c0..1178b7a27 100644 --- a/youtube_dl/extractor/shahid.py +++ b/youtube_dl/extractor/shahid.py @@ -73,6 +73,9 @@ class ShahidIE(InfoExtractor): 'https://shahid.mbc.net/arContent/getPlayerContent-param-.id-%s.type-%s.html' % (video_id, api_vars['type']), video_id, 'Downloading player JSON') + if player.get('drm'): + raise ExtractorError('This video is DRM protected.', expected=True) + formats = self._extract_m3u8_formats(player['url'], video_id, 'mp4') video = self._download_json( diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 02e64e094..b2d5487ca 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -384,27 +384,24 @@ class SoundcloudUserIE(SoundcloudIE): resource = mobj.group('rsrc') or 'all' base_url = self._BASE_URL_MAP[resource] % user['id'] - next_href = None + COMMON_QUERY = { + 'limit': 50, + 'client_id': self._CLIENT_ID, + 'linked_partitioning': '1', + } + + query = COMMON_QUERY.copy() + query['offset'] = 0 + + next_href = base_url + '?' + compat_urllib_parse.urlencode(query) entries = [] for i in itertools.count(): - if not next_href: - data = compat_urllib_parse.urlencode({ - 'offset': i * 50, - 'limit': 50, - 'client_id': self._CLIENT_ID, - 'linked_partitioning': '1', - 'representation': 'speedy', - }) - next_href = base_url + '?' + data - response = self._download_json( next_href, uploader, 'Downloading track page %s' % (i + 1)) collection = response['collection'] - if not collection: - self.to_screen('%s: End page received' % uploader) break def resolve_permalink_url(candidates): @@ -419,12 +416,15 @@ class SoundcloudUserIE(SoundcloudIE): if permalink_url: entries.append(self.url_result(permalink_url)) - if 'next_href' in response: - next_href = response['next_href'] - if not next_href: - break - else: - next_href = None + next_href = response.get('next_href') + if not next_href: + break + + parsed_next_href = compat_urlparse.urlparse(response['next_href']) + qs = compat_urlparse.parse_qs(parsed_next_href.query) + qs.update(COMMON_QUERY) + next_href = compat_urlparse.urlunparse( + parsed_next_href._replace(query=compat_urllib_parse.urlencode(qs, True))) return { '_type': 'playlist', diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index fc20f664b..399c3b8ee 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -37,6 +37,14 @@ class SVTBaseIE(InfoExtractor): }) self._sort_formats(formats) + subtitles = {} + subtitle_references = video_info.get('subtitleReferences') + if isinstance(subtitle_references, list): + for sr in subtitle_references: + subtitle_url = sr.get('url') + if subtitle_url: + subtitles.setdefault('sv', []).append({'url': subtitle_url}) + duration = video_info.get('materialLength') age_limit = 18 if video_info.get('inappropriateForChildren') else 0 @@ -44,6 +52,7 @@ class SVTBaseIE(InfoExtractor): 'id': video_id, 'title': title, 'formats': formats, + 'subtitles': subtitles, 'thumbnail': thumbnail, 'duration': duration, 'age_limit': age_limit, @@ -83,30 +92,23 @@ class SVTIE(SVTBaseIE): class SVTPlayIE(SVTBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' _VALID_URL = r'https?://(?:www\.)?(?P<host>svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)' - _TESTS = [{ - 'url': 'http://www.svtplay.se/video/2609989/sm-veckan/sm-veckan-rally-final-sasong-1-sm-veckan-rally-final', - 'md5': 'ade3def0643fa1c40587a422f98edfd9', - 'info_dict': { - 'id': '2609989', - 'ext': 'flv', - 'title': 'SM veckan vinter, Örebro - Rally, final', - 'duration': 4500, - 'thumbnail': 're:^https?://.*[\.-]jpg$', - 'age_limit': 0, - }, - }, { - 'url': 'http://www.oppetarkiv.se/video/1058509/rederiet-sasong-1-avsnitt-1-av-318', - 'md5': 'c3101a17ce9634f4c1f9800f0746c187', + _TEST = { + 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', + 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', 'info_dict': { - 'id': '1058509', - 'ext': 'flv', - 'title': 'Farlig kryssning', - 'duration': 2566, + 'id': '5996901', + 'ext': 'mp4', + 'title': 'Flygplan till Haile Selassie', + 'duration': 3527, 'thumbnail': 're:^https?://.*[\.-]jpg$', 'age_limit': 0, + 'subtitles': { + 'sv': [{ + 'ext': 'wsrt', + }] + }, }, - 'skip': 'Only works from Sweden', - }] + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/testurl.py b/youtube_dl/extractor/testurl.py index c7d559315..46918adb0 100644 --- a/youtube_dl/extractor/testurl.py +++ b/youtube_dl/extractor/testurl.py @@ -7,7 +7,7 @@ from ..utils import ExtractorError class TestURLIE(InfoExtractor): - """ Allows adressing of the test cases as test:yout.*be_1 """ + """ Allows addressing of the test cases as test:yout.*be_1 """ IE_DESC = False # Do not list _VALID_URL = r'test(?:url)?:(?P<id>(?P<extractor>.+?)(?:_(?P<num>[0-9]+))?)$' diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 0bf6726b5..10f2cad55 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -85,7 +85,7 @@ class ThePlatformBaseIE(InfoExtractor): class ThePlatformIE(ThePlatformBaseIE): _VALID_URL = r'''(?x) (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/ - (?:(?P<media>(?:[^/]+/)+select/media/)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))? + (?:(?P<media>(?:(?:[^/]+/)+select/)?media/)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))? |theplatform:)(?P<id>[^/\?&]+)''' _TESTS = [{ diff --git a/youtube_dl/extractor/trollvids.py b/youtube_dl/extractor/trollvids.py new file mode 100644 index 000000000..d239949a6 --- /dev/null +++ b/youtube_dl/extractor/trollvids.py @@ -0,0 +1,36 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .nuevo import NuevoBaseIE + + +class TrollvidsIE(NuevoBaseIE): + _VALID_URL = r'http://(?:www\.)?trollvids\.com/video/(?P<id>\d+)/(?P<display_id>[^/?#&]+)' + IE_NAME = 'trollvids' + _TEST = { + 'url': 'http://trollvids.com/video/2349002/%E3%80%90MMD-R-18%E3%80%91%E3%82%AC%E3%83%BC%E3%83%AB%E3%83%95%E3%83%AC%E3%83%B3%E3%83%89-carrymeoff', + 'md5': '1d53866b2c514b23ed69e4352fdc9839', + 'info_dict': { + 'id': '2349002', + 'ext': 'mp4', + 'title': '【MMD R-18】ガールフレンド carry_me_off', + 'age_limit': 18, + 'duration': 216.78, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + info = self._extract_nuevo( + 'http://trollvids.com/nuevo/player/config.php?v=%s' % video_id, + video_id) + info.update({ + 'display_id': display_id, + 'age_limit': 18 + }) + return info diff --git a/youtube_dl/extractor/trutube.py b/youtube_dl/extractor/trutube.py index e7b79243a..d55e0c563 100644 --- a/youtube_dl/extractor/trutube.py +++ b/youtube_dl/extractor/trutube.py @@ -1,11 +1,10 @@ from __future__ import unicode_literals -from .common import InfoExtractor -from ..utils import xpath_text +from .nuevo import NuevoBaseIE -class TruTubeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?trutube\.tv/(?:video/|nuevo/player/embed\.php\?v=)(?P<id>[0-9]+)' +class TruTubeIE(NuevoBaseIE): + _VALID_URL = r'https?://(?:www\.)?trutube\.tv/(?:video/|nuevo/player/embed\.php\?v=)(?P<id>\d+)' _TESTS = [{ 'url': 'http://trutube.tv/video/14880/Ramses-II-Proven-To-Be-A-Red-Headed-Caucasoid-', 'md5': 'c5b6e301b0a2040b074746cbeaa26ca1', @@ -22,19 +21,6 @@ class TruTubeIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - - config = self._download_xml( + return self._extract_nuevo( 'https://trutube.tv/nuevo/player/config.php?v=%s' % video_id, - video_id, transform_source=lambda s: s.strip()) - - # filehd is always 404 - video_url = xpath_text(config, './file', 'video URL', fatal=True) - title = xpath_text(config, './title', 'title').strip() - thumbnail = xpath_text(config, './image', ' thumbnail') - - return { - 'id': video_id, - 'url': video_url, - 'title': title, - 'thumbnail': thumbnail, - } + video_id) diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index 46ef61ff5..1d9271d1e 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -1,10 +1,9 @@ from __future__ import unicode_literals -import json import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse +from ..compat import compat_str from ..utils import ( int_or_none, sanitized_Request, @@ -15,25 +14,23 @@ from ..aes import aes_decrypt_text class Tube8IE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/(?P<id>\d+)' - _TESTS = [ - { - 'url': 'http://www.tube8.com/teen/kasia-music-video/229795/', - 'md5': '44bf12b98313827dd52d35b8706a4ea0', - 'info_dict': { - 'id': '229795', - 'display_id': 'kasia-music-video', - 'ext': 'mp4', - 'description': 'hot teen Kasia grinding', - 'uploader': 'unknown', - 'title': 'Kasia music video', - 'age_limit': 18, - } - }, - { - 'url': 'http://www.tube8.com/shemale/teen/blonde-cd-gets-kidnapped-by-two-blacks-and-punished-for-being-a-slutty-girl/19569151/', - 'only_matching': True, - }, - ] + _TESTS = [{ + 'url': 'http://www.tube8.com/teen/kasia-music-video/229795/', + 'md5': '65e20c48e6abff62ed0c3965fff13a39', + 'info_dict': { + 'id': '229795', + 'display_id': 'kasia-music-video', + 'ext': 'mp4', + 'description': 'hot teen Kasia grinding', + 'uploader': 'unknown', + 'title': 'Kasia music video', + 'age_limit': 18, + 'duration': 230, + } + }, { + 'url': 'http://www.tube8.com/shemale/teen/blonde-cd-gets-kidnapped-by-two-blacks-and-punished-for-being-a-slutty-girl/19569151/', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -44,14 +41,28 @@ class Tube8IE(InfoExtractor): req.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(req, display_id) - flashvars = json.loads(self._html_search_regex( - r'flashvars\s*=\s*({.+?});\r?\n', webpage, 'flashvars')) + flashvars = self._parse_json( + self._search_regex( + r'flashvars\s*=\s*({.+?});\r?\n', webpage, 'flashvars'), + video_id) - video_url = flashvars['video_url'] - if flashvars.get('encrypted') is True: - video_url = aes_decrypt_text(video_url, flashvars['video_title'], 32).decode('utf-8') - path = compat_urllib_parse_urlparse(video_url).path - format_id = '-'.join(path.split('/')[4].split('_')[:2]) + formats = [] + for key, video_url in flashvars.items(): + if not isinstance(video_url, compat_str) or not video_url.startswith('http'): + continue + height = self._search_regex( + r'quality_(\d+)[pP]', key, 'height', default=None) + if not height: + continue + if flashvars.get('encrypted') is True: + video_url = aes_decrypt_text( + video_url, flashvars['video_title'], 32).decode('utf-8') + formats.append({ + 'url': video_url, + 'format_id': '%sp' % height, + 'height': int(height), + }) + self._sort_formats(formats) thumbnail = flashvars.get('image_url') @@ -62,32 +73,31 @@ class Tube8IE(InfoExtractor): uploader = self._html_search_regex( r'<span class="username">\s*(.+?)\s*<', webpage, 'uploader', fatal=False) + duration = int_or_none(flashvars.get('video_duration')) - like_count = int_or_none(self._html_search_regex( + like_count = int_or_none(self._search_regex( r'rupVar\s*=\s*"(\d+)"', webpage, 'like count', fatal=False)) - dislike_count = int_or_none(self._html_search_regex( + dislike_count = int_or_none(self._search_regex( r'rdownVar\s*=\s*"(\d+)"', webpage, 'dislike count', fatal=False)) - view_count = self._html_search_regex( - r'<strong>Views: </strong>([\d,\.]+)\s*</li>', webpage, 'view count', fatal=False) - if view_count: - view_count = str_to_int(view_count) - comment_count = self._html_search_regex( - r'<span id="allCommentsCount">(\d+)</span>', webpage, 'comment count', fatal=False) - if comment_count: - comment_count = str_to_int(comment_count) + view_count = str_to_int(self._search_regex( + r'<strong>Views: </strong>([\d,\.]+)\s*</li>', + webpage, 'view count', fatal=False)) + comment_count = str_to_int(self._search_regex( + r'<span id="allCommentsCount">(\d+)</span>', + webpage, 'comment count', fatal=False)) return { 'id': video_id, 'display_id': display_id, - 'url': video_url, 'title': title, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, - 'format_id': format_id, + 'duration': duration, 'view_count': view_count, 'like_count': like_count, 'dislike_count': dislike_count, 'comment_count': comment_count, 'age_limit': 18, + 'formats': formats, } diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index 5f7ac4b35..da3cd76f7 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -4,10 +4,16 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str +from ..utils import ( + int_or_none, + float_or_none, + unescapeHTML, +) class TudouIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/([^/]+/)*(?P<id>[^/?#]+?)(?:\.html)?/?(?:$|[?#])' + IE_NAME = 'tudou' + _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:(?:programs|wlplay)/view|(?:listplay|albumplay)/[\w-]{11})/(?P<id>[\w-]{11})' _TESTS = [{ 'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html', 'md5': '140a49ed444bd22f93330985d8475fcb', @@ -16,6 +22,11 @@ class TudouIE(InfoExtractor): 'ext': 'f4v', 'title': '卡马乔国足开大脚长传冲吊集锦', 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1372113489000, + 'description': '卡马乔卡家军,开大脚先进战术不完全集锦!', + 'duration': 289.04, + 'view_count': int, + 'filesize': int, } }, { 'url': 'http://www.tudou.com/programs/view/ajX3gyhL0pc/', @@ -24,10 +35,12 @@ class TudouIE(InfoExtractor): 'ext': 'f4v', 'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012', 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1349207518000, + 'description': 'md5:294612423894260f2dcd5c6c04fe248b', + 'duration': 5478.33, + 'view_count': int, + 'filesize': int, } - }, { - 'url': 'http://www.tudou.com/albumplay/cJAHGih4yYg.html', - 'only_matching': True, }] _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf' @@ -42,24 +55,20 @@ class TudouIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + item_data = self._download_json( + 'http://www.tudou.com/tvp/getItemInfo.action?ic=%s' % video_id, video_id) - youku_vcode = self._search_regex( - r'vcode\s*:\s*[\'"]([^\'"]*)[\'"]', webpage, 'youku vcode', default=None) + youku_vcode = item_data.get('vcode') if youku_vcode: return self.url_result('youku:' + youku_vcode, ie='Youku') - title = self._search_regex( - r',kw\s*:\s*[\'"]([^\'"]+)[\'"]', webpage, 'title') - thumbnail_url = self._search_regex( - r',pic\s*:\s*[\'"]([^\'"]+)[\'"]', webpage, 'thumbnail URL', fatal=False) - - player_url = self._search_regex( - r'playerUrl\s*:\s*[\'"]([^\'"]+\.swf)[\'"]', - webpage, 'player URL', default=self._PLAYER_URL) + title = unescapeHTML(item_data['kw']) + description = item_data.get('desc') + thumbnail_url = item_data.get('pic') + view_count = int_or_none(item_data.get('playTimes')) + timestamp = int_or_none(item_data.get('pt')) - segments = self._parse_json(self._search_regex( - r'segs: \'([^\']+)\'', webpage, 'segments'), video_id) + segments = self._parse_json(item_data['itemSegs'], video_id) # It looks like the keys are the arguments that have to be passed as # the hd field in the request url, we pick the higher # Also, filter non-number qualities (see issue #3643). @@ -80,8 +89,13 @@ class TudouIE(InfoExtractor): 'ext': ext, 'title': title, 'thumbnail': thumbnail_url, + 'description': description, + 'view_count': view_count, + 'timestamp': timestamp, + 'duration': float_or_none(part.get('seconds'), 1000), + 'filesize': int_or_none(part.get('size')), 'http_headers': { - 'Referer': player_url, + 'Referer': self._PLAYER_URL, }, } result.append(part_info) @@ -92,3 +106,47 @@ class TudouIE(InfoExtractor): 'id': video_id, 'title': title, } + + +class TudouPlaylistIE(InfoExtractor): + IE_NAME = 'tudou:playlist' + _VALID_URL = r'https?://(?:www\.)?tudou\.com/listplay/(?P<id>[\w-]{11})\.html' + _TESTS = [{ + 'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo.html', + 'info_dict': { + 'id': 'zzdE77v6Mmo', + }, + 'playlist_mincount': 209, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + playlist_data = self._download_json( + 'http://www.tudou.com/tvp/plist.action?lcode=%s' % playlist_id, playlist_id) + entries = [self.url_result( + 'http://www.tudou.com/programs/view/%s' % item['icode'], + 'Tudou', item['icode'], + item['kw']) for item in playlist_data['items']] + return self.playlist_result(entries, playlist_id) + + +class TudouAlbumIE(InfoExtractor): + IE_NAME = 'tudou:album' + _VALID_URL = r'https?://(?:www\.)?tudou\.com/album(?:cover|play)/(?P<id>[\w-]{11})' + _TESTS = [{ + 'url': 'http://www.tudou.com/albumplay/v5qckFJvNJg.html', + 'info_dict': { + 'id': 'v5qckFJvNJg', + }, + 'playlist_mincount': 45, + }] + + def _real_extract(self, url): + album_id = self._match_id(url) + album_data = self._download_json( + 'http://www.tudou.com/tvp/alist.action?acode=%s' % album_id, album_id) + entries = [self.url_result( + 'http://www.tudou.com/programs/view/%s' % item['icode'], + 'Tudou', item['icode'], + item['kw']) for item in album_data['items']] + return self.playlist_result(entries, album_id) diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index 1c4b6d635..343edf206 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -67,7 +67,7 @@ class TV4IE(InfoExtractor): info = self._download_json( 'http://www.tv4play.se/player/assets/%s.json' % video_id, video_id, 'Downloading video info JSON') - # If is_geo_restricted is true, it doesn't neceserally mean we can't download it + # If is_geo_restricted is true, it doesn't necessarily mean we can't download it if info['is_geo_restricted']: self.report_warning('This content might not be available in your country due to licensing restrictions.') if info['requires_subscription']: diff --git a/youtube_dl/extractor/twentymin.py b/youtube_dl/extractor/twentymin.py new file mode 100644 index 000000000..ca7d953b8 --- /dev/null +++ b/youtube_dl/extractor/twentymin.py @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import remove_end + + +class TwentyMinutenIE(InfoExtractor): + IE_NAME = '20min' + _VALID_URL = r'https?://(?:www\.)?20min\.ch/(?:videotv/*\?.*\bvid=(?P<id>\d+)|(?:[^/]+/)*(?P<display_id>[^/#?]+))' + _TESTS = [{ + # regular video + 'url': 'http://www.20min.ch/videotv/?vid=469148&cid=2', + 'md5': 'b52d6bc6ea6398e6a38f12cfd418149c', + 'info_dict': { + 'id': '469148', + 'ext': 'flv', + 'title': '85 000 Franken für 15 perfekte Minuten', + 'description': 'Was die Besucher vom Silvesterzauber erwarten können. (Video: Alice Grosjean/Murat Temel)', + 'thumbnail': 'http://thumbnails.20min-tv.ch/server063/469148/frame-72-469148.jpg' + } + }, { + # news article with video + 'url': 'http://www.20min.ch/schweiz/news/story/-Wir-muessen-mutig-nach-vorne-schauen--22050469', + 'md5': 'cd4cbb99b94130cff423e967cd275e5e', + 'info_dict': { + 'id': '469408', + 'display_id': '-Wir-muessen-mutig-nach-vorne-schauen--22050469', + 'ext': 'flv', + 'title': '«Wir müssen mutig nach vorne schauen»', + 'description': 'Kein Land sei innovativer als die Schweiz, sagte Johann Schneider-Ammann in seiner Neujahrsansprache. Das Land müsse aber seine Hausaufgaben machen.', + 'thumbnail': 'http://www.20min.ch/images/content/2/2/0/22050469/10/teaserbreit.jpg' + } + }, { + 'url': 'http://www.20min.ch/videotv/?cid=44&vid=468738', + 'only_matching': True, + }, { + 'url': 'http://www.20min.ch/ro/sortir/cinema/story/Grandir-au-bahut--c-est-dur-18927411', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) + + title = self._html_search_regex( + r'<h1>.*?<span>(.+?)</span></h1>', + webpage, 'title', default=None) + if not title: + title = remove_end(re.sub( + r'^20 [Mm]inuten.*? -', '', self._og_search_title(webpage)), ' - News') + + if not video_id: + video_id = self._search_regex( + r'"file\d?"\s*,\s*\"(\d+)', webpage, 'video id') + + description = self._html_search_meta( + 'description', webpage, 'description') + thumbnail = self._og_search_thumbnail(webpage) + + return { + 'id': video_id, + 'display_id': display_id, + 'url': 'http://speed.20min-tv.ch/%sm.flv' % video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 1df636779..f5b5e7fd6 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -11,6 +11,7 @@ from ..utils import ( float_or_none, int_or_none, sanitized_Request, + unescapeHTML, ) @@ -19,8 +20,6 @@ class UdemyIE(InfoExtractor): _VALID_URL = r'https?://www\.udemy\.com/(?:[^#]+#/lecture/|lecture/view/?\?lectureId=)(?P<id>\d+)' _LOGIN_URL = 'https://www.udemy.com/join/login-popup/?displayType=ajax&showSkipButton=1' _ORIGIN_URL = 'https://www.udemy.com' - _SUCCESSFULLY_ENROLLED = '>You have enrolled in this course!<' - _ALREADY_ENROLLED = '>You are already taking this course.<' _NETRC_MACHINE = 'udemy' _TESTS = [{ @@ -37,15 +36,21 @@ class UdemyIE(InfoExtractor): }] def _enroll_course(self, webpage, course_id): - enroll_url = self._search_regex( + checkout_url = unescapeHTML(self._search_regex( + r'href=(["\'])(?P<url>https?://(?:www\.)?udemy\.com/payment/checkout/.+?)\1', + webpage, 'checkout url', group='url', default=None)) + if checkout_url: + raise ExtractorError( + 'Course %s is not free. You have to pay for it before you can download. ' + 'Use this URL to confirm purchase: %s' % (course_id, checkout_url), expected=True) + + enroll_url = unescapeHTML(self._search_regex( r'href=(["\'])(?P<url>https?://(?:www\.)?udemy\.com/course/subscribe/.+?)\1', - webpage, 'enroll url', group='url', - default='https://www.udemy.com/course/subscribe/?courseId=%s' % course_id) - webpage = self._download_webpage(enroll_url, course_id, 'Enrolling in the course') - if self._SUCCESSFULLY_ENROLLED in webpage: - self.to_screen('%s: Successfully enrolled in' % course_id) - elif self._ALREADY_ENROLLED in webpage: - self.to_screen('%s: Already enrolled in' % course_id) + webpage, 'enroll url', group='url', default=None)) + if enroll_url: + webpage = self._download_webpage(enroll_url, course_id, 'Enrolling in the course') + if '>You have enrolled in' in webpage: + self.to_screen('%s: Successfully enrolled in the course' % course_id) def _download_lecture(self, course_id, lecture_id): return self._download_json( diff --git a/youtube_dl/extractor/unistra.py b/youtube_dl/extractor/unistra.py index f70978299..594bee4f9 100644 --- a/youtube_dl/extractor/unistra.py +++ b/youtube_dl/extractor/unistra.py @@ -38,7 +38,7 @@ class UnistraIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - files = set(re.findall(r'file\s*:\s*"([^"]+)"', webpage)) + files = set(re.findall(r'file\s*:\s*"(/[^"]+)"', webpage)) quality = qualities(['SD', 'HD']) formats = [] diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index 73b05ecab..b5fe753d7 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -47,7 +47,7 @@ class UstreamIE(InfoExtractor): m = re.match(self._VALID_URL, url) video_id = m.group('id') - # some sites use this embed format (see: http://github.com/rg3/youtube-dl/issues/2990) + # some sites use this embed format (see: https://github.com/rg3/youtube-dl/issues/2990) if m.group('type') == 'embed/recorded': video_id = m.group('id') desktop_url = 'http://www.ustream.tv/recorded/' + video_id diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py index 87aca327b..5e2e7cbac 100644 --- a/youtube_dl/extractor/videomega.py +++ b/youtube_dl/extractor/videomega.py @@ -8,6 +8,7 @@ from ..utils import sanitized_Request class VideoMegaIE(InfoExtractor): + _WORKING = False _VALID_URL = r'(?:videomega:|https?://(?:www\.)?videomega\.tv/(?:(?:view|iframe|cdn)\.php)?\?ref=)(?P<id>[A-Za-z0-9]+)' _TESTS = [{ 'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA', diff --git a/youtube_dl/extractor/videomore.py b/youtube_dl/extractor/videomore.py index a66d6de23..fcee940e6 100644 --- a/youtube_dl/extractor/videomore.py +++ b/youtube_dl/extractor/videomore.py @@ -170,7 +170,7 @@ class VideomoreVideoIE(InfoExtractor): 'skip_download': True, }, }, { - # season single serie with og:video:iframe + # season single series with og:video:iframe 'url': 'http://videomore.ru/poslednii_ment/1_sezon/14_seriya', 'only_matching': True, }, { diff --git a/youtube_dl/extractor/videott.py b/youtube_dl/extractor/videott.py index 591024ead..2cd36508a 100644 --- a/youtube_dl/extractor/videott.py +++ b/youtube_dl/extractor/videott.py @@ -11,6 +11,7 @@ from ..utils import ( class VideoTtIE(InfoExtractor): + _WORKING = False ID_NAME = 'video.tt' IE_DESC = 'video.tt - Your True Tube' _VALID_URL = r'http://(?:www\.)?video\.tt/(?:(?:video|embed)/|watch_video\.php\?v=)(?P<id>[\da-zA-Z]{9})' diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index 185b1c119..fe94a4793 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -155,10 +155,10 @@ class ViewsterIE(InfoExtractor): self._sort_formats(formats) - synopsis = info.get('Synopsis', {}) + synopsis = info.get('Synopsis') or {} # Prefer title outside synopsis since it's less messy title = (info.get('Title') or synopsis['Title']).strip() - description = synopsis.get('Detailed') or info.get('Synopsis', {}).get('Short') + description = synopsis.get('Detailed') or (info.get('Synopsis') or {}).get('Short') duration = int_or_none(info.get('Duration')) timestamp = parse_iso8601(info.get('ReleaseDate')) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 7af699982..2389e7f0f 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -11,6 +11,7 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( + determine_ext, encode_dict, ExtractorError, InAdvancePagedList, @@ -208,6 +209,11 @@ class VimeoIE(VimeoBaseInfoExtractor): 'url': 'https://vimeo.com/groups/travelhd/videos/22439234', 'only_matching': True, }, + { + # source file returns 403: Forbidden + 'url': 'https://vimeo.com/7809605', + 'only_matching': True, + }, ] @staticmethod @@ -217,7 +223,7 @@ class VimeoIE(VimeoBaseInfoExtractor): r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage) if mobj: player_url = unescapeHTML(mobj.group('url')) - surl = smuggle_url(player_url, {'Referer': url}) + surl = smuggle_url(player_url, {'http_headers': {'Referer': url}}) return surl # Look for embedded (swf embed) Vimeo player mobj = re.search( @@ -262,11 +268,11 @@ class VimeoIE(VimeoBaseInfoExtractor): self._login() def _real_extract(self, url): - url, data = unsmuggle_url(url) + url, data = unsmuggle_url(url, {}) headers = std_headers - if data is not None: + if 'http_headers' in data: headers = headers.copy() - headers.update(data) + headers.update(data['http_headers']) if 'Referer' not in headers: headers['Referer'] = url @@ -342,7 +348,7 @@ class VimeoIE(VimeoBaseInfoExtractor): raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option') if re.search(r'<form[^>]+?id="pw_form"', webpage) is not None: - if data and '_video_password_verified' in data: + if '_video_password_verified' in data: raise ExtractorError('video password verification failed!') self._verify_video_password(url, video_id, webpage) return self._real_extract( @@ -354,6 +360,13 @@ class VimeoIE(VimeoBaseInfoExtractor): if config.get('view') == 4: config = self._verify_player_video_password(url, video_id) + if '>You rented this title.<' in webpage: + feature_id = config.get('video', {}).get('vod', {}).get('feature_id') + if feature_id and not data.get('force_feature_id', False): + return self.url_result(smuggle_url( + 'https://player.vimeo.com/player/%s' % feature_id, + {'force_feature_id': True}), 'Vimeo') + # Extract title video_title = config["video"]["title"] @@ -412,16 +425,21 @@ class VimeoIE(VimeoBaseInfoExtractor): download_data = self._download_json(download_request, video_id, fatal=False) if download_data: source_file = download_data.get('source_file') - if source_file and not source_file.get('is_cold') and not source_file.get('is_defrosting'): - formats.append({ - 'url': source_file['download_url'], - 'ext': source_file['extension'].lower(), - 'width': int_or_none(source_file.get('width')), - 'height': int_or_none(source_file.get('height')), - 'filesize': parse_filesize(source_file.get('size')), - 'format_id': source_file.get('public_name', 'Original'), - 'preference': 1, - }) + if isinstance(source_file, dict): + download_url = source_file.get('download_url') + if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'): + source_name = source_file.get('public_name', 'Original') + if self._is_valid_url(download_url, video_id, '%s video' % source_name): + ext = source_file.get('extension', determine_ext(download_url)).lower() + formats.append({ + 'url': download_url, + 'ext': ext, + 'width': int_or_none(source_file.get('width')), + 'height': int_or_none(source_file.get('height')), + 'filesize': parse_filesize(source_file.get('size')), + 'format_id': source_name, + 'preference': 1, + }) config_files = config['video'].get('files') or config['request'].get('files', {}) for f in config_files.get('progressive', []): video_url = f.get('url') diff --git a/youtube_dl/extractor/vodlocker.py b/youtube_dl/extractor/vodlocker.py index 357594a11..a97995a6d 100644 --- a/youtube_dl/extractor/vodlocker.py +++ b/youtube_dl/extractor/vodlocker.py @@ -5,12 +5,13 @@ from .common import InfoExtractor from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, + NO_DEFAULT, sanitized_Request, ) class VodlockerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vodlocker\.com/(?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:\..*?)?' + _VALID_URL = r'https?://(?:www\.)?vodlocker\.(?:com|city)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:\..*?)?' _TESTS = [{ 'url': 'http://vodlocker.com/e8wvyzz4sl42', @@ -43,16 +44,31 @@ class VodlockerIE(InfoExtractor): webpage = self._download_webpage( req, video_id, 'Downloading video page') + def extract_file_url(html, default=NO_DEFAULT): + return self._search_regex( + r'file:\s*"(http[^\"]+)",', html, 'file url', default=default) + + video_url = extract_file_url(webpage, default=None) + + if not video_url: + embed_url = self._search_regex( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?vodlocker\.(?:com|city)/embed-.+?)\1', + webpage, 'embed url', group='url') + embed_webpage = self._download_webpage( + embed_url, video_id, 'Downloading embed webpage') + video_url = extract_file_url(embed_webpage) + thumbnail_webpage = embed_webpage + else: + thumbnail_webpage = webpage + title = self._search_regex( r'id="file_title".*?>\s*(.*?)\s*<(?:br|span)', webpage, 'title') thumbnail = self._search_regex( - r'image:\s*"(http[^\"]+)",', webpage, 'thumbnail') - url = self._search_regex( - r'file:\s*"(http[^\"]+)",', webpage, 'file url') + r'image:\s*"(http[^\"]+)",', thumbnail_webpage, 'thumbnail', fatal=False) formats = [{ 'format_id': 'sd', - 'url': url, + 'url': video_url, }] return { diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py index bbd3bbf7b..01891ac4c 100644 --- a/youtube_dl/extractor/vrt.py +++ b/youtube_dl/extractor/vrt.py @@ -8,7 +8,7 @@ from ..utils import float_or_none class VRTIE(InfoExtractor): - _VALID_URL = r'https?://(?:deredactie|sporza|cobra)\.be/cm/(?:[^/]+/)+(?P<id>[^/]+)/*' + _VALID_URL = r'https?://(?:deredactie|sporza|cobra(?:\.canvas)?)\.be/cm/(?:[^/]+/)+(?P<id>[^/]+)/*' _TESTS = [ # deredactie.be { @@ -52,6 +52,10 @@ class VRTIE(InfoExtractor): 'duration': 661, } }, + { + 'url': 'http://cobra.canvas.be/cm/cobra/videozone/rubriek/film-videozone/1.2377055', + 'only_matching': True, + } ] def _real_extract(self, url): @@ -69,11 +73,11 @@ class VRTIE(InfoExtractor): if mobj: formats.extend(self._extract_m3u8_formats( '%s/%s' % (mobj.group('server'), mobj.group('path')), - video_id, 'mp4')) + video_id, 'mp4', m3u8_id='hls')) mobj = re.search(r'data-video-src="(?P<src>[^"]+)"', webpage) if mobj: formats.extend(self._extract_f4m_formats( - '%s/manifest.f4m' % mobj.group('src'), video_id)) + '%s/manifest.f4m' % mobj.group('src'), video_id, f4m_id='hds')) self._sort_formats(formats) title = self._og_search_title(webpage) diff --git a/youtube_dl/extractor/weiqitv.py b/youtube_dl/extractor/weiqitv.py new file mode 100644 index 000000000..e333ae345 --- /dev/null +++ b/youtube_dl/extractor/weiqitv.py @@ -0,0 +1,52 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class WeiqiTVIE(InfoExtractor): + IE_DESC = 'WQTV' + _VALID_URL = r'http://www\.weiqitv\.com/index/video_play\?videoId=(?P<id>[A-Za-z0-9]+)' + + _TESTS = [{ + 'url': 'http://www.weiqitv.com/index/video_play?videoId=53c744f09874f0e76a8b46f3', + 'md5': '26450599afd64c513bc77030ad15db44', + 'info_dict': { + 'id': '53c744f09874f0e76a8b46f3', + 'ext': 'mp4', + 'title': '2013年度盘点', + }, + }, { + 'url': 'http://www.weiqitv.com/index/video_play?videoId=567379a2d4c36cca518b4569', + 'info_dict': { + 'id': '567379a2d4c36cca518b4569', + 'ext': 'mp4', + 'title': '民国围棋史', + }, + }, { + 'url': 'http://www.weiqitv.com/index/video_play?videoId=5430220a9874f088658b4567', + 'info_dict': { + 'id': '5430220a9874f088658b4567', + 'ext': 'mp4', + 'title': '二路托过的手段和运用', + }, + }] + + def _real_extract(self, url): + media_id = self._match_id(url) + page = self._download_webpage(url, media_id) + + info_json_str = self._search_regex( + 'var\s+video\s*=\s*(.+});', page, 'info json str') + info_json = self._parse_json(info_json_str, media_id) + + letvcloud_url = self._search_regex( + 'var\s+letvurl\s*=\s*"([^"]+)', page, 'letvcloud url') + + return { + '_type': 'url_transparent', + 'ie_key': 'LetvCloud', + 'url': letvcloud_url, + 'title': info_json['name'], + 'id': media_id, + } diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 8938c0e45..fd43e8854 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -4,10 +4,9 @@ import re from .common import InfoExtractor from ..utils import ( - unified_strdate, - str_to_int, + float_or_none, int_or_none, - parse_duration, + unified_strdate, ) @@ -22,7 +21,7 @@ class XHamsterIE(InfoExtractor): 'title': 'FemaleAgent Shy beauty takes the bait', 'upload_date': '20121014', 'uploader': 'Ruseful2011', - 'duration': 893, + 'duration': 893.52, 'age_limit': 18, } }, @@ -34,7 +33,7 @@ class XHamsterIE(InfoExtractor): 'title': 'Britney Spears Sexy Booty', 'upload_date': '20130914', 'uploader': 'jojo747400', - 'duration': 200, + 'duration': 200.48, 'age_limit': 18, } }, @@ -64,20 +63,21 @@ class XHamsterIE(InfoExtractor): webpage = self._download_webpage(mrss_url, video_id) title = self._html_search_regex( - [r'<title>(?P<title>.+?)(?:, (?:[^,]+? )?Porn: xHamster| - xHamster\.com)</title>', - r'<h1>([^<]+)</h1>'], webpage, 'title') + [r'<h1[^>]*>([^<]+)</h1>', + r'<meta[^>]+itemprop=".*?caption.*?"[^>]+content="(.+?)"', + r'<title[^>]*>(.+?)(?:,\s*[^,]*?\s*Porn\s*[^,]*?:\s*xHamster[^<]*| - xHamster\.com)</title>'], + webpage, 'title') # Only a few videos have an description mobj = re.search(r'<span>Description: </span>([^<]+)', webpage) description = mobj.group(1) if mobj else None - upload_date = self._html_search_regex(r'hint=\'(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}:\d{2} [A-Z]{3,4}\'', - webpage, 'upload date', fatal=False) - if upload_date: - upload_date = unified_strdate(upload_date) + upload_date = unified_strdate(self._search_regex( + r'hint=["\'](\d{4}-\d{2}-\d{2}) \d{2}:\d{2}:\d{2} [A-Z]{3,4}', + webpage, 'upload date', fatal=False)) uploader = self._html_search_regex( - r"<a href='[^']+xhamster\.com/user/[^>]+>(?P<uploader>[^<]+)", + r'<span[^>]+itemprop=["\']author[^>]+><a[^>]+href=["\'].+?xhamster\.com/user/[^>]+>(?P<uploader>.+?)</a>', webpage, 'uploader', default='anonymous') thumbnail = self._search_regex( @@ -85,12 +85,13 @@ class XHamsterIE(InfoExtractor): r'''<video[^>]+poster=(?P<q>["'])(?P<thumbnail>.+?)(?P=q)[^>]*>'''], webpage, 'thumbnail', fatal=False, group='thumbnail') - duration = parse_duration(self._html_search_regex(r'<span>Runtime:</span> (\d+:\d+)</div>', - webpage, 'duration', fatal=False)) + duration = float_or_none(self._search_regex( + r'(["\'])duration\1\s*:\s*(["\'])(?P<duration>.+?)\2', + webpage, 'duration', fatal=False, group='duration')) - view_count = self._html_search_regex(r'<span>Views:</span> ([^<]+)</div>', webpage, 'view count', fatal=False) - if view_count: - view_count = str_to_int(view_count) + view_count = int_or_none(self._search_regex( + r'content=["\']User(?:View|Play)s:(\d+)', + webpage, 'view count', fatal=False)) mobj = re.search(r"hint='(?P<likecount>\d+) Likes / (?P<dislikecount>\d+) Dislikes'", webpage) (like_count, dislike_count) = (mobj.group('likecount'), mobj.group('dislikecount')) if mobj else (None, None) diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index a1fe24050..8cd3a0687 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -1,10 +1,12 @@ from __future__ import unicode_literals +import itertools import re from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote from ..utils import ( + int_or_none, parse_duration, sanitized_Request, str_to_int, @@ -12,7 +14,7 @@ from ..utils import ( class XTubeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<id>[^/?&#]+))' + _VALID_URL = r'(?:xtube:|https?://(?:www\.)?xtube\.com/watch\.php\?.*\bv=)(?P<id>[^/?&#]+)' _TEST = { 'url': 'http://www.xtube.com/watch.php?v=kVTUy_G222_', 'md5': '092fbdd3cbe292c920ef6fc6a8a9cdab', @@ -30,7 +32,7 @@ class XTubeIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - req = sanitized_Request(url) + req = sanitized_Request('http://www.xtube.com/watch.php?v=%s' % video_id) req.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(req, video_id) @@ -88,45 +90,43 @@ class XTubeIE(InfoExtractor): class XTubeUserIE(InfoExtractor): IE_DESC = 'XTube user profile' - _VALID_URL = r'https?://(?:www\.)?xtube\.com/community/profile\.php\?(.*?)user=(?P<username>[^&#]+)(?:$|[&#])' + _VALID_URL = r'https?://(?:www\.)?xtube\.com/profile/(?P<id>[^/]+-\d+)' _TEST = { - 'url': 'http://www.xtube.com/community/profile.php?user=greenshowers', + 'url': 'http://www.xtube.com/profile/greenshowers-4056496', 'info_dict': { - 'id': 'greenshowers', + 'id': 'greenshowers-4056496', 'age_limit': 18, }, 'playlist_mincount': 155, } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - username = mobj.group('username') - - profile_page = self._download_webpage( - url, username, note='Retrieving profile page') - - video_count = int(self._search_regex( - r'<strong>%s\'s Videos \(([0-9]+)\)</strong>' % username, profile_page, - 'video count')) - - PAGE_SIZE = 25 - urls = [] - page_count = (video_count + PAGE_SIZE + 1) // PAGE_SIZE - for n in range(1, page_count + 1): - lpage_url = 'http://www.xtube.com/user_videos.php?page=%d&u=%s' % (n, username) - lpage = self._download_webpage( - lpage_url, username, - note='Downloading page %d/%d' % (n, page_count)) - urls.extend( - re.findall(r'addthis:url="([^"]+)"', lpage)) - - return { - '_type': 'playlist', - 'id': username, - 'age_limit': 18, - 'entries': [{ - '_type': 'url', - 'url': eurl, - 'ie_key': 'XTube', - } for eurl in urls] - } + user_id = self._match_id(url) + + entries = [] + for pagenum in itertools.count(1): + request = sanitized_Request( + 'http://www.xtube.com/profile/%s/videos/%d' % (user_id, pagenum), + headers={ + 'Cookie': 'popunder=4', + 'X-Requested-With': 'XMLHttpRequest', + 'Referer': url, + }) + + page = self._download_json( + request, user_id, 'Downloading videos JSON page %d' % pagenum) + + html = page.get('html') + if not html: + break + + for _, video_id in re.findall(r'data-plid=(["\'])(.+?)\1', html): + entries.append(self.url_result('xtube:%s' % video_id, XTubeIE.ie_key())) + + page_count = int_or_none(page.get('pageCount')) + if not page_count or pagenum == page_count: + break + + playlist = self.playlist_result(entries, user_id) + playlist['age_limit'] = 18 + return playlist diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 4a492f784..4c6142927 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -221,6 +221,8 @@ class YahooIE(InfoExtractor): r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"', r'"first_videoid"\s*:\s*"([^"]+)"', r'%s[^}]*"ccm_id"\s*:\s*"([^"]+)"' % re.escape(page_id), + r'<article[^>]data-uuid=["\']([^"\']+)', + r'yahoo://article/view\?.*\buuid=([^&"\']+)', ] video_id = self._search_regex( CONTENT_ID_REGEXES, webpage, 'content ID') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 64386f34a..92b9f3ae4 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -32,6 +32,7 @@ from ..utils import ( get_element_by_attribute, get_element_by_id, int_or_none, + mimetype2ext, orderedSet, parse_duration, remove_quotes, @@ -613,7 +614,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, 'params': { 'skip_download': 'requires avconv', - } + }, + 'skip': 'This live event has ended.', }, # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097) { @@ -706,6 +708,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, { # Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468) + # Also tests cut-off URL expansion in video description (see + # https://github.com/rg3/youtube-dl/issues/1892, + # https://github.com/rg3/youtube-dl/issues/8164) 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg', 'info_dict': { 'id': 'lsguqyKfVQg', @@ -960,6 +965,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): try: args = player_config['args'] caption_url = args['ttsurl'] + if not caption_url: + self._downloader.report_warning(err_msg) + return {} timestamp = args['timestamp'] # We get the available subtitles list_params = compat_urllib_parse.urlencode({ @@ -1083,9 +1091,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): full_info.update(f) codecs = r.attrib.get('codecs') if codecs: - if full_info.get('acodec') == 'none' and 'vcodec' not in full_info: + if full_info.get('acodec') == 'none': full_info['vcodec'] = codecs - elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info: + elif full_info.get('vcodec') == 'none': full_info['acodec'] = codecs formats.append(full_info) else: @@ -1235,10 +1243,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_description = re.sub(r'''(?x) <a\s+ (?:[a-zA-Z-]+="[^"]+"\s+)*? - title="([^"]+)"\s+ + (?:title|href)="([^"]+)"\s+ (?:[a-zA-Z-]+="[^"]+"\s+)*? - class="yt-uix-redirect-link"\s*> - [^<]+ + class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)"[^>]*> + [^<]+\.{3}\s* </a> ''', r'\1', video_description) video_description = clean_html(video_description) @@ -1454,15 +1462,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if 'ratebypass' not in url: url += '&ratebypass=yes' + dct = { + 'format_id': format_id, + 'url': url, + 'player_url': player_url, + } + if format_id in self._formats: + dct.update(self._formats[format_id]) + # Some itags are not included in DASH manifest thus corresponding formats will # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993). # Trying to extract metadata from url_encoded_fmt_stream_map entry. mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0]) width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) - dct = { - 'format_id': format_id, - 'url': url, - 'player_url': player_url, + + more_fields = { 'filesize': int_or_none(url_data.get('clen', [None])[0]), 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000), 'width': width, @@ -1470,13 +1484,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'fps': int_or_none(url_data.get('fps', [None])[0]), 'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0], } + for key, value in more_fields.items(): + if value: + dct[key] = value type_ = url_data.get('type', [None])[0] if type_: type_split = type_.split(';') kind_ext = type_split[0].split('/') if len(kind_ext) == 2: - kind, ext = kind_ext - dct['ext'] = ext + kind, _ = kind_ext + dct['ext'] = mimetype2ext(type_split[0]) if kind in ('audio', 'video'): codecs = None for mobj in re.finditer( @@ -1487,15 +1504,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if codecs: codecs = codecs.split(',') if len(codecs) == 2: - acodec, vcodec = codecs[0], codecs[1] + acodec, vcodec = codecs[1], codecs[0] else: acodec, vcodec = (codecs[0], 'none') if kind == 'audio' else ('none', codecs[0]) dct.update({ 'acodec': acodec, 'vcodec': vcodec, }) - if format_id in self._formats: - dct.update(self._formats[format_id]) formats.append(dct) elif video_info.get('hlsvp'): manifest_url = video_info['hlsvp'][0] @@ -1505,6 +1520,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): for a_format in formats: a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' else: + unavailable_message = self._html_search_regex( + r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>', + video_webpage, 'unavailable message', default=None) + if unavailable_message: + raise ExtractorError(unavailable_message, expected=True) raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') # Look for the DASH manifest diff --git a/youtube_dl/extractor/zippcast.py b/youtube_dl/extractor/zippcast.py new file mode 100644 index 000000000..de819376d --- /dev/null +++ b/youtube_dl/extractor/zippcast.py @@ -0,0 +1,94 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + str_to_int, +) + + +class ZippCastIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?zippcast\.com/(?:video/|videoview\.php\?.*\bvplay=)(?P<id>[0-9a-zA-Z]+)' + _TESTS = [{ + # m3u8, hq direct link + 'url': 'http://www.zippcast.com/video/c9cfd5c7e44dbc29c81', + 'md5': '5ea0263b5606866c4d6cda0fc5e8c6b6', + 'info_dict': { + 'id': 'c9cfd5c7e44dbc29c81', + 'ext': 'mp4', + 'title': '[Vinesauce] Vinny - Digital Space Traveler', + 'description': 'Muted on youtube, but now uploaded in it\'s original form.', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'vinesauce', + 'view_count': int, + 'categories': ['Entertainment'], + 'tags': list, + }, + }, { + # f4m, lq ipod direct link + 'url': 'http://www.zippcast.com/video/b79c0a233e9c6581775', + 'only_matching': True, + }, { + 'url': 'http://www.zippcast.com/videoview.php?vplay=c9cfd5c7e44dbc29c81&auto=no', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://www.zippcast.com/video/%s' % video_id, video_id) + + formats = [] + video_url = self._search_regex( + r'<source[^>]+src=(["\'])(?P<url>.+?)\1', webpage, + 'video url', default=None, group='url') + if video_url: + formats.append({ + 'url': video_url, + 'format_id': 'http', + 'preference': 0, # direct link is almost always of worse quality + }) + src_url = self._search_regex( + r'src\s*:\s*(?:escape\()?(["\'])(?P<url>http://.+?)\1', + webpage, 'src', default=None, group='url') + ext = determine_ext(src_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + src_url, video_id, f4m_id='hds', fatal=False)) + self._sort_formats(formats) + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) or self._html_search_meta( + 'description', webpage) + uploader = self._search_regex( + r'<a[^>]+href="https?://[^/]+/profile/[^>]+>([^<]+)</a>', + webpage, 'uploader', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) + view_count = str_to_int(self._search_regex( + r'>([\d,.]+) views!', webpage, 'view count', fatal=False)) + + categories = re.findall( + r'<a[^>]+href="https?://[^/]+/categories/[^"]+">([^<]+),?<', + webpage) + tags = re.findall( + r'<a[^>]+href="https?://[^/]+/search/tags/[^"]+">([^<]+),?<', + webpage) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'view_count': view_count, + 'categories': categories, + 'tags': tags, + 'formats': formats, + } diff --git a/youtube_dl/options.py b/youtube_dl/options.py index ade58c375..433245f00 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -380,7 +380,7 @@ def parseOpts(overrideArguments=None): '--sub-lang', '--sub-langs', '--srt-lang', action='callback', dest='subtitleslangs', metavar='LANGS', type='str', default=[], callback=_comma_separated_values_options_callback, - help='Languages of the subtitles to download (optional) separated by commas, use IETF language tags like \'en,pt\'') + help='Languages of the subtitles to download (optional) separated by commas, use --list-subs for available language tags') downloader = optparse.OptionGroup(parser, 'Download Options') downloader.add_option( diff --git a/youtube_dl/swfinterp.py b/youtube_dl/swfinterp.py index e60505ace..06c1d6cc1 100644 --- a/youtube_dl/swfinterp.py +++ b/youtube_dl/swfinterp.py @@ -689,7 +689,7 @@ class SWFInterpreter(object): elif mname in _builtin_classes: res = _builtin_classes[mname] else: - # Assume unitialized + # Assume uninitialized # TODO warn here res = undefined stack.append(res) diff --git a/youtube_dl/update.py b/youtube_dl/update.py index 995b8ed96..e4a1aaa64 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -15,33 +15,17 @@ from .version import __version__ def rsa_verify(message, signature, key): - from struct import pack from hashlib import sha256 - assert isinstance(message, bytes) - block_size = 0 - n = key[0] - while n: - block_size += 1 - n >>= 8 - signature = pow(int(signature, 16), key[1], key[0]) - raw_bytes = [] - while signature: - raw_bytes.insert(0, pack("B", signature & 0xFF)) - signature >>= 8 - signature = (block_size - len(raw_bytes)) * b'\x00' + b''.join(raw_bytes) - if signature[0:2] != b'\x00\x01': - return False - signature = signature[2:] - if b'\x00' not in signature: - return False - signature = signature[signature.index(b'\x00') + 1:] - if not signature.startswith(b'\x30\x31\x30\x0D\x06\x09\x60\x86\x48\x01\x65\x03\x04\x02\x01\x05\x00\x04\x20'): - return False - signature = signature[19:] - if signature != sha256(message).digest(): + byte_size = (len(bin(key[0])) - 2 + 8 - 1) // 8 + signature = ('%x' % pow(int(signature, 16), key[1], key[0])).encode() + signature = (byte_size * 2 - len(signature)) * b'0' + signature + asn1 = b'3031300d060960864801650304020105000420' + asn1 += sha256(message).hexdigest().encode() + if byte_size < len(asn1) // 2 + 11: return False - return True + expected = b'0001' + (byte_size - len(asn1) // 2 - 3) * b'ff' + b'00' + asn1 + return expected == signature def update_self(to_screen, verbose, opener): diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 0ed6c45c8..c63b61598 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -70,6 +70,21 @@ ENGLISH_MONTH_NAMES = [ 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'] +KNOWN_EXTENSIONS = ( + 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac', + 'flv', 'f4v', 'f4a', 'f4b', + 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus', + 'mkv', 'mka', 'mk3d', + 'avi', 'divx', + 'mov', + 'asf', 'wmv', 'wma', + '3gp', '3g2', + 'mp3', + 'flac', + 'ape', + 'wav', + 'f4f', 'f4m', 'm3u8', 'smil') + def preferredencoding(): """Get preferred encoding. @@ -942,20 +957,8 @@ def determine_ext(url, default_ext='unknown_video'): guess = url.partition('?')[0].rpartition('.')[2] if re.match(r'^[A-Za-z0-9]+$', guess): return guess - elif guess.rstrip('/') in ( - 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac', - 'flv', 'f4v', 'f4a', 'f4b', - 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus', - 'mkv', 'mka', 'mk3d', - 'avi', 'divx', - 'mov', - 'asf', 'wmv', 'wma', - '3gp', '3g2', - 'mp3', - 'flac', - 'ape', - 'wav', - 'f4f', 'f4m', 'm3u8', 'smil'): + # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download + elif guess.rstrip('/') in KNOWN_EXTENSIONS: return guess.rstrip('/') else: return default_ext @@ -981,7 +984,7 @@ def date_from_str(date_str): if sign == '-': time = -time unit = match.group('unit') - # A bad aproximation? + # A bad approximation? if unit == 'month': unit = 'day' time *= 30 @@ -1304,7 +1307,7 @@ def parse_filesize(s): if s is None: return None - # The lower-case forms are of course incorrect and inofficial, + # The lower-case forms are of course incorrect and unofficial, # but we support those too _UNIT_TABLE = { 'B': 1, @@ -1825,9 +1828,11 @@ def mimetype2ext(mt): _, _, res = mt.rpartition('/') return { - 'x-ms-wmv': 'wmv', - 'x-mp4-fragmented': 'mp4', + '3gpp': '3gp', 'ttml+xml': 'ttml', + 'x-flv': 'flv', + 'x-mp4-fragmented': 'mp4', + 'x-ms-wmv': 'wmv', }.get(res, res) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 790bd5b3b..d5bf73815 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.01.01' +__version__ = '2016.01.23' |