diff options
Diffstat (limited to 'youtube_dl/extractor')
97 files changed, 4455 insertions, 1836 deletions
diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py new file mode 100644 index 000000000..b61a6327c --- /dev/null +++ b/youtube_dl/extractor/abcnews.py @@ -0,0 +1,135 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import calendar +import re +import time + +from .amp import AMPIE +from .common import InfoExtractor +from ..compat import compat_urlparse + + +class AbcNewsVideoIE(AMPIE): + IE_NAME = 'abcnews:video' + _VALID_URL = 'http://abcnews.go.com/[^/]+/video/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)' + + _TESTS = [{ + 'url': 'http://abcnews.go.com/ThisWeek/video/week-exclusive-irans-foreign-minister-zarif-20411932', + 'info_dict': { + 'id': '20411932', + 'ext': 'mp4', + 'display_id': 'week-exclusive-irans-foreign-minister-zarif', + 'title': '\'This Week\' Exclusive: Iran\'s Foreign Minister Zarif', + 'description': 'George Stephanopoulos goes one-on-one with Iranian Foreign Minister Dr. Javad Zarif.', + 'duration': 180, + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('display_id') + video_id = mobj.group('id') + info_dict = self._extract_feed_info( + 'http://abcnews.go.com/video/itemfeed?id=%s' % video_id) + info_dict.update({ + 'id': video_id, + 'display_id': display_id, + }) + return info_dict + + +class AbcNewsIE(InfoExtractor): + IE_NAME = 'abcnews' + _VALID_URL = 'https?://abcnews\.go\.com/(?:[^/]+/)+(?P<display_id>[0-9a-z-]+)/story\?id=(?P<id>\d+)' + + _TESTS = [{ + 'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY', + 'info_dict': { + 'id': '10498713', + 'ext': 'flv', + 'display_id': 'dramatic-video-rare-death-job-america', + 'title': 'Occupational Hazards', + 'description': 'Nightline investigates the dangers that lurk at various jobs.', + 'thumbnail': 're:^https?://.*\.jpg$', + 'upload_date': '20100428', + 'timestamp': 1272412800, + }, + 'add_ie': ['AbcNewsVideo'], + }, { + 'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818', + 'info_dict': { + 'id': '39125818', + 'ext': 'mp4', + 'display_id': 'justin-timberlake-performs-stop-feeling-eurovision-2016', + 'title': 'Justin Timberlake Drops Hints For Secret Single', + 'description': 'Lara Spencer reports the buzziest stories of the day in "GMA" Pop News.', + 'upload_date': '20160515', + 'timestamp': 1463329500, + }, + 'params': { + # m3u8 download + 'skip_download': True, + # The embedded YouTube video is blocked due to copyright issues + 'playlist_items': '1', + }, + 'add_ie': ['AbcNewsVideo'], + }, { + 'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('display_id') + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + video_url = self._search_regex( + r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL') + full_video_url = compat_urlparse.urljoin(url, video_url) + + youtube_url = self._html_search_regex( + r'<iframe[^>]+src="(https://www\.youtube\.com/embed/[^"]+)"', + webpage, 'YouTube URL', default=None) + + timestamp = None + date_str = self._html_search_regex( + r'<span[^>]+class="timestamp">([^<]+)</span>', + webpage, 'timestamp', fatal=False) + if date_str: + tz_offset = 0 + if date_str.endswith(' ET'): # Eastern Time + tz_offset = -5 + date_str = date_str[:-3] + date_formats = ['%b. %d, %Y', '%b %d, %Y, %I:%M %p'] + for date_format in date_formats: + try: + timestamp = calendar.timegm(time.strptime(date_str.strip(), date_format)) + except ValueError: + continue + if timestamp is not None: + timestamp -= tz_offset * 3600 + + entry = { + '_type': 'url_transparent', + 'ie_key': AbcNewsVideoIE.ie_key(), + 'url': full_video_url, + 'id': video_id, + 'display_id': display_id, + 'timestamp': timestamp, + } + + if youtube_url: + entries = [entry, self.url_result(youtube_url, 'Youtube')] + return self.playlist_result(entries) + + return entry diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index 138fa0808..8545681be 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -52,7 +52,7 @@ class AMPIE(InfoExtractor): for media_data in media_content: media = media_data['@attributes'] media_type = media['type'] - if media_type == 'video/f4m': + if media_type in ('video/f4m', 'application/f4m+xml'): formats.extend(self._extract_f4m_formats( media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id, f4m_id='hds', fatal=False)) @@ -61,7 +61,7 @@ class AMPIE(InfoExtractor): media['url'], video_id, 'mp4', m3u8_id='hls', fatal=False)) else: formats.append({ - 'format_id': media_data['media-category']['@attributes']['label'], + 'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'), 'url': media['url'], 'tbr': int_or_none(media.get('bitrate')), 'filesize': int_or_none(media.get('fileSize')), diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py new file mode 100644 index 000000000..cb29cf111 --- /dev/null +++ b/youtube_dl/extractor/anvato.py @@ -0,0 +1,224 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import hashlib +import json +import random +import time + +from .common import InfoExtractor +from ..aes import aes_encrypt +from ..compat import compat_str +from ..utils import ( + bytes_to_intlist, + determine_ext, + intlist_to_bytes, + int_or_none, + strip_jsonp, +) + + +def md5_text(s): + if not isinstance(s, compat_str): + s = compat_str(s) + return hashlib.md5(s.encode('utf-8')).hexdigest() + + +class AnvatoIE(InfoExtractor): + # Copied from anvplayer.min.js + _ANVACK_TABLE = { + 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ', + 'nbcu_nbcd_desktop_web_qa_1a6f01bdd0dc45a439043b694c8a031d': 'eSxJUbA2UUKBTXryyQ2d6NuM8oEqaPySvaPzfKNA', + 'nbcu_nbcd_desktop_web_acc_eb2ff240a5d4ae9a63d4c297c32716b6c523a129': '89JR3RtUGbvKuuJIiKOMK0SoarLb5MUx8v89RcbP', + 'nbcu_nbcd_watchvod_web_prod_e61107507180976724ec8e8319fe24ba5b4b60e1': 'Uc7dFt7MJ9GsBWB5T7iPvLaMSOt8BBxv4hAXk5vv', + 'nbcu_nbcd_watchvod_web_qa_42afedba88a36203db5a4c09a5ba29d045302232': 'T12oDYVFP2IaFvxkmYMy5dKxswpLHtGZa4ZAXEi7', + 'nbcu_nbcd_watchvod_web_acc_9193214448e2e636b0ffb78abacfd9c4f937c6ca': 'MmobcxUxMedUpohNWwXaOnMjlbiyTOBLL6d46ZpR', + 'nbcu_local_monitor_web_acc_f998ad54eaf26acd8ee033eb36f39a7b791c6335': 'QvfIoPYrwsjUCcASiw3AIkVtQob2LtJHfidp9iWg', + 'nbcu_cable_monitor_web_acc_a413759603e8bedfcd3c61b14767796e17834077': 'uwVPJLShvJWSs6sWEIuVem7MTF8A4IknMMzIlFto', + 'nbcu_nbcd_mcpstage_web_qa_4c43a8f6e95a88dbb40276c0630ba9f693a63a4e': 'PxVYZVwjhgd5TeoPRxL3whssb5OUPnM3zyAzq8GY', + 'nbcu_comcast_comcast_web_prod_074080762ad4ce956b26b43fb22abf153443a8c4': 'afnaRZfDyg1Z3WZHdupKfy6xrbAG2MHqe3VfuSwh', + 'nbcu_comcast_comcast_web_qa_706103bb93ead3ef70b1de12a0e95e3c4481ade0': 'DcjsVbX9b3uoPlhdriIiovgFQZVxpISZwz0cx1ZK', + 'nbcu_comcast_comcastcable_web_prod_669f04817536743563d7331c9293e59fbdbe3d07': '0RwMN2cWy10qhAhOscq3eK7aEe0wqnKt3vJ0WS4D', + 'nbcu_comcast_comcastcable_web_qa_3d9d2d66219094127f0f6b09cc3c7bb076e3e1ca': '2r8G9DEya7PCqBceKZgrn2XkXgASjwLMuaFE1Aad', + 'hearst_hearst_demo_web_stage_960726dfef3337059a01a78816e43b29ec04dfc7': 'cuZBPXTR6kSdoTCVXwk5KGA8rk3NrgGn4H6e9Dsp', + 'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922': 'IOaaLQ8ymqVyem14QuAvE5SndQynTcH5CrLkU2Ih', + 'anvato_nextmedia_demo_web_stage_9787d56a02ff6b9f43e9a2b0920d8ca88beb5818': 'Pqu9zVzI1ApiIzbVA3VkGBEQHvdKSUuKpD6s2uaR', + 'anvato_scripps_app_web_prod_0837996dbe373629133857ae9eb72e740424d80a': 'du1ccmn7RxzgizwbWU7hyUaGodNlJn7HtXI0WgXW', + 'anvato_scripps_app_web_stage_360797e00fe2826be142155c4618cc52fce6c26c': '2PMrQ0BRoqCWl7nzphj0GouIMEh2mZYivAT0S1Su', + 'fs2go_fs2go_go_all_prod_21934911ccfafc03a075894ead2260d11e2ddd24': 'RcuHlKikW2IJw6HvVoEkqq2UsuEJlbEl11pWXs4Q', + 'fs2go_fs2go_go_web_prod_ead4b0eec7460c1a07783808db21b49cf1f2f9a7': '4K0HTT2u1zkQA2MaGaZmkLa1BthGSBdr7jllrhk5', + 'fs2go_fs2go_go_web_stage_407585454a4400355d4391691c67f361': 'ftnc37VKRJBmHfoGGi3kT05bHyeJzilEzhKJCyl3', + 'fs2go_fs2go_go_android_stage_44b714db6f8477f29afcba15a41e1d30': 'CtxpPvVpo6AbZGomYUhkKs7juHZwNml9b9J0J2gI', + 'anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67': 'Pw0XX5KBDsyRnPS0R2JrSrXftsy8Jnz5pAjaYC8s', + 'anvato_cbslocal_app_web_stage_547a5f096594cd3e00620c6f825cad1096d28c80': '37OBUhX2uwNyKhhrNzSSNHSRPZpApC3trdqDBpuz', + 'fs2go_att_att_web_prod_1042dddd089a05438b6a08f972941176f699ffd8': 'JLcF20JwYvpv6uAGcLWIaV12jKwaL1R8us4b6Zkg', + 'fs2go_att_att_web_stage_807c5001955fc114a3331fe027ddc76e': 'gbu1oO1y0JiOFh4SUipt86P288JHpyjSqolrrT1x', + 'fs2go_fs2go_tudor_web_prod_a7dd8e5a7cdc830cae55eae6f3e9fee5ee49eb9b': 'ipcp87VCEZXPPe868j3orLqzc03oTy7DXsGkAXXH', + 'anvato_mhz_app_web_prod_b808218b30de7fdf60340cbd9831512bc1bf6d37': 'Stlm5Gs6BEhJLRTZHcNquyzxGqr23EuFmE5DCgjX', + 'fs2go_charter_charter_web_stage_c2c6e5a68375a1bf00fff213d3ff8f61a835a54c': 'Lz4hbJp1fwL6jlcz4M2PMzghM4jp4aAmybtT5dPc', + 'fs2go_charter_charter_web_prod_ebfe3b10f1af215a7321cd3d629e0b81dfa6fa8c': 'vUJsK345A1bVmyYDRhZX0lqFIgVXuqhmuyp1EtPK', + 'anvato_epfox_app_web_prod_b3373168e12f423f41504f207000188daf88251b': 'GDKq1ixvX3MoBNdU5IOYmYa2DTUXYOozPjrCJnW7', + 'anvato_epfox_app_web_stage_a3c2ce60f8f83ef374a88b68ee73a950f8ab87ce': '2jz2NH4BsXMaDsoJ5qkHMbcczAfIReo2eFYuVC1C', + 'fs2go_verizon_verizon_web_stage_08e6df0354a4803f1b1f2428b5a9a382e8dbcd62': 'rKTVapNaAcmnUbGL4ZcuOoY4SE7VmZSQsblPFr7e', + 'fs2go_verizon_verizon_web_prod_f909564cb606eff1f731b5e22e0928676732c445': 'qLSUuHerM3u9eNPzaHyUK52obai5MvE4XDJfqYe1', + 'fs2go_foxcom_synd_web_stage_f7b9091f00ea25a4fdaaae77fca5b54cdc7e7043': '96VKF2vLd24fFiDfwPFpzM5llFN4TiIGAlodE0Re', + 'fs2go_foxcom_synd_web_prod_0f2cdd64d87e4ab6a1d54aada0ff7a7c8387a064': 'agiPjbXEyEZUkbuhcnmVPhe9NNVbDjCFq2xkcx51', + 'anvato_own_app_web_stage_1214ade5d28422c4dae9d03c1243aba0563c4dba': 'mzhamNac3swG4WsJAiUTacnGIODi6SWeVWk5D7ho', + 'anvato_own_app_web_prod_944e162ed927ec3e9ed13eb68ed2f1008ee7565e': '9TSxh6G2TXOLBoYm9ro3LdNjjvnXpKb8UR8KoIP9', + 'anvato_scripps_app_ftv_prod_a10a10468edd5afb16fb48171c03b956176afad1': 'COJ2i2UIPK7xZqIWswxe7FaVBOVgRkP1F6O6qGoH', + 'anvato_scripps_app_ftv_stage_77d3ad2bdb021ec37ca2e35eb09acd396a974c9a': 'Q7nnopNLe2PPfGLOTYBqxSaRpl209IhqaEuDZi1F', + 'anvato_univision_app_web_stage_551236ef07a0e17718c3995c35586b5ed8cb5031': 'D92PoLS6UitwxDRA191HUGT9OYcOjV6mPMa5wNyo', + 'anvato_univision_app_web_prod_039a5c0a6009e637ae8ac906718a79911e0e65e1': '5mVS5u4SQjtw6NGw2uhMbKEIONIiLqRKck5RwQLR', + 'nbcu_cnbc_springfield_ios_prod_670207fae43d6e9a94c351688851a2ce': 'M7fqCCIP9lW53oJbHs19OlJlpDrVyc2OL8gNeuTa', + 'nbcu_cnbc_springfieldvod_ios_prod_7a5f04b1ceceb0e9c9e2264a44aa236e08e034c2': 'Yia6QbJahW0S7K1I0drksimhZb4UFq92xLBmmMvk', + 'anvato_cox_app_web_prod_ce45cda237969f93e7130f50ee8bb6280c1484ab': 'cc0miZexpFtdoqZGvdhfXsLy7FXjRAOgb9V0f5fZ', + 'anvato_cox_app_web_stage_c23dbe016a8e9d8c7101d10172b92434f6088bf9': 'yivU3MYHd2eDZcOfmLbINVtqxyecKTOp8OjOuoGJ', + 'anvato_chnzero_app_web_stage_b1164d1352b579e792e542fddf13ee34c0eeb46b': 'A76QkXMmVH8lTCfU15xva1mZnSVcqeY4Xb22Kp7m', + 'anvato_chnzero_app_web_prod_253d358928dc08ec161eda2389d53707288a730c': 'OA5QI3ZWZZkdtUEDqh28AH8GedsF6FqzJI32596b', + 'anvato_discovery_vodpoc_web_stage_9fa7077b5e8af1f8355f65d4fb8d2e0e9d54e2b7': 'q3oT191tTQ5g3JCP67PkjLASI9s16DuWZ6fYmry3', + 'anvato_discovery_vodpoc_web_prod_688614983167a1af6cdf6d76343fda10a65223c1': 'qRvRQCTVHd0VVOHsMvvfidyWmlYVrTbjby7WqIuK', + 'nbcu_cnbc_springfieldvod_ftv_stage_826040aad1925a46ac5dfb4b3c5143e648c6a30d': 'JQaSb5a8Tz0PT4ti329DNmzDO30TnngTHmvX8Vua', + 'nbcu_cnbc_springfield_ftv_stage_826040aad1925a46ac5dfb4b3c5143e648c6a30d': 'JQaSb5a8Tz0PT4ti329DNmzDO30TnngTHmvX8Vua', + 'nbcu_nbcd_capture_web_stage_4dd9d585bfb984ebf856dee35db027b2465cc4ae': '0j1Ov4Vopyi2HpBZJYdL2m8ERJVGYh3nNpzPiO8F', + 'nbcu_nbcd_watch3_android_prod_7712ca5fcf1c22f19ec1870a9650f9c37db22dcf': '3LN2UB3rPUAMu7ZriWkHky9vpLMXYha8JbSnxBlx', + 'nbcu_nbcd_watchvod3_android_prod_0910a3a4692d57c0b5ff4316075bc5d096be45b9': 'mJagcQ2II30vUOAauOXne7ERwbf5S9nlB3IP17lQ', + 'anvato_scripps_app_atv_prod_790deda22e16e71e83df58f880cd389908a45d52': 'CB6trI1mpoDIM5o54DNTsji90NDBQPZ4z4RqBNSH', + 'nbcu_nbcd_watchv4_android_prod_ff67cef9cb409158c6f8c3533edddadd0b750507': 'j8CHQCUWjlYERj4NFRmUYOND85QNbHViH09UwuKm', + 'nbcu_nbcd_watchvodv4_android_prod_a814d781609989dea6a629d50ae4c7ad8cc8e907': 'rkVnUXxdA9rawVLUlDQtMue9Y4Q7lFEaIotcUhjt', + 'rvVKpA50qlOPLFxMjrCGf5pdkdQDm7qn': '1J7ZkY5Qz5lMLi93QOH9IveE7EYB3rLl', + 'nbcu_dtv_local_web_prod_b266cf49defe255fd4426a97e27c09e513e9f82f': 'HuLnJDqzLa4saCzYMJ79zDRSQpEduw1TzjMNQu2b', + 'nbcu_att_local_web_prod_4cef038b2d969a6b7d700a56a599040b6a619f67': 'Q0Em5VDc2KpydUrVwzWRXAwoNBulWUxCq2faK0AV', + 'nbcu_dish_local_web_prod_c56dcaf2da2e9157a4266c82a78195f1dd570f6b': 'bC1LWmRz9ayj2AlzizeJ1HuhTfIaJGsDBnZNgoRg', + 'nbcu_verizon_local_web_prod_88bebd2ce006d4ed980de8133496f9a74cb9b3e1': 'wzhDKJZpgvUSS1EQvpCQP8Q59qVzcPixqDGJefSk', + 'nbcu_charter_local_web_prod_9ad90f7fc4023643bb718f0fe0fd5beea2382a50': 'PyNbxNhEWLzy1ZvWEQelRuIQY88Eub7xbSVRMdfT', + 'nbcu_suddenlink_local_web_prod_20fb711725cac224baa1c1cb0b1c324d25e97178': '0Rph41lPXZbb3fqeXtHjjbxfSrNbtZp1Ygq7Jypa', + 'nbcu_wow_local_web_prod_652d9ce4f552d9c2e7b5b1ed37b8cb48155174ad': 'qayIBZ70w1dItm2zS42AptXnxW15mkjRrwnBjMPv', + 'nbcu_centurylink_local_web_prod_2034402b029bf3e837ad46814d9e4b1d1345ccd5': 'StePcPMkjsX51PcizLdLRMzxMEl5k2FlsMLUNV4k', + 'nbcu_atlanticbrd_local_web_prod_8d5f5ecbf7f7b2f5e6d908dd75d90ae3565f682e': 'NtYLb4TFUS0pRs3XTkyO5sbVGYjVf17bVbjaGscI', + 'nbcu_nbcd_watchvod_web_dev_08bc05699be47c4f31d5080263a8cfadc16d0f7c': 'hwxi2dgDoSWgfmVVXOYZm14uuvku4QfopstXckhr', + 'anvato_nextmedia_app_web_prod_a4fa8c7204aa65e71044b57aaf63711980cfe5a0': 'tQN1oGPYY1nM85rJYePWGcIb92TG0gSqoVpQTWOw', + 'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749': 'GUXNf5ZDX2jFUpu4WT2Go4DJ5nhUCzpnwDRRUx1K', + 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa': 'bLDYF8JqfG42b7bwKEgQiU9E2LTIAtnKzSgYpFUH', + 'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a': 'icgGoYGipQMMSEvhplZX1pwbN69srwKYWksz3xWK', + 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336': 'fA2iQdI7RDpynqzQYIpXALVS83NTPr8LLFK4LFsu', + 'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3': 'rMOUZqe9lwcGq2mNgG3EDusm6lKgsUnczoOX3mbg', + 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900': 'rMOUZqe9lwcGq2mNgG3EDusm6lKgsUnczoOX3mbg', + 'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99': 'P3uXJ0fXXditBPCGkfvlnVScpPEfKmc64Zv7ZgbK', + 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe': 'mGPvo5ZA5SgjOFAPEPXv7AnOpFUICX8hvFQVz69n', + 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582': 'qyT6PXXLjVNCrHaRVj0ugAhalNRS7Ee9BP7LUokD', + 'nbcu_nbcd_watchvodv4_web_stage_4108362fba2d4ede21f262fea3c4162cbafd66c7': 'DhaU5lj0W2gEdcSSsnxURq8t7KIWtJfD966crVDk', + 'anvato_scripps_app_ios_prod_409c41960c60b308db43c3cc1da79cab9f1c3d93': 'WPxj5GraLTkYCyj3M7RozLqIycjrXOEcDGFMIJPn', + 'EZqvRyKBJLrgpClDPDF8I7Xpdp40Vx73': '4OxGd2dEakylntVKjKF0UK9PDPYB6A9W', + 'M2v78QkpleXm9hPp9jUXI63x5vA6BogR': 'ka6K32k7ZALmpINkjJUGUo0OE42Md1BQ', + 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ' + } + + _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce' + + def __init__(self, *args, **kwargs): + super(AnvatoIE, self).__init__(*args, **kwargs) + self.__server_time = None + + def _server_time(self, access_key, video_id): + if self.__server_time is not None: + return self.__server_time + + self.__server_time = int(self._download_json( + self._api_prefix(access_key) + 'server_time?anvack=' + access_key, video_id, + note='Fetching server time')['server_time']) + + return self.__server_time + + def _api_prefix(self, access_key): + return 'https://tkx2-%s.anvato.net/rest/v2/' % ('prod' if 'prod' in access_key else 'stage') + + def _get_video_json(self, access_key, video_id): + # See et() in anvplayer.min.js, which is an alias of getVideoJSON() + video_data_url = self._api_prefix(access_key) + 'mcp/video/%s?anvack=%s' % (video_id, access_key) + server_time = self._server_time(access_key, video_id) + input_data = '%d~%s~%s' % (server_time, md5_text(video_data_url), md5_text(server_time)) + + auth_secret = intlist_to_bytes(aes_encrypt( + bytes_to_intlist(input_data[:64]), bytes_to_intlist(self._AUTH_KEY))) + + video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii') + anvrid = md5_text(time.time() * 1000 * random.random())[:30] + payload = { + 'api': { + 'anvrid': anvrid, + 'anvstk': md5_text('%s|%s|%d|%s' % ( + access_key, anvrid, server_time, self._ANVACK_TABLE[access_key])), + 'anvts': server_time, + }, + } + + return self._download_json( + video_data_url, video_id, transform_source=strip_jsonp, + data=json.dumps(payload).encode('utf-8')) + + def _extract_anvato_videos(self, webpage, video_id): + anvplayer_data = self._parse_json(self._html_search_regex( + r'<script[^>]+data-anvp=\'([^\']+)\'', webpage, + 'Anvato player data'), video_id) + + video_id = anvplayer_data['video'] + access_key = anvplayer_data['accessKey'] + + video_data = self._get_video_json(access_key, video_id) + + formats = [] + for published_url in video_data['published_urls']: + video_url = published_url['embed_url'] + ext = determine_ext(video_url) + + if ext == 'smil': + formats.extend(self._extract_smil_formats(video_url, video_id)) + continue + + tbr = int_or_none(published_url.get('kbps')) + a_format = { + 'url': video_url, + 'format_id': ('-'.join(filter(None, ['http', published_url.get('cdn_name')]))).lower(), + 'tbr': tbr if tbr != 0 else None, + } + + if ext == 'm3u8': + # Not using _extract_m3u8_formats here as individual media + # playlists are also included in published_urls. + if tbr is None: + formats.append(self._m3u8_meta_format(video_url, ext='mp4', m3u8_id='hls')) + continue + else: + a_format.update({ + 'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])), + 'ext': 'mp4', + }) + elif ext == 'mp3': + a_format['vcodec'] = 'none' + else: + a_format.update({ + 'width': int_or_none(published_url.get('width')), + 'height': int_or_none(published_url.get('height')), + }) + formats.append(a_format) + + self._sort_formats(formats) + + subtitles = {} + for caption in video_data.get('captions', []): + a_caption = { + 'url': caption['url'], + 'ext': 'tt' if caption.get('format') == 'SMPTE-TT' else None + } + subtitles.setdefault(caption['language'], []).append(a_caption) + + return { + 'id': video_id, + 'formats': formats, + 'title': video_data.get('def_title'), + 'description': video_data.get('def_description'), + 'categories': video_data.get('categories'), + 'thumbnail': video_data.get('thumbnail'), + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 881cacfab..f40532929 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -61,10 +61,7 @@ class ArteTvIE(InfoExtractor): } -class ArteTVPlus7IE(InfoExtractor): - IE_NAME = 'arte.tv:+7' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&]+)' - +class ArteTVBaseIE(InfoExtractor): @classmethod def _extract_url_info(cls, url): mobj = re.match(cls._VALID_URL, url) @@ -78,60 +75,6 @@ class ArteTVPlus7IE(InfoExtractor): video_id = mobj.group('id') return video_id, lang - def _real_extract(self, url): - video_id, lang = self._extract_url_info(url) - webpage = self._download_webpage(url, video_id) - return self._extract_from_webpage(webpage, video_id, lang) - - def _extract_from_webpage(self, webpage, video_id, lang): - patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']') - ids = (video_id, '') - # some pages contain multiple videos (like - # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D), - # so we first try to look for json URLs that contain the video id from - # the 'vid' parameter. - patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates] - json_url = self._html_search_regex( - patterns, webpage, 'json vp url', default=None) - if not json_url: - def find_iframe_url(webpage, default=NO_DEFAULT): - return self._html_search_regex( - r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1', - webpage, 'iframe url', group='url', default=default) - - iframe_url = find_iframe_url(webpage, None) - if not iframe_url: - embed_url = self._html_search_regex( - r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None) - if embed_url: - player = self._download_json( - embed_url, video_id, 'Downloading player page') - iframe_url = find_iframe_url(player['html']) - # en and es URLs produce react-based pages with different layout (e.g. - # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world) - if not iframe_url: - program = self._search_regex( - r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n', - webpage, 'program', default=None) - if program: - embed_html = self._parse_json(program, video_id) - if embed_html: - iframe_url = find_iframe_url(embed_html['embed_html']) - if iframe_url: - json_url = compat_parse_qs( - compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] - if json_url: - title = self._search_regex( - r'<h3[^>]+title=(["\'])(?P<title>.+?)\1', - webpage, 'title', default=None, group='title') - return self._extract_from_json_url(json_url, video_id, lang, title=title) - # Different kind of embed URL (e.g. - # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium) - embed_url = self._search_regex( - r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', - webpage, 'embed url', group='url') - return self.url_result(embed_url) - def _extract_from_json_url(self, json_url, video_id, lang, title=None): info = self._download_json(json_url, video_id) player_info = info['videoJsonPlayer'] @@ -161,24 +104,53 @@ class ArteTVPlus7IE(InfoExtractor): 'es': 'E[ESP]', } + langcode = LANGS.get(lang, lang) + formats = [] for format_id, format_dict in player_info['VSR'].items(): f = dict(format_dict) versionCode = f.get('versionCode') - langcode = LANGS.get(lang, lang) - lang_rexs = [r'VO?%s-' % re.escape(langcode), r'VO?.-ST%s$' % re.escape(langcode)] - lang_pref = None - if versionCode: - matched_lang_rexs = [r for r in lang_rexs if re.match(r, versionCode)] - lang_pref = -10 if not matched_lang_rexs else 10 * len(matched_lang_rexs) - source_pref = 0 - if versionCode is not None: - # The original version with subtitles has lower relevance - if re.match(r'VO-ST(F|A|E)', versionCode): - source_pref -= 10 - # The version with sourds/mal subtitles has also lower relevance - elif re.match(r'VO?(F|A|E)-STM\1', versionCode): - source_pref -= 9 + l = re.escape(langcode) + + # Language preference from most to least priority + # Reference: section 5.6.3 of + # http://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-05.pdf + PREFERENCES = ( + # original version in requested language, without subtitles + r'VO{0}$'.format(l), + # original version in requested language, with partial subtitles in requested language + r'VO{0}-ST{0}$'.format(l), + # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language + r'VO{0}-STM{0}$'.format(l), + # non-original (dubbed) version in requested language, without subtitles + r'V{0}$'.format(l), + # non-original (dubbed) version in requested language, with subtitles partial subtitles in requested language + r'V{0}-ST{0}$'.format(l), + # non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language + r'V{0}-STM{0}$'.format(l), + # original version in requested language, with partial subtitles in different language + r'VO{0}-ST(?!{0}).+?$'.format(l), + # original version in requested language, with subtitles for the deaf and hard-of-hearing in different language + r'VO{0}-STM(?!{0}).+?$'.format(l), + # original version in different language, with partial subtitles in requested language + r'VO(?:(?!{0}).+?)?-ST{0}$'.format(l), + # original version in different language, with subtitles for the deaf and hard-of-hearing in requested language + r'VO(?:(?!{0}).+?)?-STM{0}$'.format(l), + # original version in different language, without subtitles + r'VO(?:(?!{0}))?$'.format(l), + # original version in different language, with partial subtitles in different language + r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$'.format(l), + # original version in different language, with subtitles for the deaf and hard-of-hearing in different language + r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$'.format(l), + ) + + for pref, p in enumerate(PREFERENCES): + if re.match(p, versionCode): + lang_pref = len(PREFERENCES) - pref + break + else: + lang_pref = -1 + format = { 'format_id': format_id, 'preference': -10 if f.get('videoFormat') == 'M3U8' else None, @@ -188,7 +160,6 @@ class ArteTVPlus7IE(InfoExtractor): 'height': int_or_none(f.get('height')), 'tbr': int_or_none(f.get('bitrate')), 'quality': qfunc(f.get('quality')), - 'source_preference': source_pref, } if f.get('mediaType') == 'rtmp': @@ -207,6 +178,74 @@ class ArteTVPlus7IE(InfoExtractor): return info_dict +class ArteTVPlus7IE(ArteTVBaseIE): + IE_NAME = 'arte.tv:+7' + _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&]+)' + + _TESTS = [{ + 'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if ArteTVPlaylistIE.suitable(url) else super(ArteTVPlus7IE, cls).suitable(url) + + def _real_extract(self, url): + video_id, lang = self._extract_url_info(url) + webpage = self._download_webpage(url, video_id) + return self._extract_from_webpage(webpage, video_id, lang) + + def _extract_from_webpage(self, webpage, video_id, lang): + patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']') + ids = (video_id, '') + # some pages contain multiple videos (like + # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D), + # so we first try to look for json URLs that contain the video id from + # the 'vid' parameter. + patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates] + json_url = self._html_search_regex( + patterns, webpage, 'json vp url', default=None) + if not json_url: + def find_iframe_url(webpage, default=NO_DEFAULT): + return self._html_search_regex( + r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1', + webpage, 'iframe url', group='url', default=default) + + iframe_url = find_iframe_url(webpage, None) + if not iframe_url: + embed_url = self._html_search_regex( + r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None) + if embed_url: + player = self._download_json( + embed_url, video_id, 'Downloading player page') + iframe_url = find_iframe_url(player['html']) + # en and es URLs produce react-based pages with different layout (e.g. + # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world) + if not iframe_url: + program = self._search_regex( + r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n', + webpage, 'program', default=None) + if program: + embed_html = self._parse_json(program, video_id) + if embed_html: + iframe_url = find_iframe_url(embed_html['embed_html']) + if iframe_url: + json_url = compat_parse_qs( + compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] + if json_url: + title = self._search_regex( + r'<h3[^>]+title=(["\'])(?P<title>.+?)\1', + webpage, 'title', default=None, group='title') + return self._extract_from_json_url(json_url, video_id, lang, title=title) + # Different kind of embed URL (e.g. + # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium) + embed_url = self._search_regex( + r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', + webpage, 'embed url', group='url') + return self.url_result(embed_url) + + # It also uses the arte_vp_url url from the webpage to extract the information class ArteTVCreativeIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:creative' @@ -239,7 +278,7 @@ class ArteTVInfoIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:info' _VALID_URL = r'https?://info\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TEST = { + _TESTS = [{ 'url': 'http://info.arte.tv/fr/service-civique-un-cache-misere', 'info_dict': { 'id': '067528-000-A', @@ -247,7 +286,7 @@ class ArteTVInfoIE(ArteTVPlus7IE): 'title': 'Service civique, un cache misère ?', 'upload_date': '20160403', }, - } + }] class ArteTVFutureIE(ArteTVPlus7IE): @@ -272,6 +311,8 @@ class ArteTVDDCIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:ddc' _VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>[^/?#&]+)' + _TESTS = [] + def _real_extract(self, url): video_id, lang = self._extract_url_info(url) if lang == 'folge': @@ -290,7 +331,7 @@ class ArteTVConcertIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:concert' _VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)' - _TEST = { + _TESTS = [{ 'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde', 'md5': '9ea035b7bd69696b67aa2ccaaa218161', 'info_dict': { @@ -300,14 +341,14 @@ class ArteTVConcertIE(ArteTVPlus7IE): 'upload_date': '20140128', 'description': 'md5:486eb08f991552ade77439fe6d82c305', }, - } + }] class ArteTVCinemaIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:cinema' _VALID_URL = r'https?://cinema\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>.+)' - _TEST = { + _TESTS = [{ 'url': 'http://cinema.arte.tv/de/node/38291', 'md5': '6b275511a5107c60bacbeeda368c3aa1', 'info_dict': { @@ -317,7 +358,7 @@ class ArteTVCinemaIE(ArteTVPlus7IE): 'upload_date': '20160122', 'description': 'md5:7f749bbb77d800ef2be11d54529b96bc', }, - } + }] class ArteTVMagazineIE(ArteTVPlus7IE): @@ -362,9 +403,41 @@ class ArteTVEmbedIE(ArteTVPlus7IE): ) ''' + _TESTS = [] + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') lang = mobj.group('lang') json_url = mobj.group('json_url') return self._extract_from_json_url(json_url, video_id, lang) + + +class ArteTVPlaylistIE(ArteTVBaseIE): + IE_NAME = 'arte.tv:playlist' + _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/[^#]*#collection/(?P<id>PL-\d+)' + + _TESTS = [{ + 'url': 'http://www.arte.tv/guide/de/plus7/?country=DE#collection/PL-013263/ARTETV', + 'info_dict': { + 'id': 'PL-013263', + 'title': 'Areva & Uramin', + }, + 'playlist_mincount': 6, + }, { + 'url': 'http://www.arte.tv/guide/de/playlists?country=DE#collection/PL-013190/ARTETV', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id, lang = self._extract_url_info(url) + collection = self._download_json( + 'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos' + % (lang, playlist_id), playlist_id) + title = collection.get('title') + description = collection.get('shortDescription') or collection.get('teaserText') + entries = [ + self._extract_from_json_url( + video['jsonUrl'], video.get('programId') or playlist_id, lang) + for video in collection['videos'] if video.get('jsonUrl')] + return self.playlist_result(entries, playlist_id, title, description) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index c1ef8051d..991ab0676 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -29,7 +29,7 @@ class BandcampIE(InfoExtractor): '_skip': 'There is a limit of 200 free downloads / month for the test song' }, { 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', - 'md5': '2b68e5851514c20efdff2afc5603b8b4', + 'md5': '73d0b3171568232574e45652f8720b5c', 'info_dict': { 'id': '2650410135', 'ext': 'mp3', @@ -48,6 +48,10 @@ class BandcampIE(InfoExtractor): if m_trackinfo: json_code = m_trackinfo.group(1) data = json.loads(json_code)[0] + track_id = compat_str(data['id']) + + if not data.get('file'): + raise ExtractorError('Not streamable', video_id=track_id, expected=True) formats = [] for format_id, format_url in data['file'].items(): @@ -64,7 +68,7 @@ class BandcampIE(InfoExtractor): self._sort_formats(formats) return { - 'id': compat_str(data['id']), + 'id': track_id, 'title': data['title'], 'formats': formats, 'duration': float_or_none(data.get('duration')), diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 8baff2041..b17047b39 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -1,34 +1,42 @@ # coding: utf-8 from __future__ import unicode_literals +import calendar +import datetime import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_etree_fromstring, + compat_str, + compat_parse_qs, + compat_xml_parse_error, +) from ..utils import ( - int_or_none, - unescapeHTML, ExtractorError, + int_or_none, + float_or_none, xpath_text, ) class BiliBiliIE(InfoExtractor): - _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)(?:/index_(?P<page_num>\d+).html)?' + _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', - 'md5': '2c301e4dab317596e837c3e7633e7d86', + 'md5': '5f7d29e1a2872f3df0cf76b1f87d3788', 'info_dict': { 'id': '1554319', 'ext': 'flv', 'title': '【金坷垃】金泡沫', - 'duration': 308313, + 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', + 'duration': 308.067, + 'timestamp': 1398012660, 'upload_date': '20140420', 'thumbnail': 're:^https?://.+\.jpg', - 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', - 'timestamp': 1397983878, 'uploader': '菊子桑', + 'uploader_id': '156160', }, }, { 'url': 'http://www.bilibili.com/video/av1041170/', @@ -36,75 +44,186 @@ class BiliBiliIE(InfoExtractor): 'id': '1041170', 'title': '【BD1080P】刀语【诸神&异域】', 'description': '这是个神奇的故事~每个人不留弹幕不给走哦~切利哦!~', - 'uploader': '枫叶逝去', - 'timestamp': 1396501299, }, 'playlist_count': 9, + }, { + 'url': 'http://www.bilibili.com/video/av4808130/', + 'info_dict': { + 'id': '4808130', + 'title': '【长篇】哆啦A梦443【钉铛】', + 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', + }, + 'playlist': [{ + 'md5': '55cdadedf3254caaa0d5d27cf20a8f9c', + 'info_dict': { + 'id': '4808130_part1', + 'ext': 'flv', + 'title': '【长篇】哆啦A梦443【钉铛】', + 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', + 'timestamp': 1464564180, + 'upload_date': '20160529', + 'uploader': '喜欢拉面', + 'uploader_id': '151066', + }, + }, { + 'md5': '926f9f67d0c482091872fbd8eca7ea3d', + 'info_dict': { + 'id': '4808130_part2', + 'ext': 'flv', + 'title': '【长篇】哆啦A梦443【钉铛】', + 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', + 'timestamp': 1464564180, + 'upload_date': '20160529', + 'uploader': '喜欢拉面', + 'uploader_id': '151066', + }, + }, { + 'md5': '4b7b225b968402d7c32348c646f1fd83', + 'info_dict': { + 'id': '4808130_part3', + 'ext': 'flv', + 'title': '【长篇】哆啦A梦443【钉铛】', + 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', + 'timestamp': 1464564180, + 'upload_date': '20160529', + 'uploader': '喜欢拉面', + 'uploader_id': '151066', + }, + }, { + 'md5': '7b795e214166501e9141139eea236e91', + 'info_dict': { + 'id': '4808130_part4', + 'ext': 'flv', + 'title': '【长篇】哆啦A梦443【钉铛】', + 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', + 'timestamp': 1464564180, + 'upload_date': '20160529', + 'uploader': '喜欢拉面', + 'uploader_id': '151066', + }, + }], + }, { + # Missing upload time + 'url': 'http://www.bilibili.com/video/av1867637/', + 'info_dict': { + 'id': '2880301', + 'ext': 'flv', + 'title': '【HDTV】【喜剧】岳父岳母真难当 (2014)【法国票房冠军】', + 'description': '一个信奉天主教的法国旧式传统资产阶级家庭中有四个女儿。三个女儿却分别找了阿拉伯、犹太、中国丈夫,老夫老妻唯独期盼剩下未嫁的小女儿能找一个信奉天主教的法国白人,结果没想到小女儿找了一位非裔黑人……【这次应该不会跳帧了】', + 'uploader': '黑夜为猫', + 'uploader_id': '610729', + }, + 'params': { + # Just to test metadata extraction + 'skip_download': True, + }, + 'expected_warnings': ['upload time'], }] + # BiliBili blocks keys from time to time. The current key is extracted from + # the Android client + # TODO: find the sign algorithm used in the flash player + _APP_KEY = '86385cdc024c0f6c' + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - page_num = mobj.group('page_num') or '1' - view_data = self._download_json( - 'http://api.bilibili.com/view?type=json&appkey=8e9fc618fbd41e28&id=%s&page=%s' % (video_id, page_num), - video_id) - if 'error' in view_data: - raise ExtractorError('%s said: %s' % (self.IE_NAME, view_data['error']), expected=True) + webpage = self._download_webpage(url, video_id) - cid = view_data['cid'] - title = unescapeHTML(view_data['title']) + params = compat_parse_qs(self._search_regex( + [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', + r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'], + webpage, 'player parameters')) + cid = params['cid'][0] - doc = self._download_xml( - 'http://interface.bilibili.com/v_cdn_play?appkey=8e9fc618fbd41e28&cid=%s' % cid, - cid, - 'Downloading page %s/%s' % (page_num, view_data['pages']) - ) + info_xml_str = self._download_webpage( + 'http://interface.bilibili.com/v_cdn_play', + cid, query={'appkey': self._APP_KEY, 'cid': cid}, + note='Downloading video info page') + + err_msg = None + durls = None + info_xml = None + try: + info_xml = compat_etree_fromstring(info_xml_str.encode('utf-8')) + except compat_xml_parse_error: + info_json = self._parse_json(info_xml_str, video_id, fatal=False) + err_msg = (info_json or {}).get('error_text') + else: + err_msg = xpath_text(info_xml, './message') - if xpath_text(doc, './result') == 'error': - raise ExtractorError('%s said: %s' % (self.IE_NAME, xpath_text(doc, './message')), expected=True) + if info_xml is not None: + durls = info_xml.findall('./durl') + if not durls: + if err_msg: + raise ExtractorError('%s said: %s' % (self.IE_NAME, err_msg), expected=True) + else: + raise ExtractorError('No videos found!') entries = [] - for durl in doc.findall('./durl'): + for durl in durls: size = xpath_text(durl, ['./filesize', './size']) formats = [{ 'url': durl.find('./url').text, 'filesize': int_or_none(size), - 'ext': 'flv', }] - backup_urls = durl.find('./backup_url') - if backup_urls is not None: - for backup_url in backup_urls.findall('./url'): - formats.append({'url': backup_url.text}) - formats.reverse() + for backup_url in durl.findall('./backup_url/url'): + formats.append({ + 'url': backup_url.text, + # backup URLs have lower priorities + 'preference': -2 if 'hd.mp4' in backup_url.text else -3, + }) + + self._sort_formats(formats) entries.append({ 'id': '%s_part%s' % (cid, xpath_text(durl, './order')), - 'title': title, 'duration': int_or_none(xpath_text(durl, './length'), 1000), 'formats': formats, }) + title = self._html_search_regex('<h1[^>]+title="([^"]+)">', webpage, 'title') + description = self._html_search_meta('description', webpage) + datetime_str = self._html_search_regex( + r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', fatal=False) + timestamp = None + if datetime_str: + timestamp = calendar.timegm(datetime.datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M').timetuple()) + + # TODO 'view_count' requires deobfuscating Javascript info = { 'id': compat_str(cid), 'title': title, - 'description': view_data.get('description'), - 'thumbnail': view_data.get('pic'), - 'uploader': view_data.get('author'), - 'timestamp': int_or_none(view_data.get('created')), - 'view_count': int_or_none(view_data.get('play')), - 'duration': int_or_none(xpath_text(doc, './timelength')), + 'description': description, + 'timestamp': timestamp, + 'thumbnail': self._html_search_meta('thumbnailUrl', webpage), + 'duration': float_or_none(xpath_text(info_xml, './timelength'), scale=1000), } + uploader_mobj = re.search( + r'<a[^>]+href="https?://space\.bilibili\.com/(?P<id>\d+)"[^>]+title="(?P<name>[^"]+)"', + webpage) + if uploader_mobj: + info.update({ + 'uploader': uploader_mobj.group('name'), + 'uploader_id': uploader_mobj.group('id'), + }) + + for entry in entries: + entry.update(info) + if len(entries) == 1: - entries[0].update(info) return entries[0] else: - info.update({ + for idx, entry in enumerate(entries): + entry['id'] = '%s_part%d' % (video_id, (idx + 1)) + + return { '_type': 'multi_video', 'id': video_id, + 'title': title, + 'description': description, 'entries': entries, - }) - return info + } diff --git a/youtube_dl/extractor/biqle.py b/youtube_dl/extractor/biqle.py new file mode 100644 index 000000000..ae4579b33 --- /dev/null +++ b/youtube_dl/extractor/biqle.py @@ -0,0 +1,39 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class BIQLEIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?biqle\.(?:com|org|ru)/watch/(?P<id>-?\d+_\d+)' + _TESTS = [{ + 'url': 'http://www.biqle.ru/watch/847655_160197695', + 'md5': 'ad5f746a874ccded7b8f211aeea96637', + 'info_dict': { + 'id': '160197695', + 'ext': 'mp4', + 'title': 'Foo Fighters - The Pretender (Live at Wembley Stadium)', + 'uploader': 'Andrey Rogozin', + 'upload_date': '20110605', + } + }, { + 'url': 'https://biqle.org/watch/-44781847_168547604', + 'md5': '7f24e72af1db0edf7c1aaba513174f97', + 'info_dict': { + 'id': '168547604', + 'ext': 'mp4', + 'title': 'Ребенок в шоке от автоматической мойки', + 'uploader': 'Dmitry Kotov', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + embed_url = self._proto_relative_url(self._search_regex( + r'<iframe.+?src="((?:http:)?//daxab\.com/[^"]+)".*?></iframe>', webpage, 'embed url')) + + return { + '_type': 'url_transparent', + 'url': embed_url, + } diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index 13343bc25..bd538be50 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -17,6 +17,9 @@ class BloombergIE(InfoExtractor): 'title': 'Shah\'s Presentation on Foreign-Exchange Strategies', 'description': 'md5:a8ba0302912d03d246979735c17d2761', }, + 'params': { + 'format': 'best[format_id^=hds]', + }, }, { 'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets', 'only_matching': True, diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index f0781fc27..ef560b592 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -307,9 +307,10 @@ class BrightcoveLegacyIE(InfoExtractor): playlist_title=playlist_info['mediaCollectionDTO']['displayName']) def _extract_video_info(self, video_info): + video_id = compat_str(video_info['id']) publisher_id = video_info.get('publisherId') info = { - 'id': compat_str(video_info['id']), + 'id': video_id, 'title': video_info['displayName'].strip(), 'description': video_info.get('shortDescription'), 'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), @@ -331,7 +332,8 @@ class BrightcoveLegacyIE(InfoExtractor): url_comp = compat_urllib_parse_urlparse(url) if url_comp.path.endswith('.m3u8'): formats.extend( - self._extract_m3u8_formats(url, info['id'], 'mp4')) + self._extract_m3u8_formats( + url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) continue elif 'akamaihd.net' in url_comp.netloc: # This type of renditions are served through @@ -365,7 +367,7 @@ class BrightcoveLegacyIE(InfoExtractor): a_format.update({ 'format_id': 'hls%s' % ('-%s' % tbr if tbr else ''), 'ext': 'mp4', - 'protocol': 'm3u8', + 'protocol': 'm3u8_native', }) formats.append(a_format) @@ -395,7 +397,7 @@ class BrightcoveLegacyIE(InfoExtractor): return ad_info if 'url' not in info and not info.get('formats'): - raise ExtractorError('Unable to extract video url for %s' % info['id']) + raise ExtractorError('Unable to extract video url for %s' % video_id) return info @@ -442,6 +444,10 @@ class BrightcoveNewIE(InfoExtractor): # non numeric ref: prefixed video id 'url': 'http://players.brightcove.net/710858724001/default_default/index.html?videoId=ref:event-stream-356', 'only_matching': True, + }, { + # unavailable video without message but with error_code + 'url': 'http://players.brightcove.net/1305187701/c832abfb-641b-44eb-9da0-2fe76786505f_default/index.html?videoId=4377407326001', + 'only_matching': True, }] @staticmethod @@ -512,8 +518,9 @@ class BrightcoveNewIE(InfoExtractor): }) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - json_data = self._parse_json(e.cause.read().decode(), video_id) - raise ExtractorError(json_data[0]['message'], expected=True) + json_data = self._parse_json(e.cause.read().decode(), video_id)[0] + raise ExtractorError( + json_data.get('message') or json_data['error_code'], expected=True) raise title = json_data['name'].strip() @@ -527,7 +534,7 @@ class BrightcoveNewIE(InfoExtractor): if not src: continue formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', m3u8_id='hls', fatal=False)) + src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) elif source_type == 'application/dash+xml': if not src: continue diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py index dda98059e..3aec601f8 100644 --- a/youtube_dl/extractor/byutv.py +++ b/youtube_dl/extractor/byutv.py @@ -11,6 +11,7 @@ class BYUtvIE(InfoExtractor): _VALID_URL = r'^https?://(?:www\.)?byutv.org/watch/[0-9a-f-]+/(?P<video_id>[^/?#]+)' _TEST = { 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5', + 'md5': '05850eb8c749e2ee05ad5a1c34668493', 'info_dict': { 'id': 'studio-c-season-5-episode-5', 'ext': 'mp4', @@ -21,7 +22,8 @@ class BYUtvIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, + 'add_ie': ['Ooyala'], } def _real_extract(self, url): diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 25b2d4efe..61463f249 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -4,11 +4,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlparse from ..utils import ( ExtractorError, HEADRequest, unified_strdate, - url_basename, qualities, int_or_none, ) @@ -16,24 +16,38 @@ from ..utils import ( class CanalplusIE(InfoExtractor): IE_DESC = 'canalplus.fr, piwiplus.fr and d8.tv' - _VALID_URL = r'https?://(?:www\.(?P<site>canalplus\.fr|piwiplus\.fr|d8\.tv|itele\.fr)/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>[0-9]+))' + _VALID_URL = r'''(?x) + https?:// + (?: + (?: + (?:(?:www|m)\.)?canalplus\.fr| + (?:www\.)?piwiplus\.fr| + (?:www\.)?d8\.tv| + (?:www\.)?d17\.tv| + (?:www\.)?itele\.fr + )/(?:(?:[^/]+/)*(?P<display_id>[^/?#&]+))?(?:\?.*\bvid=(?P<vid>\d+))?| + player\.canalplus\.fr/#/(?P<id>\d+) + ) + + ''' _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/%s/%s?format=json' _SITE_ID_MAP = { - 'canalplus.fr': 'cplus', - 'piwiplus.fr': 'teletoon', - 'd8.tv': 'd8', - 'itele.fr': 'itele', + 'canalplus': 'cplus', + 'piwiplus': 'teletoon', + 'd8': 'd8', + 'd17': 'd17', + 'itele': 'itele', } _TESTS = [{ - 'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1263092', - 'md5': '12164a6f14ff6df8bd628e8ba9b10b78', + 'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1192814', + 'md5': '41f438a4904f7664b91b4ed0dec969dc', 'info_dict': { - 'id': '1263092', + 'id': '1192814', 'ext': 'mp4', - 'title': 'Le Zapping - 13/05/15', - 'description': 'md5:09738c0d06be4b5d06a0940edb0da73f', - 'upload_date': '20150513', + 'title': "L'Année du Zapping 2014 - L'Année du Zapping 2014", + 'description': "Toute l'année 2014 dans un Zapping exceptionnel !", + 'upload_date': '20150105', }, }, { 'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190', @@ -46,35 +60,45 @@ class CanalplusIE(InfoExtractor): }, 'skip': 'Only works from France', }, { - 'url': 'http://www.d8.tv/d8-docs-mags/pid6589-d8-campagne-intime.html', + 'url': 'http://www.d8.tv/d8-docs-mags/pid5198-d8-en-quete-d-actualite.html?vid=1390231', 'info_dict': { - 'id': '966289', - 'ext': 'flv', - 'title': 'Campagne intime - Documentaire exceptionnel', - 'description': 'md5:d2643b799fb190846ae09c61e59a859f', - 'upload_date': '20131108', + 'id': '1390231', + 'ext': 'mp4', + 'title': "Vacances pas chères : prix discount ou grosses dépenses ? - En quête d'actualité", + 'description': 'md5:edb6cf1cb4a1e807b5dd089e1ac8bfc6', + 'upload_date': '20160512', + }, + 'params': { + 'skip_download': True, }, - 'skip': 'videos get deleted after a while', }, { - 'url': 'http://www.itele.fr/france/video/aubervilliers-un-lycee-en-colere-111559', - 'md5': '38b8f7934def74f0d6f3ba6c036a5f82', + 'url': 'http://www.itele.fr/chroniques/invite-bruce-toussaint/thierry-solere-nicolas-sarkozy-officialisera-sa-candidature-a-la-primaire-quand-il-le-voudra-167224', 'info_dict': { - 'id': '1213714', + 'id': '1398334', 'ext': 'mp4', - 'title': 'Aubervilliers : un lycée en colère - Le 11/02/2015 à 06h45', - 'description': 'md5:8216206ec53426ea6321321f3b3c16db', - 'upload_date': '20150211', + 'title': "L'invité de Bruce Toussaint du 07/06/2016 - ", + 'description': 'md5:40ac7c9ad0feaeb6f605bad986f61324', + 'upload_date': '20160607', }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://m.canalplus.fr/?vid=1398231', + 'only_matching': True, + }, { + 'url': 'http://www.d17.tv/emissions/pid8303-lolywood.html?vid=1397061', + 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.groupdict().get('id') + video_id = mobj.groupdict().get('id') or mobj.groupdict().get('vid') - site_id = self._SITE_ID_MAP[mobj.group('site') or 'canal'] + site_id = self._SITE_ID_MAP[compat_urllib_parse_urlparse(url).netloc.rsplit('.', 2)[-2]] # Beware, some subclasses do not define an id group - display_id = url_basename(mobj.group('path')) + display_id = mobj.group('display_id') or video_id if video_id is None: webpage = self._download_webpage(url, display_id) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 68a0633b6..ff663d079 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -4,65 +4,66 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import js_to_json +from ..utils import ( + js_to_json, + smuggle_url, +) class CBCIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?:[^/]+/)+(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?!player/)(?:[^/]+/)+(?P<id>[^/?#]+)' _TESTS = [{ # with mediaId 'url': 'http://www.cbc.ca/22minutes/videos/clips-season-23/don-cherry-play-offs', + 'md5': '97e24d09672fc4cf56256d6faa6c25bc', 'info_dict': { 'id': '2682904050', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Don Cherry – All-Stars', 'description': 'Don Cherry has a bee in his bonnet about AHL player John Scott because that guy’s got heart.', - 'timestamp': 1454475540, + 'timestamp': 1454463000, 'upload_date': '20160203', - }, - 'params': { - # rtmp download - 'skip_download': True, + 'uploader': 'CBCC-NEW', }, }, { # with clipId 'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live', + 'md5': '0274a90b51a9b4971fe005c63f592f12', 'info_dict': { 'id': '2487345465', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Robin Williams freestyles on 90 Minutes Live', 'description': 'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC\'s 90 Minutes Live.', - 'upload_date': '19700101', + 'upload_date': '19780210', 'uploader': 'CBCC-NEW', - }, - 'params': { - # rtmp download - 'skip_download': True, + 'timestamp': 255977160, }, }, { # multiple iframes 'url': 'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot', 'playlist': [{ + 'md5': '377572d0b49c4ce0c9ad77470e0b96b4', 'info_dict': { 'id': '2680832926', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'An Eagle\'s-Eye View Off Burrard Bridge', 'description': 'Hercules the eagle flies from Vancouver\'s Burrard Bridge down to a nearby park with a mini-camera strapped to his back.', - 'upload_date': '19700101', + 'upload_date': '20160201', + 'timestamp': 1454342820, + 'uploader': 'CBCC-NEW', }, }, { + 'md5': '415a0e3f586113894174dfb31aa5bb1a', 'info_dict': { 'id': '2658915080', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Fly like an eagle!', 'description': 'Eagle equipped with a mini camera flies from the world\'s tallest tower', - 'upload_date': '19700101', + 'upload_date': '20150315', + 'timestamp': 1426443984, + 'uploader': 'CBCC-NEW', }, }], - 'params': { - # rtmp download - 'skip_download': True, - }, }] @classmethod @@ -91,24 +92,54 @@ class CBCIE(InfoExtractor): class CBCPlayerIE(InfoExtractor): _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.cbc.ca/player/play/2683190193', + 'md5': '64d25f841ddf4ddb28a235338af32e2c', 'info_dict': { 'id': '2683190193', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Gerry Runs a Sweat Shop', 'description': 'md5:b457e1c01e8ff408d9d801c1c2cd29b0', - 'timestamp': 1455067800, + 'timestamp': 1455071400, 'upload_date': '20160210', + 'uploader': 'CBCC-NEW', }, - 'params': { - # rtmp download - 'skip_download': True, + }, { + # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/ + 'url': 'http://www.cbc.ca/player/play/2657631896', + 'md5': 'e5e708c34ae6fca156aafe17c43e8b75', + 'info_dict': { + 'id': '2657631896', + 'ext': 'mp3', + 'title': 'CBC Montreal is organizing its first ever community hackathon!', + 'description': 'The modern technology we tend to depend on so heavily, is never without it\'s share of hiccups and headaches. Next weekend - CBC Montreal will be getting members of the public for its first Hackathon.', + 'timestamp': 1425704400, + 'upload_date': '20150307', + 'uploader': 'CBCC-NEW', }, - } + }, { + # available only when we add `formats=MPEG4,FLV,MP3` to theplatform url + 'url': 'http://www.cbc.ca/player/play/2164402062', + 'md5': '17a61eb813539abea40618d6323a7f82', + 'info_dict': { + 'id': '2164402062', + 'ext': 'flv', + 'title': 'Cancer survivor four times over', + 'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.', + 'timestamp': 1320410746, + 'upload_date': '20111104', + 'uploader': 'CBCC-NEW', + }, + }] def _real_extract(self, url): video_id = self._match_id(url) - return self.url_result( - 'http://feed.theplatform.com/f/ExhSPC/vms_5akSXx4Ng_Zn?byGuid=%s' % video_id, - 'ThePlatformFeed', video_id) + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': smuggle_url( + 'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/%s?mbr=true&formats=MPEG4,FLV,MP3' % video_id, { + 'force_smil_url': True + }), + 'id': video_id, + } diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 051d783a2..ac2c7dced 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals +import re + from .theplatform import ThePlatformIE from ..utils import ( xpath_text, @@ -21,7 +23,7 @@ class CBSBaseIE(ThePlatformIE): class CBSIE(CBSBaseIE): - _VALID_URL = r'https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P<id>[^/]+)' + _VALID_URL = r'(?:cbs:(?P<content_id>\w+)|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P<display_id>[^/]+))' _TESTS = [{ 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', @@ -66,11 +68,12 @@ class CBSIE(CBSBaseIE): TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - content_id = self._search_regex( - [r"video\.settings\.content_id\s*=\s*'([^']+)';", r"cbsplayer\.contentId\s*=\s*'([^']+)';"], - webpage, 'content id') + content_id, display_id = re.match(self._VALID_URL, url).groups() + if not content_id: + webpage = self._download_webpage(url, display_id) + content_id = self._search_regex( + [r"video\.settings\.content_id\s*=\s*'([^']+)';", r"cbsplayer\.contentId\s*=\s*'([^']+)';"], + webpage, 'content id') items_data = self._download_xml( 'http://can.cbs.com/thunder/player/videoPlayerService.php', content_id, query={'partner': 'cbs', 'contentId': content_id}) diff --git a/youtube_dl/extractor/cbslocal.py b/youtube_dl/extractor/cbslocal.py new file mode 100644 index 000000000..74adb38a6 --- /dev/null +++ b/youtube_dl/extractor/cbslocal.py @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import calendar +import datetime + +from .anvato import AnvatoIE +from .sendtonews import SendtoNewsIE +from ..compat import compat_urlparse + + +class CBSLocalIE(AnvatoIE): + _VALID_URL = r'https?://[a-z]+\.cbslocal\.com/\d+/\d+/\d+/(?P<id>[0-9a-z-]+)' + + _TESTS = [{ + # Anvato backend + 'url': 'http://losangeles.cbslocal.com/2016/05/16/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis', + 'md5': 'f0ee3081e3843f575fccef901199b212', + 'info_dict': { + 'id': '3401037', + 'ext': 'mp4', + 'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'', + 'description': 'Collapsing seats have been the focus of scrutiny for decades, though experts say remarkably little has been done to address the issue. Randy Paige reports.', + 'thumbnail': 're:^https?://.*', + 'timestamp': 1463440500, + 'upload_date': '20160516', + 'subtitles': { + 'en': 'mincount:5', + }, + 'categories': [ + 'Stations\\Spoken Word\\KCBSTV', + 'Syndication\\MSN', + 'Syndication\\NDN', + 'Syndication\\AOL', + 'Syndication\\Yahoo', + 'Syndication\\Tribune', + 'Syndication\\Curb.tv', + 'Content\\News' + ], + }, + }, { + # SendtoNews embed + 'url': 'http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/', + 'info_dict': { + 'id': 'GxfCe0Zo7D-175909-5588', + 'ext': 'mp4', + 'title': 'Recap: CLE 15, CIN 6', + 'description': '5/16/16: Indians\' bats explode for 15 runs in a win', + 'upload_date': '20160516', + 'timestamp': 1463433840, + 'duration': 49, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + sendtonews_url = SendtoNewsIE._extract_url(webpage) + if sendtonews_url: + info_dict = { + '_type': 'url_transparent', + 'url': compat_urlparse.urljoin(url, sendtonews_url), + } + else: + info_dict = self._extract_anvato_videos(webpage, display_id) + + time_str = self._html_search_regex( + r'class="entry-date">([^<]+)<', webpage, 'released date', fatal=False) + timestamp = None + if time_str: + timestamp = calendar.timegm(datetime.datetime.strptime( + time_str, '%b %d, %Y %I:%M %p').timetuple()) + + info_dict.update({ + 'display_id': display_id, + 'timestamp': timestamp, + }) + + return info_dict diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 6652c8e42..5a58d1777 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -33,20 +33,34 @@ class CeskaTelevizeIE(InfoExtractor): 'skip_download': True, }, }, { - 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina', + 'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en', 'info_dict': { - 'id': '61924494876844374', + 'id': '61924494877028507', 'ext': 'mp4', - 'title': 'První republika: Zpěvačka z Dupárny Bobina', - 'description': 'Sága mapující atmosféru první republiky od r. 1918 do r. 1945.', + 'title': 'Hyde Park Civilizace: Bonus 01 - En', + 'description': 'English Subtittles', 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 88.4, + 'duration': 81.3, }, 'params': { # m3u8 download 'skip_download': True, }, }, { + # live stream + 'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/', + 'info_dict': { + 'id': 402, + 'ext': 'mp4', + 'title': 're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'is_live': True, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'Georestricted to Czech Republic', + }, { # video with 18+ caution trailer 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', 'info_dict': { @@ -118,19 +132,21 @@ class CeskaTelevizeIE(InfoExtractor): req = sanitized_Request(compat_urllib_parse_unquote(playlist_url)) req.add_header('Referer', url) - playlist_title = self._og_search_title(webpage) - playlist_description = self._og_search_description(webpage) + playlist_title = self._og_search_title(webpage, default=None) + playlist_description = self._og_search_description(webpage, default=None) playlist = self._download_json(req, playlist_id)['playlist'] playlist_len = len(playlist) entries = [] for item in playlist: + is_live = item.get('type') == 'LIVE' formats = [] for format_id, stream_url in item['streamUrls'].items(): formats.extend(self._extract_m3u8_formats( stream_url, playlist_id, 'mp4', - entry_protocol='m3u8_native', fatal=False)) + entry_protocol='m3u8' if is_live else 'm3u8_native', + fatal=False)) self._sort_formats(formats) item_id = item.get('id') or item['assetId'] @@ -145,14 +161,22 @@ class CeskaTelevizeIE(InfoExtractor): if subs: subtitles = self.extract_subtitles(episode_id, subs) + if playlist_len == 1: + final_title = playlist_title or title + if is_live: + final_title = self._live_title(final_title) + else: + final_title = '%s (%s)' % (playlist_title, title) + entries.append({ 'id': item_id, - 'title': playlist_title if playlist_len == 1 else '%s (%s)' % (playlist_title, title), + 'title': final_title, 'description': playlist_description if playlist_len == 1 else None, 'thumbnail': thumbnail, 'duration': duration, 'formats': formats, 'subtitles': subtitles, + 'is_live': is_live, }) return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index c74553dcf..34d4e6156 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -20,54 +20,64 @@ class Channel9IE(InfoExtractor): ''' IE_DESC = 'Channel 9' IE_NAME = 'channel9' - _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?' - - _TESTS = [ - { - 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', - 'md5': 'bbd75296ba47916b754e73c3a4bbdf10', - 'info_dict': { - 'id': 'Events/TechEd/Australia/2013/KOS002', - 'ext': 'mp4', - 'title': 'Developer Kick-Off Session: Stuff We Love', - 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f', - 'duration': 4576, - 'thumbnail': 're:http://.*\.jpg', - 'session_code': 'KOS002', - 'session_day': 'Day 1', - 'session_room': 'Arena 1A', - 'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'], - }, + _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)' + + _TESTS = [{ + 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', + 'md5': 'bbd75296ba47916b754e73c3a4bbdf10', + 'info_dict': { + 'id': 'Events/TechEd/Australia/2013/KOS002', + 'ext': 'mp4', + 'title': 'Developer Kick-Off Session: Stuff We Love', + 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f', + 'duration': 4576, + 'thumbnail': 're:http://.*\.jpg', + 'session_code': 'KOS002', + 'session_day': 'Day 1', + 'session_room': 'Arena 1A', + 'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', + 'Mads Kristensen'], }, - { - 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', - 'md5': 'b43ee4529d111bc37ba7ee4f34813e68', - 'info_dict': { - 'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing', - 'ext': 'mp4', - 'title': 'Self-service BI with Power BI - nuclear testing', - 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', - 'duration': 1540, - 'thumbnail': 're:http://.*\.jpg', - 'authors': ['Mike Wilmot'], - }, + }, { + 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', + 'md5': 'b43ee4529d111bc37ba7ee4f34813e68', + 'info_dict': { + 'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing', + 'ext': 'mp4', + 'title': 'Self-service BI with Power BI - nuclear testing', + 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', + 'duration': 1540, + 'thumbnail': 're:http://.*\.jpg', + 'authors': ['Mike Wilmot'], }, - { - # low quality mp4 is best - 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', - 'info_dict': { - 'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', - 'ext': 'mp4', - 'title': 'Ranges for the Standard Library', - 'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d', - 'duration': 5646, - 'thumbnail': 're:http://.*\.jpg', - }, - 'params': { - 'skip_download': True, - }, - } - ] + }, { + # low quality mp4 is best + 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', + 'info_dict': { + 'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', + 'ext': 'mp4', + 'title': 'Ranges for the Standard Library', + 'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d', + 'duration': 5646, + 'thumbnail': 're:http://.*\.jpg', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS', + 'info_dict': { + 'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b', + 'title': 'Channel 9', + }, + 'playlist_count': 2, + }, { + 'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS', + 'only_matching': True, + }, { + 'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman', + 'only_matching': True, + }] _RSS_URL = 'http://channel9.msdn.com/%s/RSS' @@ -254,22 +264,30 @@ class Channel9IE(InfoExtractor): return self.playlist_result(contents) - def _extract_list(self, content_path): - rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS') + def _extract_list(self, video_id, rss_url=None): + if not rss_url: + rss_url = self._RSS_URL % video_id + rss = self._download_xml(rss_url, video_id, 'Downloading RSS') entries = [self.url_result(session_url.text, 'Channel9') for session_url in rss.findall('./channel/item/link')] title_text = rss.find('./channel/title').text - return self.playlist_result(entries, content_path, title_text) + return self.playlist_result(entries, video_id, title_text) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) content_path = mobj.group('contentpath') + rss = mobj.group('rss') + + if rss: + return self._extract_list(content_path, url) - webpage = self._download_webpage(url, content_path, 'Downloading web page') + webpage = self._download_webpage( + url, content_path, 'Downloading web page') - page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage) - if page_type_m is not None: - page_type = page_type_m.group('pagetype') + page_type = self._search_regex( + r'<meta[^>]+name=(["\'])WT\.entryid\1[^>]+content=(["\'])(?P<pagetype>[^:]+).+?\2', + webpage, 'page type', default=None, group='pagetype') + if page_type: if page_type == 'Entry': # Any 'item'-like page, may contain downloadable content return self._extract_entry_item(webpage, content_path) elif page_type == 'Session': # Event session page, may contain downloadable content @@ -278,6 +296,5 @@ class Channel9IE(InfoExtractor): return self._extract_list(content_path) else: raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True) - else: # Assuming list return self._extract_list(content_path) diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py deleted file mode 100644 index 042c4f2f1..000000000 --- a/youtube_dl/extractor/cinemassacre.py +++ /dev/null @@ -1,119 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ExtractorError -from .screenwavemedia import ScreenwaveMediaIE - - -class CinemassacreIE(InfoExtractor): - _VALID_URL = 'https?://(?:www\.)?cinemassacre\.com/(?P<date_y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/(?P<display_id>[^?#/]+)' - _TESTS = [ - { - 'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', - 'md5': 'fde81fbafaee331785f58cd6c0d46190', - 'info_dict': { - 'id': 'Cinemassacre-19911', - 'ext': 'mp4', - 'upload_date': '20121110', - 'title': '“Angry Video Game Nerd: The Movie” – Trailer', - 'description': 'md5:fb87405fcb42a331742a0dce2708560b', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - 'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', - 'md5': 'd72f10cd39eac4215048f62ab477a511', - 'info_dict': { - 'id': 'Cinemassacre-521be8ef82b16', - 'ext': 'mp4', - 'upload_date': '20131002', - 'title': 'The Mummy’s Hand (1940)', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - # Youtube embedded video - 'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/', - 'md5': 'ec9838a5520ef5409b3e4e42fcb0a3b9', - 'info_dict': { - 'id': 'OEVzPCY2T-g', - 'ext': 'webm', - 'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles', - 'upload_date': '20061207', - 'uploader': 'Cinemassacre', - 'uploader_id': 'JamesNintendoNerd', - 'description': 'md5:784734696c2b8b7f4b8625cc799e07f6', - } - }, - { - # Youtube embedded video - 'url': 'http://cinemassacre.com/2006/09/01/mckids/', - 'md5': '7393c4e0f54602ad110c793eb7a6513a', - 'info_dict': { - 'id': 'FnxsNhuikpo', - 'ext': 'webm', - 'upload_date': '20060901', - 'uploader': 'Cinemassacre Extra', - 'description': 'md5:de9b751efa9e45fbaafd9c8a1123ed53', - 'uploader_id': 'Cinemassacre', - 'title': 'AVGN: McKids', - } - }, - { - 'url': 'http://cinemassacre.com/2015/05/25/mario-kart-64-nintendo-64-james-mike-mondays/', - 'md5': '1376908e49572389e7b06251a53cdd08', - 'info_dict': { - 'id': 'Cinemassacre-555779690c440', - 'ext': 'mp4', - 'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!', - 'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays', - 'upload_date': '20150525', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') - video_date = mobj.group('date_y') + mobj.group('date_m') + mobj.group('date_d') - - webpage = self._download_webpage(url, display_id) - - playerdata_url = self._search_regex( - [ - ScreenwaveMediaIE.EMBED_PATTERN, - r'<iframe[^>]+src="(?P<url>(?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"', - ], - webpage, 'player data URL', default=None, group='url') - if not playerdata_url: - raise ExtractorError('Unable to find player data') - - video_title = self._html_search_regex( - r'<title>(?P<title>.+?)\|', webpage, 'title') - video_description = self._html_search_regex( - r'<div class="entry-content">(?P<description>.+?)</div>', - webpage, 'description', flags=re.DOTALL, fatal=False) - video_thumbnail = self._og_search_thumbnail(webpage) - - return { - '_type': 'url_transparent', - 'display_id': display_id, - 'title': video_title, - 'description': video_description, - 'upload_date': video_date, - 'thumbnail': video_thumbnail, - 'url': playerdata_url, - } diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py deleted file mode 100644 index 002b24037..000000000 --- a/youtube_dl/extractor/collegehumor.py +++ /dev/null @@ -1,101 +0,0 @@ -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from ..utils import int_or_none - - -class CollegeHumorIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P<videoid>[0-9]+)/?(?P<shorttitle>.*)$' - - _TESTS = [ - { - 'url': 'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe', - 'md5': 'dcc0f5c1c8be98dc33889a191f4c26bd', - 'info_dict': { - 'id': '6902724', - 'ext': 'mp4', - 'title': 'Comic-Con Cosplay Catastrophe', - 'description': "Fans get creative this year at San Diego. Too creative. And yes, that's really Joss Whedon.", - 'age_limit': 13, - 'duration': 187, - }, - }, { - 'url': 'http://www.collegehumor.com/video/3505939/font-conference', - 'md5': '72fa701d8ef38664a4dbb9e2ab721816', - 'info_dict': { - 'id': '3505939', - 'ext': 'mp4', - 'title': 'Font Conference', - 'description': "This video wasn't long enough, so we made it double-spaced.", - 'age_limit': 10, - 'duration': 179, - }, - }, { - # embedded youtube video - 'url': 'http://www.collegehumor.com/embed/6950306', - 'info_dict': { - 'id': 'Z-bao9fg6Yc', - 'ext': 'mp4', - 'title': 'Young Americans Think President John F. Kennedy Died THIS MORNING IN A CAR ACCIDENT!!!', - 'uploader': 'Mark Dice', - 'uploader_id': 'MarkDice', - 'description': 'md5:62c3dab9351fac7bb44b53b69511d87f', - 'upload_date': '20140127', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Youtube'], - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('videoid') - - jsonUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id + '.json' - data = json.loads(self._download_webpage( - jsonUrl, video_id, 'Downloading info JSON')) - vdata = data['video'] - if vdata.get('youtubeId') is not None: - return { - '_type': 'url', - 'url': vdata['youtubeId'], - 'ie_key': 'Youtube', - } - - AGE_LIMITS = {'nc17': 18, 'r': 18, 'pg13': 13, 'pg': 10, 'g': 0} - rating = vdata.get('rating') - if rating: - age_limit = AGE_LIMITS.get(rating.lower()) - else: - age_limit = None # None = No idea - - PREFS = {'high_quality': 2, 'low_quality': 0} - formats = [] - for format_key in ('mp4', 'webm'): - for qname, qurl in vdata.get(format_key, {}).items(): - formats.append({ - 'format_id': format_key + '_' + qname, - 'url': qurl, - 'format': format_key, - 'preference': PREFS.get(qname), - }) - self._sort_formats(formats) - - duration = int_or_none(vdata.get('duration'), 1000) - like_count = int_or_none(vdata.get('likes')) - - return { - 'id': video_id, - 'title': vdata['title'], - 'description': vdata.get('description'), - 'thumbnail': vdata.get('thumbnail'), - 'formats': formats, - 'age_limit': age_limit, - 'duration': duration, - 'like_count': like_count, - } diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 0c59102e0..2b6aaa3aa 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -44,10 +44,10 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor): # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524 _VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow) |https?://(:www\.)? - (?P<showname>thedailyshow|thecolbertreport)\.(?:cc\.)?com/ + (?P<showname>thedailyshow|thecolbertreport|tosh)\.(?:cc\.)?com/ ((?:full-)?episodes/(?:[0-9a-z]{6}/)?(?P<episode>.*)| (?P<clip> - (?:(?:guests/[^/]+|videos|video-playlists|special-editions|news-team/[^/]+)/[^/]+/(?P<videotitle>[^/?#]+)) + (?:(?:guests/[^/]+|videos|video-(?:clips|playlists)|special-editions|news-team/[^/]+)/[^/]+/(?P<videotitle>[^/?#]+)) |(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?)) |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)) )| @@ -129,6 +129,9 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor): }, { 'url': 'http://thedailyshow.cc.com/news-team/michael-che/7wnfel/we-need-to-talk-about-israel', 'only_matching': True, + }, { + 'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans', + 'only_matching': True, }] _available_formats = ['3500', '2200', '1700', '1200', '750', '400'] diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 0843d89af..bfd432160 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -45,6 +45,7 @@ from ..utils import ( unescapeHTML, unified_strdate, url_basename, + xpath_element, xpath_text, xpath_with_ns, determine_protocol, @@ -987,7 +988,7 @@ class InfoExtractor(object): def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None, transform_source=lambda s: fix_xml_ampersands(s).strip(), - fatal=True): + fatal=True, m3u8_id=None): manifest = self._download_xml( manifest_url, video_id, 'Downloading f4m manifest', 'Unable to download f4m manifest', @@ -1001,11 +1002,11 @@ class InfoExtractor(object): return self._parse_f4m_formats( manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id, - transform_source=transform_source, fatal=fatal) + transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id) def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None, transform_source=lambda s: fix_xml_ampersands(s).strip(), - fatal=True): + fatal=True, m3u8_id=None): # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0') if akamai_pv is not None and ';' in akamai_pv.text: @@ -1029,9 +1030,26 @@ class InfoExtractor(object): 'base URL', default=None) if base_url: base_url = base_url.strip() + + bootstrap_info = xpath_element( + manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'], + 'bootstrap info', default=None) + for i, media_el in enumerate(media_nodes): - if manifest_version == '2.0': - media_url = media_el.attrib.get('href') or media_el.attrib.get('url') + tbr = int_or_none(media_el.attrib.get('bitrate')) + width = int_or_none(media_el.attrib.get('width')) + height = int_or_none(media_el.attrib.get('height')) + format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])) + # If <bootstrapInfo> is present, the specified f4m is a + # stream-level manifest, and only set-level manifests may refer to + # external resources. See section 11.4 and section 4 of F4M spec + if bootstrap_info is None: + media_url = None + # @href is introduced in 2.0, see section 11.6 of F4M spec + if manifest_version == '2.0': + media_url = media_el.attrib.get('href') + if media_url is None: + media_url = media_el.attrib.get('url') if not media_url: continue manifest_url = ( @@ -1041,29 +1059,43 @@ class InfoExtractor(object): # since bitrates in parent manifest (this one) and media_url manifest # may differ leading to inability to resolve the format by requested # bitrate in f4m downloader - if determine_ext(manifest_url) == 'f4m': - formats.extend(self._extract_f4m_formats( + ext = determine_ext(manifest_url) + if ext == 'f4m': + f4m_formats = self._extract_f4m_formats( manifest_url, video_id, preference=preference, f4m_id=f4m_id, - transform_source=transform_source, fatal=fatal)) + transform_source=transform_source, fatal=fatal) + # Sometimes stream-level manifest contains single media entry that + # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player). + # At the same time parent's media entry in set-level manifest may + # contain it. We will copy it from parent in such cases. + if len(f4m_formats) == 1: + f = f4m_formats[0] + f.update({ + 'tbr': f.get('tbr') or tbr, + 'width': f.get('width') or width, + 'height': f.get('height') or height, + 'format_id': f.get('format_id') if not tbr else format_id, + }) + formats.extend(f4m_formats) + continue + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', preference=preference, + m3u8_id=m3u8_id, fatal=fatal)) continue - tbr = int_or_none(media_el.attrib.get('bitrate')) formats.append({ - 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])), + 'format_id': format_id, 'url': manifest_url, - 'ext': 'flv', + 'ext': 'flv' if bootstrap_info is not None else None, 'tbr': tbr, - 'width': int_or_none(media_el.attrib.get('width')), - 'height': int_or_none(media_el.attrib.get('height')), + 'width': width, + 'height': height, 'preference': preference, }) return formats - def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, - entry_protocol='m3u8', preference=None, - m3u8_id=None, note=None, errnote=None, - fatal=True, live=False): - - formats = [{ + def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None): + return { 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), 'url': m3u8_url, 'ext': ext, @@ -1071,7 +1103,14 @@ class InfoExtractor(object): 'preference': preference - 1 if preference else -1, 'resolution': 'multiple', 'format_note': 'Quality selection URL', - }] + } + + def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, + entry_protocol='m3u8', preference=None, + m3u8_id=None, note=None, errnote=None, + fatal=True, live=False): + + formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)] format_url = lambda u: ( u @@ -1138,12 +1177,15 @@ class InfoExtractor(object): format_id = [] if m3u8_id: format_id.append(m3u8_id) - last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None + last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') not in ('SUBTITLES', 'CLOSED-CAPTIONS') else None + # Despite specification does not mention NAME attribute for + # EXT-X-STREAM-INF it still sometimes may be present + stream_name = last_info.get('NAME') or last_media_name # Bandwidth of live streams may differ over time thus making # format_id unpredictable. So it's better to keep provided # format_id intact. if not live: - format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats))) + format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats))) f = { 'format_id': '-'.join(format_id), 'url': format_url(line.strip()), @@ -1275,21 +1317,21 @@ class InfoExtractor(object): m3u8_count = 0 srcs = [] - videos = smil.findall(self._xpath_ns('.//video', namespace)) - for video in videos: - src = video.get('src') + media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace)) + for medium in media: + src = medium.get('src') if not src or src in srcs: continue srcs.append(src) - bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) - filesize = int_or_none(video.get('size') or video.get('fileSize')) - width = int_or_none(video.get('width')) - height = int_or_none(video.get('height')) - proto = video.get('proto') - ext = video.get('ext') + bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000) + filesize = int_or_none(medium.get('size') or medium.get('fileSize')) + width = int_or_none(medium.get('width')) + height = int_or_none(medium.get('height')) + proto = medium.get('proto') + ext = medium.get('ext') src_ext = determine_ext(src) - streamer = video.get('streamer') or base + streamer = medium.get('streamer') or base if proto == 'rtmp' or streamer.startswith('rtmp'): rtmp_count += 1 diff --git a/youtube_dl/extractor/coub.py b/youtube_dl/extractor/coub.py new file mode 100644 index 000000000..a901b8d22 --- /dev/null +++ b/youtube_dl/extractor/coub.py @@ -0,0 +1,143 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + parse_iso8601, + qualities, +) + + +class CoubIE(InfoExtractor): + _VALID_URL = r'(?:coub:|https?://(?:coub\.com/(?:view|embed|coubs)/|c-cdn\.coub\.com/fb-player\.swf\?.*\bcoub(?:ID|id)=))(?P<id>[\da-z]+)' + + _TESTS = [{ + 'url': 'http://coub.com/view/5u5n1', + 'info_dict': { + 'id': '5u5n1', + 'ext': 'mp4', + 'title': 'The Matrix Moonwalk', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 4.6, + 'timestamp': 1428527772, + 'upload_date': '20150408', + 'uploader': 'Артём Лоскутников', + 'uploader_id': 'artyom.loskutnikov', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'age_limit': 0, + }, + }, { + 'url': 'http://c-cdn.coub.com/fb-player.swf?bot_type=vk&coubID=7w5a4', + 'only_matching': True, + }, { + 'url': 'coub:5u5n1', + 'only_matching': True, + }, { + # longer video id + 'url': 'http://coub.com/view/237d5l5h', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + coub = self._download_json( + 'http://coub.com/api/v2/coubs/%s.json' % video_id, video_id) + + if coub.get('error'): + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, coub['error']), expected=True) + + title = coub['title'] + + file_versions = coub['file_versions'] + + QUALITIES = ('low', 'med', 'high') + + MOBILE = 'mobile' + IPHONE = 'iphone' + HTML5 = 'html5' + + SOURCE_PREFERENCE = (MOBILE, IPHONE, HTML5) + + quality_key = qualities(QUALITIES) + preference_key = qualities(SOURCE_PREFERENCE) + + formats = [] + + for kind, items in file_versions.get(HTML5, {}).items(): + if kind not in ('video', 'audio'): + continue + if not isinstance(items, dict): + continue + for quality, item in items.items(): + if not isinstance(item, dict): + continue + item_url = item.get('url') + if not item_url: + continue + formats.append({ + 'url': item_url, + 'format_id': '%s-%s-%s' % (HTML5, kind, quality), + 'filesize': int_or_none(item.get('size')), + 'vcodec': 'none' if kind == 'audio' else None, + 'quality': quality_key(quality), + 'preference': preference_key(HTML5), + }) + + iphone_url = file_versions.get(IPHONE, {}).get('url') + if iphone_url: + formats.append({ + 'url': iphone_url, + 'format_id': IPHONE, + 'preference': preference_key(IPHONE), + }) + + mobile_url = file_versions.get(MOBILE, {}).get('audio_url') + if mobile_url: + formats.append({ + 'url': mobile_url, + 'format_id': '%s-audio' % MOBILE, + 'preference': preference_key(MOBILE), + }) + + self._sort_formats(formats) + + thumbnail = coub.get('picture') + duration = float_or_none(coub.get('duration')) + timestamp = parse_iso8601(coub.get('published_at') or coub.get('created_at')) + uploader = coub.get('channel', {}).get('title') + uploader_id = coub.get('channel', {}).get('permalink') + + view_count = int_or_none(coub.get('views_count') or coub.get('views_increase_count')) + like_count = int_or_none(coub.get('likes_count')) + repost_count = int_or_none(coub.get('recoubs_count')) + comment_count = int_or_none(coub.get('comments_count')) + + age_restricted = coub.get('age_restricted', coub.get('age_restricted_by_admin')) + if age_restricted is not None: + age_limit = 18 if age_restricted is True else 0 + else: + age_limit = None + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'view_count': view_count, + 'like_count': like_count, + 'repost_count': repost_count, + 'comment_count': comment_count, + 'age_limit': age_limit, + 'formats': formats, + } diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py new file mode 100644 index 000000000..b60a1d813 --- /dev/null +++ b/youtube_dl/extractor/dailymail.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + determine_protocol, +) + + +class DailyMailIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/video/[^/]+/video-(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.dailymail.co.uk/video/sciencetech/video-1288527/Turn-video-impressionist-masterpiece.html', + 'md5': '2f639d446394f53f3a33658b518b6615', + 'info_dict': { + 'id': '1288527', + 'ext': 'mp4', + 'title': 'Turn any video into an impressionist masterpiece', + 'description': 'md5:88ddbcb504367987b2708bb38677c9d2', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_data = self._parse_json(self._search_regex( + r"data-opts='({.+?})'", webpage, 'video data'), video_id) + title = video_data['title'] + video_sources = self._download_json(video_data.get( + 'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id) + + formats = [] + for rendition in video_sources['renditions']: + rendition_url = rendition.get('url') + if not rendition_url: + continue + tbr = int_or_none(rendition.get('encodingRate'), 1000) + container = rendition.get('videoContainer') + is_hls = container == 'M2TS' + protocol = 'm3u8_native' if is_hls else determine_protocol({'url': rendition_url}) + formats.append({ + 'format_id': ('hls' if is_hls else protocol) + ('-%d' % tbr if tbr else ''), + 'url': rendition_url, + 'width': int_or_none(rendition.get('frameWidth')), + 'height': int_or_none(rendition.get('frameHeight')), + 'tbr': tbr, + 'vcodec': rendition.get('videoCodec'), + 'container': container, + 'protocol': protocol, + 'ext': 'mp4' if is_hls else None, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('descr'), + 'thumbnail': video_data.get('poster') or video_data.get('thumbnail'), + 'formats': formats, + } diff --git a/youtube_dl/extractor/dw.py b/youtube_dl/extractor/dw.py index ae7c571bd..0f0f0b8d3 100644 --- a/youtube_dl/extractor/dw.py +++ b/youtube_dl/extractor/dw.py @@ -2,13 +2,16 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + unified_strdate, +) from ..compat import compat_urlparse class DWIE(InfoExtractor): IE_NAME = 'dw' - _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+av-(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+(?:av|e)-(?P<id>\d+)' _TESTS = [{ # video 'url': 'http://www.dw.com/en/intelligent-light/av-19112290', @@ -31,6 +34,16 @@ class DWIE(InfoExtractor): 'description': 'md5:bc9ca6e4e063361e21c920c53af12405', 'upload_date': '20160311', } + }, { + 'url': 'http://www.dw.com/en/documentaries-welcome-to-the-90s-2016-05-21/e-19220158-9798', + 'md5': '56b6214ef463bfb9a3b71aeb886f3cf1', + 'info_dict': { + 'id': '19274438', + 'ext': 'mp4', + 'title': 'Welcome to the 90s – Hip Hop', + 'description': 'Welcome to the 90s - The Golden Decade of Hip Hop', + 'upload_date': '20160521', + }, }] def _real_extract(self, url): @@ -38,6 +51,7 @@ class DWIE(InfoExtractor): webpage = self._download_webpage(url, media_id) hidden_inputs = self._hidden_inputs(webpage) title = hidden_inputs['media_title'] + media_id = hidden_inputs.get('media_id') or media_id if hidden_inputs.get('player_type') == 'video' and hidden_inputs.get('stream_file') == '1': formats = self._extract_smil_formats( @@ -49,13 +63,20 @@ class DWIE(InfoExtractor): else: formats = [{'url': hidden_inputs['file_name']}] + upload_date = hidden_inputs.get('display_date') + if not upload_date: + upload_date = self._html_search_regex( + r'<span[^>]+class="date">([0-9.]+)\s*\|', webpage, + 'upload date', default=None) + upload_date = unified_strdate(upload_date) + return { 'id': media_id, 'title': title, 'description': self._og_search_description(webpage), 'thumbnail': hidden_inputs.get('preview_image'), 'duration': int_or_none(hidden_inputs.get('file_duration')), - 'upload_date': hidden_inputs.get('display_date'), + 'upload_date': upload_date, 'formats': formats, } diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index e006921ec..ac5d0fe24 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -11,8 +11,8 @@ from ..utils import ( class EpornerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\d+)/(?P<display_id>[\w-]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\w+)/(?P<display_id>[\w-]+)' + _TESTS = [{ 'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/', 'md5': '39d486f046212d8e1b911c52ab4691f8', 'info_dict': { @@ -23,8 +23,12 @@ class EpornerIE(InfoExtractor): 'duration': 1838, 'view_count': int, 'age_limit': 18, - } - } + }, + }, { + # New (May 2016) URL layout + 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0/Star-Wars-XXX-Parody/', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py index db4b263bc..66c08bec4 100644 --- a/youtube_dl/extractor/espn.py +++ b/youtube_dl/extractor/espn.py @@ -8,6 +8,7 @@ class ESPNIE(InfoExtractor): _VALID_URL = r'https?://espn\.go\.com/(?:[^/]+/)*(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://espn.go.com/video/clip?id=10365079', + 'md5': '60e5d097a523e767d06479335d1bdc58', 'info_dict': { 'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG', 'ext': 'mp4', @@ -15,21 +16,22 @@ class ESPNIE(InfoExtractor): 'description': None, }, 'params': { - # m3u8 download 'skip_download': True, }, + 'add_ie': ['OoyalaExternal'], }, { # intl video, from http://www.espnfc.us/video/mls-highlights/150/video/2743663/must-see-moments-best-of-the-mls-season 'url': 'http://espn.go.com/video/clip?id=2743663', + 'md5': 'f4ac89b59afc7e2d7dbb049523df6768', 'info_dict': { 'id': '50NDFkeTqRHB0nXBOK-RGdSG5YQPuxHg', 'ext': 'mp4', 'title': 'Must-See Moments: Best of the MLS season', }, 'params': { - # m3u8 download 'skip_download': True, }, + 'add_ie': ['OoyalaExternal'], }, { 'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079', 'only_matching': True, diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 1f95530a5..aa98782a5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -3,6 +3,10 @@ from __future__ import unicode_literals from .abc import ABCIE from .abc7news import Abc7NewsIE +from .abcnews import ( + AbcNewsIE, + AbcNewsVideoIE, +) from .academicearth import AcademicEarthCourseIE from .acast import ( ACastIE, @@ -53,6 +57,7 @@ from .arte import ( ArteTVDDCIE, ArteTVMagazineIE, ArteTVEmbedIE, + ArteTVPlaylistIE, ) from .atresplayer import AtresPlayerIE from .atttechchannel import ATTTechChannelIE @@ -76,6 +81,7 @@ from .bigflix import BigflixIE from .bild import BildIE from .bilibili import BiliBiliIE from .biobiochiletv import BioBioChileTVIE +from .biqle import BIQLEIE from .bleacherreport import ( BleacherReportIE, BleacherReportCMSIE, @@ -107,6 +113,7 @@ from .cbc import ( CBCPlayerIE, ) from .cbs import CBSIE +from .cbslocal import CBSLocalIE from .cbsinteractive import CBSInteractiveIE from .cbsnews import ( CBSNewsIE, @@ -124,7 +131,6 @@ from .chirbit import ( ChirbitProfileIE, ) from .cinchcast import CinchcastIE -from .cinemassacre import CinemassacreIE from .cliprs import ClipRsIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE @@ -139,7 +145,7 @@ from .cnn import ( CNNBlogsIE, CNNArticleIE, ) -from .collegehumor import CollegeHumorIE +from .coub import CoubIE from .collegerama import CollegeRamaIE from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE from .comcarcoff import ComCarCoffIE @@ -158,6 +164,7 @@ from .cspan import CSpanIE from .ctsnews import CtsNewsIE from .cultureunplugged import CultureUnpluggedIE from .cwtv import CWTVIE +from .dailymail import DailyMailIE from .dailymotion import ( DailymotionIE, DailymotionPlaylistIE, @@ -227,6 +234,7 @@ from .everyonesmixtape import EveryonesMixtapeIE from .exfm import ExfmIE from .expotv import ExpoTVIE from .extremetube import ExtremeTubeIE +from .eyedotv import EyedoTVIE from .facebook import FacebookIE from .faz import FazIE from .fc2 import FC2IE @@ -239,6 +247,7 @@ from .fktv import FKTVIE from .flickr import FlickrIE from .folketinget import FolketingetIE from .footyroom import FootyRoomIE +from .formula1 import Formula1IE from .fourtube import FourTubeIE from .fox import FOXIE from .foxgay import FoxgayIE @@ -366,6 +375,7 @@ from .kuwo import ( ) from .la7 import LA7IE from .laola1tv import Laola1TvIE +from .learnr import LearnrIE from .lecture2go import Lecture2GoIE from .lemonde import LemondeIE from .leeco import ( @@ -373,6 +383,7 @@ from .leeco import ( LePlaylistIE, LetvCloudIE, ) +from .libraryofcongress import LibraryOfCongressIE from .libsyn import LibsynIE from .lifenews import ( LifeNewsIE, @@ -383,6 +394,7 @@ from .limelight import ( LimelightChannelIE, LimelightChannelListIE, ) +from .litv import LiTVIE from .liveleak import LiveLeakIE from .livestream import ( LivestreamIE, @@ -390,6 +402,7 @@ from .livestream import ( LivestreamShortenerIE, ) from .lnkgo import LnkGoIE +from .localnews8 import LocalNews8IE from .lovehomeporn import LoveHomePornIE from .lrt import LRTIE from .lynda import ( @@ -407,6 +420,10 @@ from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mgoon import MgoonIE from .mgtv import MGTVIE +from .microsoftvirtualacademy import ( + MicrosoftVirtualAcademyIE, + MicrosoftVirtualAcademyCourseIE, +) from .minhateca import MinhatecaIE from .ministrygrid import MinistryGridIE from .minoto import MinotoIE @@ -561,7 +578,10 @@ from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE from .people import PeopleIE -from .periscope import PeriscopeIE +from .periscope import ( + PeriscopeIE, + PeriscopeUserIE, +) from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE @@ -602,6 +622,10 @@ from .qqmusic import ( QQMusicPlaylistIE, ) from .r7 import R7IE +from .radiocanada import ( + RadioCanadaIE, + RadioCanadaAudioVideoIE, +) from .radiode import RadioDeIE from .radiojavan import RadioJavanIE from .radiobremen import RadioBremenIE @@ -615,8 +639,12 @@ from .rds import RDSIE from .redtube import RedTubeIE from .regiotv import RegioTVIE from .restudy import RestudyIE +from .reuters import ReutersIE from .reverbnation import ReverbNationIE -from .revision3 import Revision3IE +from .revision3 import ( + Revision3EmbedIE, + Revision3IE, +) from .rice import RICEIE from .ringtv import RingTVIE from .ro220 import Ro220IE @@ -655,7 +683,9 @@ from .screencast import ScreencastIE from .screencastomatic import ScreencastOMaticIE from .screenjunkies import ScreenJunkiesIE from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE +from .seeker import SeekerIE from .senateisvp import SenateISVPIE +from .sendtonews import SendtoNewsIE from .servingsys import ServingSysIE from .sexu import SexuIE from .shahid import ShahidIE @@ -758,6 +788,7 @@ from .thesixtyone import TheSixtyOneIE from .thestar import TheStarIE from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE +from .threeqsdn import ThreeQSDNIE from .tinypic import TinyPicIE from .tlc import TlcDeIE from .tmz import ( @@ -810,7 +841,10 @@ from .tvc import ( ) from .tvigle import TvigleIE from .tvland import TVLandIE -from .tvp import TvpIE, TvpSeriesIE +from .tvp import ( + TVPIE, + TVPSeriesIE, +) from .tvplay import TVPlayIE from .tweakers import TweakersIE from .twentyfourvideo import TwentyFourVideoIE @@ -825,7 +859,6 @@ from .twitch import ( TwitchVodIE, TwitchProfileIE, TwitchPastBroadcastsIE, - TwitchBookmarksIE, TwitchStreamIE, ) from .twitter import ( @@ -843,7 +876,10 @@ from .unistra import UnistraIE from .urort import UrortIE from .usatoday import USATodayIE from .ustream import UstreamIE, UstreamChannelIE -from .ustudio import UstudioIE +from .ustudio import ( + UstudioIE, + UstudioEmbedIE, +) from .varzesh3 import Varzesh3IE from .vbox7 import Vbox7IE from .veehd import VeeHDIE @@ -875,6 +911,7 @@ from .videomore import ( ) from .videopremium import VideoPremiumIE from .videott import VideoTtIE +from .vidio import VidioIE from .vidme import ( VidmeIE, VidmeUserIE, @@ -922,13 +959,15 @@ from .vube import VubeIE from .vuclip import VuClipIE from .vulture import VultureIE from .walla import WallaIE -from .washingtonpost import WashingtonPostIE +from .washingtonpost import ( + WashingtonPostIE, + WashingtonPostArticleIE, +) from .wat import WatIE from .watchindianporn import WatchIndianPornIE from .wdr import ( WDRIE, WDRMobileIE, - WDRMausIE, ) from .webofstories import ( WebOfStoriesIE, @@ -975,7 +1014,10 @@ from .yesjapan import YesJapanIE from .yinyuetai import YinYueTaiIE from .ynet import YnetIE from .youjizz import YouJizzIE -from .youku import YoukuIE +from .youku import ( + YoukuIE, + YoukuShowIE, +) from .youporn import YouPornIE from .yourupload import YourUploadIE from .youtube import ( diff --git a/youtube_dl/extractor/eyedotv.py b/youtube_dl/extractor/eyedotv.py new file mode 100644 index 000000000..2f3035147 --- /dev/null +++ b/youtube_dl/extractor/eyedotv.py @@ -0,0 +1,64 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + xpath_text, + parse_duration, + ExtractorError, +) + + +class EyedoTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?eyedo\.tv/[^/]+/(?:#!/)?Live/Detail/(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://www.eyedo.tv/en-US/#!/Live/Detail/16301', + 'md5': 'ba14f17995cdfc20c36ba40e21bf73f7', + 'info_dict': { + 'id': '16301', + 'ext': 'mp4', + 'title': 'Journée du conseil scientifique de l\'Afnic 2015', + 'description': 'md5:4abe07293b2f73efc6e1c37028d58c98', + 'uploader': 'Afnic Live', + 'uploader_id': '8023', + } + } + _ROOT_URL = 'http://live.eyedo.net:1935/' + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_xml('http://eyedo.tv/api/live/GetLive/%s' % video_id, video_id) + + def _add_ns(path): + return self._xpath_ns(path, 'http://schemas.datacontract.org/2004/07/EyeDo.Core.Implementation.Web.ViewModels.Api') + + title = xpath_text(video_data, _add_ns('Titre'), 'title', True) + state_live_code = xpath_text(video_data, _add_ns('StateLiveCode'), 'title', True) + if state_live_code == 'avenir': + raise ExtractorError( + '%s said: We\'re sorry, but this video is not yet available.' % self.IE_NAME, + expected=True) + + is_live = state_live_code == 'live' + m3u8_url = None + # http://eyedo.tv/Content/Html5/Scripts/html5view.js + if is_live: + if xpath_text(video_data, 'Cdn') == 'true': + m3u8_url = 'http://rrr.sz.xlcdn.com/?account=eyedo&file=A%s&type=live&service=wowza&protocol=http&output=playlist.m3u8' % video_id + else: + m3u8_url = self._ROOT_URL + 'w/%s/eyedo_720p/playlist.m3u8' % video_id + else: + m3u8_url = self._ROOT_URL + 'replay-w/%s/mp4:%s.mp4/playlist.m3u8' % (video_id, video_id) + + return { + 'id': video_id, + 'title': title, + 'formats': self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8' if is_live else 'm3u8_native'), + 'description': xpath_text(video_data, _add_ns('Description')), + 'duration': parse_duration(xpath_text(video_data, _add_ns('Duration'))), + 'uploader': xpath_text(video_data, _add_ns('Createur')), + 'uploader_id': xpath_text(video_data, _add_ns('CreateurId')), + 'chapter': xpath_text(video_data, _add_ns('ChapitreTitre')), + 'chapter_id': xpath_text(video_data, _add_ns('ChapitreId')), + } diff --git a/youtube_dl/extractor/fczenit.py b/youtube_dl/extractor/fczenit.py index f1f150ef2..8d1010b88 100644 --- a/youtube_dl/extractor/fczenit.py +++ b/youtube_dl/extractor/fczenit.py @@ -1,20 +1,19 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import compat_urlparse class FczenitIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/gl(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/(?P<id>[0-9]+)' _TEST = { - 'url': 'http://fc-zenit.ru/video/gl6785/', - 'md5': '458bacc24549173fe5a5aa29174a5606', + 'url': 'http://fc-zenit.ru/video/41044/', + 'md5': '0e3fab421b455e970fa1aa3891e57df0', 'info_dict': { - 'id': '6785', + 'id': '41044', 'ext': 'mp4', - 'title': '«Зенит-ТВ»: как Олег Шатов играл против «Урала»', + 'title': 'Так пишется история: казанский разгром ЦСКА на «Зенит-ТВ»', }, } @@ -22,15 +21,23 @@ class FczenitIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_title = self._html_search_regex(r'<div class=\"photoalbum__title\">([^<]+)', webpage, 'title') + video_title = self._html_search_regex( + r'<[^>]+class=\"photoalbum__title\">([^<]+)', webpage, 'title') + + video_items = self._parse_json(self._search_regex( + r'arrPath\s*=\s*JSON\.parse\(\'(.+)\'\)', webpage, 'video items'), + video_id) - bitrates_raw = self._html_search_regex(r'bitrates:.*\n(.*)\]', webpage, 'video URL') - bitrates = re.findall(r'url:.?\'(.+?)\'.*?bitrate:.?([0-9]{3}?)', bitrates_raw) + def merge_dicts(*dicts): + ret = {} + for a_dict in dicts: + ret.update(a_dict) + return ret formats = [{ - 'url': furl, - 'tbr': tbr, - } for furl, tbr in bitrates] + 'url': compat_urlparse.urljoin(url, video_url), + 'tbr': int(tbr), + } for tbr, video_url in merge_dicts(*video_items).items()] self._sort_formats(formats) diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 0a3de1498..a8e1bf42a 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -24,13 +24,28 @@ class FlickrIE(InfoExtractor): 'upload_date': '20110423', 'uploader_id': '10922353@N03', 'uploader': 'Forest Wander', + 'uploader_url': 'https://www.flickr.com/photos/forestwander-nature-pictures/', 'comment_count': int, 'view_count': int, 'tags': list, + 'license': 'Attribution-ShareAlike', } } - _API_BASE_URL = 'https://api.flickr.com/services/rest?' + # https://help.yahoo.com/kb/flickr/SLN25525.html + _LICENSES = { + '0': 'All Rights Reserved', + '1': 'Attribution-NonCommercial-ShareAlike', + '2': 'Attribution-NonCommercial', + '3': 'Attribution-NonCommercial-NoDerivs', + '4': 'Attribution', + '5': 'Attribution-ShareAlike', + '6': 'Attribution-NoDerivs', + '7': 'No known copyright restrictions', + '8': 'United States government work', + '9': 'Public Domain Dedication (CC0)', + '10': 'Public Domain Work', + } def _call_api(self, method, video_id, api_key, note, secret=None): query = { @@ -75,6 +90,9 @@ class FlickrIE(InfoExtractor): self._sort_formats(formats) owner = video_info.get('owner', {}) + uploader_id = owner.get('nsid') + uploader_path = owner.get('path_alias') or uploader_id + uploader_url = 'https://www.flickr.com/photos/%s/' % uploader_path if uploader_path else None return { 'id': video_id, @@ -83,11 +101,13 @@ class FlickrIE(InfoExtractor): 'formats': formats, 'timestamp': int_or_none(video_info.get('dateuploaded')), 'duration': int_or_none(video_info.get('video', {}).get('duration')), - 'uploader_id': owner.get('nsid'), + 'uploader_id': uploader_id, 'uploader': owner.get('realname'), + 'uploader_url': uploader_url, 'comment_count': int_or_none(video_info.get('comments', {}).get('_content')), 'view_count': int_or_none(video_info.get('views')), - 'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])] + 'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])], + 'license': self._LICENSES.get(video_info.get('license')), } else: raise ExtractorError('not a video', expected=True) diff --git a/youtube_dl/extractor/formula1.py b/youtube_dl/extractor/formula1.py new file mode 100644 index 000000000..322c41e5a --- /dev/null +++ b/youtube_dl/extractor/formula1.py @@ -0,0 +1,26 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class Formula1IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?formula1\.com/content/fom-website/en/video/\d{4}/\d{1,2}/(?P<id>.+?)\.html' + _TEST = { + 'url': 'http://www.formula1.com/content/fom-website/en/video/2016/5/Race_highlights_-_Spain_2016.html', + 'md5': '8c79e54be72078b26b89e0e111c0502b', + 'info_dict': { + 'id': 'JvYXJpMzE6pArfHWm5ARp5AiUmD-gibV', + 'ext': 'flv', + 'title': 'Race highlights - Spain 2016', + }, + 'add_ie': ['Ooyala'], + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + ooyala_embed_code = self._search_regex( + r'data-videoid="([^"]+)"', webpage, 'ooyala embed code') + return self.url_result( + 'ooyala:%s' % ooyala_embed_code, 'Ooyala', ooyala_embed_code) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 0f1eb7fa6..90575ab0e 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -61,6 +61,9 @@ from .jwplatform import JWPlatformIE from .digiteka import DigitekaIE from .instagram import InstagramIE from .liveleak import LiveLeakIE +from .threeqsdn import ThreeQSDNIE +from .theplatform import ThePlatformIE +from .vessel import VesselIE class GenericIE(InfoExtractor): @@ -716,15 +719,18 @@ class GenericIE(InfoExtractor): }, # Wistia embed { - 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', - 'md5': '8788b683c777a5cf25621eaf286d0c23', + 'url': 'http://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', + 'md5': '1953f3a698ab51cfc948ed3992a0b7ff', 'info_dict': { - 'id': '1cfaf6b7ea', + 'id': '6e2wtrbdaf', 'ext': 'mov', - 'title': 'md5:51364a8d3d009997ba99656004b5e20d', - 'duration': 643.0, - 'filesize': 182808282, - 'uploader': 'education-portal.com', + 'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england', + 'description': 'a Paywall Videos video from Remilon', + 'duration': 644.072, + 'uploader': 'study.com', + 'timestamp': 1459678540, + 'upload_date': '20160403', + 'filesize': 24687186, }, }, { @@ -733,13 +739,29 @@ class GenericIE(InfoExtractor): 'info_dict': { 'id': 'uxjb0lwrcz', 'ext': 'mp4', - 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks', + 'title': 'Conversation about Hexagonal Rails Part 1', 'description': 'a Martin Fowler video from ThoughtWorks', 'duration': 1715.0, 'uploader': 'thoughtworks.wistia.com', - 'upload_date': '20140603', 'timestamp': 1401832161, + 'upload_date': '20140603', + }, + }, + # Wistia standard embed (async) + { + 'url': 'https://www.getdrip.com/university/brennan-dunn-drip-workshop/', + 'info_dict': { + 'id': '807fafadvk', + 'ext': 'mp4', + 'title': 'Drip Brennan Dunn Workshop', + 'description': 'a JV Webinars video from getdrip-1', + 'duration': 4986.95, + 'timestamp': 1463607249, + 'upload_date': '20160518', }, + 'params': { + 'skip_download': True, + } }, # Soundcloud embed { @@ -763,6 +785,19 @@ class GenericIE(InfoExtractor): 'title': 'Rosetta #CometLanding webcast HL 10', } }, + # Another Livestream embed, without 'new.' in URL + { + 'url': 'https://www.freespeech.org/', + 'info_dict': { + 'id': '123537347', + 'ext': 'mp4', + 'title': 're:^FSTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + }, + 'params': { + # Live stream + 'skip_download': True, + }, + }, # LazyYT { 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986', @@ -847,18 +882,6 @@ class GenericIE(InfoExtractor): 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !', } }, - # Kaltura embed - { - 'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15', - 'info_dict': { - 'id': '1_eergr3h1', - 'ext': 'mp4', - 'upload_date': '20150226', - 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com', - 'timestamp': int, - 'title': 'John Carlson Postgame 2/25/15', - }, - }, # Kaltura embed (different embed code) { 'url': 'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014', @@ -884,6 +907,19 @@ class GenericIE(InfoExtractor): 'uploader_id': 'echojecka', }, }, + # Kaltura embed with single quotes + { + 'url': 'http://fod.infobase.com/p_ViewPlaylist.aspx?AssignmentID=NUN8ZY', + 'info_dict': { + 'id': '0_izeg5utt', + 'ext': 'mp4', + 'title': '35871', + 'timestamp': 1355743100, + 'upload_date': '20121217', + 'uploader_id': 'batchUser', + }, + 'add_ie': ['Kaltura'], + }, # Eagle.Platform embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', @@ -998,14 +1034,18 @@ class GenericIE(InfoExtractor): }, # UDN embed { - 'url': 'http://www.udn.com/news/story/7314/822787', + 'url': 'https://video.udn.com/news/300346', 'md5': 'fd2060e988c326991037b9aff9df21a6', 'info_dict': { 'id': '300346', 'ext': 'mp4', 'title': '中一中男師變性 全校師生力挺', 'thumbnail': 're:^https?://.*\.jpg$', - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, # Ooyala embed { @@ -1173,6 +1213,16 @@ class GenericIE(InfoExtractor): 'uploader': 'Lake8737', } }, + # Duplicated embedded video URLs + { + 'url': 'http://www.hudl.com/athlete/2538180/highlights/149298443', + 'info_dict': { + 'id': '149298443_480_16c25b74_2', + 'ext': 'mp4', + 'title': 'vs. Blue Orange Spring Game', + 'uploader': 'www.hudl.com', + }, + }, ] def report_following_redirect(self, new_url): @@ -1427,7 +1477,8 @@ class GenericIE(InfoExtractor): # Site Name | Video Title # Video Title - Tagline | Site Name # and so on and so forth; it's just not practical - video_title = self._html_search_regex( + video_title = self._og_search_title( + webpage, default=None) or self._html_search_regex( r'(?s)<title>(.*?)</title>', webpage, 'video title', default='video') @@ -1445,6 +1496,9 @@ class GenericIE(InfoExtractor): video_uploader = self._search_regex( r'^(?:https?://)?([^/]*)/.*', url, 'video uploader') + video_description = self._og_search_description(webpage, default=None) + video_thumbnail = self._og_search_thumbnail(webpage, default=None) + # Helper method def _playlist_from_matches(matches, getter=None, ie=None): urlrs = orderedSet( @@ -1475,6 +1529,16 @@ class GenericIE(InfoExtractor): if bc_urls: return _playlist_from_matches(bc_urls, ie='BrightcoveNew') + # Look for ThePlatform embeds + tp_urls = ThePlatformIE._extract_urls(webpage) + if tp_urls: + return _playlist_from_matches(tp_urls, ie='ThePlatform') + + # Look for Vessel embeds + vessel_urls = VesselIE._extract_urls(webpage) + if vessel_urls: + return _playlist_from_matches(vessel_urls, ie=VesselIE.ie_key()) + # Look for embedded rtl.nl player matches = re.findall( r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', @@ -1543,21 +1607,26 @@ class GenericIE(InfoExtractor): 'url': embed_url, 'ie_key': 'Wistia', 'uploader': video_uploader, - 'title': video_title, - 'id': video_id, } match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage) if match: return { '_type': 'url_transparent', - 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')), + 'url': 'wistia:%s' % match.group('id'), 'ie_key': 'Wistia', 'uploader': video_uploader, - 'title': video_title, - 'id': match.group('id') } + match = re.search( + r'''(?sx) + <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*? + <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2 + ''', webpage) + if match: + return self.url_result(self._proto_relative_url( + 'wistia:%s' % match.group('id')), 'Wistia') + # Look for SVT player svt_url = SVTIE._extract_url(webpage) if svt_url: @@ -1833,7 +1902,7 @@ class GenericIE(InfoExtractor): return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast') mobj = re.search( - r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"', + r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"', webpage) if mobj is not None: return self.url_result(mobj.group('url'), 'Livestream') @@ -1845,7 +1914,7 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url'), 'Zapiks') # Look for Kaltura embeds - mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_?[Ii]d'\s*:\s*'(?P<id>[^']+)',", webpage) or + mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?(?P<q1>['\"])wid(?P=q1)\s*:\s*(?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*?(?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s*(?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4),", webpage) or re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', webpage)) if mobj is not None: return self.url_result(smuggle_url( @@ -1983,6 +2052,19 @@ class GenericIE(InfoExtractor): if liveleak_url: return self.url_result(liveleak_url, 'LiveLeak') + # Look for 3Q SDN embeds + threeqsdn_url = ThreeQSDNIE._extract_url(webpage) + if threeqsdn_url: + return { + '_type': 'url_transparent', + 'ie_key': ThreeQSDNIE.ie_key(), + 'url': self._proto_relative_url(threeqsdn_url), + 'title': video_title, + 'description': video_description, + 'thumbnail': video_thumbnail, + 'uploader': video_uploader, + } + def check_video(vurl): if YoutubeIE.suitable(vurl): return True @@ -2063,7 +2145,7 @@ class GenericIE(InfoExtractor): raise UnsupportedError(url) entries = [] - for video_url in found: + for video_url in orderedSet(found): video_url = unescapeHTML(video_url) video_url = video_url.replace('\\/', '/') video_url = compat_urlparse.urljoin(url, video_url) diff --git a/youtube_dl/extractor/groupon.py b/youtube_dl/extractor/groupon.py index f6b69662b..a6da90931 100644 --- a/youtube_dl/extractor/groupon.py +++ b/youtube_dl/extractor/groupon.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class GrouponIE(InfoExtractor): - _VALID_URL = r'https?://www\.groupon\.com/deals/(?P<id>[^?#]+)' + _VALID_URL = r'https?://(?:www\.)?groupon\.com/deals/(?P<id>[^/?#&]+)' _TEST = { 'url': 'https://www.groupon.com/deals/bikram-yoga-huntington-beach-2#ooid=tubGNycTo_9Uxg82uESj4i61EYX8nyuf', @@ -14,17 +14,27 @@ class GrouponIE(InfoExtractor): 'description': 'Studio kept at 105 degrees and 40% humidity with anti-microbial and anti-slip Flotex flooring; certified instructors', }, 'playlist': [{ + 'md5': '42428ce8a00585f9bc36e49226eae7a1', 'info_dict': { - 'id': 'tubGNycTo_9Uxg82uESj4i61EYX8nyuf', - 'ext': 'flv', - 'title': 'Bikram Yoga Huntington Beach | Orange County', + 'id': 'fk6OhWpXgIQ', + 'ext': 'mp4', + 'title': 'Bikram Yoga Huntington Beach | Orange County !tubGNycTo@9Uxg82uESj4i61EYX8nyuf', 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', - 'duration': 44.961, + 'duration': 45, + 'upload_date': '20160405', + 'uploader_id': 'groupon', + 'uploader': 'Groupon', }, + 'add_ie': ['Youtube'], }], 'params': { - 'skip_download': 'HDS', - } + 'skip_download': True, + }, + } + + _PROVIDERS = { + 'ooyala': ('ooyala:%s', 'Ooyala'), + 'youtube': ('%s', 'Youtube'), } def _real_extract(self, url): @@ -36,12 +46,17 @@ class GrouponIE(InfoExtractor): videos = payload['carousel'].get('dealVideos', []) entries = [] for v in videos: - if v.get('provider') != 'OOYALA': + provider = v.get('provider') + video_id = v.get('media') or v.get('id') or v.get('baseURL') + if not provider or not video_id: + continue + url_pattern, ie_key = self._PROVIDERS.get(provider.lower()) + if not url_pattern: self.report_warning( '%s: Unsupported video provider %s, skipping video' % - (playlist_id, v.get('provider'))) + (playlist_id, provider)) continue - entries.append(self.url_result('ooyala:%s' % v['media'])) + entries.append(self.url_result(url_pattern % video_id, ie_key)) return { '_type': 'playlist', diff --git a/youtube_dl/extractor/hearthisat.py b/youtube_dl/extractor/hearthisat.py index 7d8698655..256453882 100644 --- a/youtube_dl/extractor/hearthisat.py +++ b/youtube_dl/extractor/hearthisat.py @@ -7,6 +7,7 @@ from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( HEADRequest, + KNOWN_EXTENSIONS, sanitized_Request, str_to_int, urlencode_postdata, @@ -17,7 +18,7 @@ from ..utils import ( class HearThisAtIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?hearthis\.at/(?P<artist>[^/]+)/(?P<title>[A-Za-z0-9\-]+)/?$' _PLAYLIST_URL = 'https://hearthis.at/playlist.php' - _TEST = { + _TESTS = [{ 'url': 'https://hearthis.at/moofi/dr-kreep', 'md5': 'ab6ec33c8fed6556029337c7885eb4e0', 'info_dict': { @@ -26,7 +27,7 @@ class HearThisAtIE(InfoExtractor): 'title': 'Moofi - Dr. Kreep', 'thumbnail': 're:^https?://.*\.jpg$', 'timestamp': 1421564134, - 'description': 'Creepy Patch. Mutable Instruments Braids Vowel + Formant Mode.', + 'description': 'Listen to Dr. Kreep by Moofi on hearthis.at - Modular, Eurorack, Mutable Intruments Braids, Valhalla-DSP', 'upload_date': '20150118', 'comment_count': int, 'view_count': int, @@ -34,7 +35,25 @@ class HearThisAtIE(InfoExtractor): 'duration': 71, 'categories': ['Experimental'], } - } + }, { + # 'download' link redirects to the original webpage + 'url': 'https://hearthis.at/twitchsf/dj-jim-hopkins-totally-bitchin-80s-dance-mix/', + 'md5': '5980ceb7c461605d30f1f039df160c6e', + 'info_dict': { + 'id': '811296', + 'ext': 'mp3', + 'title': 'TwitchSF - DJ Jim Hopkins - Totally Bitchin\' 80\'s Dance Mix!', + 'description': 'Listen to DJ Jim Hopkins - Totally Bitchin\' 80\'s Dance Mix! by TwitchSF on hearthis.at - Dance', + 'upload_date': '20160328', + 'timestamp': 1459186146, + 'thumbnail': 're:^https?://.*\.jpg$', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + 'duration': 4360, + 'categories': ['Dance'], + }, + }] def _real_extract(self, url): m = re.match(self._VALID_URL, url) @@ -90,13 +109,14 @@ class HearThisAtIE(InfoExtractor): ext_handle = self._request_webpage( ext_req, display_id, note='Determining extension') ext = urlhandle_detect_ext(ext_handle) - formats.append({ - 'format_id': 'download', - 'vcodec': 'none', - 'ext': ext, - 'url': download_url, - 'preference': 2, # Usually better quality - }) + if ext in KNOWN_EXTENSIONS: + formats.append({ + 'format_id': 'download', + 'vcodec': 'none', + 'ext': ext, + 'url': download_url, + 'preference': 2, # Usually better quality + }) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/howcast.py b/youtube_dl/extractor/howcast.py index e8f51e545..7e36b85ad 100644 --- a/youtube_dl/extractor/howcast.py +++ b/youtube_dl/extractor/howcast.py @@ -8,7 +8,7 @@ class HowcastIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?howcast\.com/videos/(?P<id>\d+)' _TEST = { 'url': 'http://www.howcast.com/videos/390161-How-to-Tie-a-Square-Knot-Properly', - 'md5': '8b743df908c42f60cf6496586c7f12c3', + 'md5': '7d45932269a288149483144f01b99789', 'info_dict': { 'id': '390161', 'ext': 'mp4', @@ -19,9 +19,9 @@ class HowcastIE(InfoExtractor): 'duration': 56.823, }, 'params': { - # m3u8 download 'skip_download': True, }, + 'add_ie': ['Ooyala'], } def _real_extract(self, url): diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 8bed8ccd0..3a2b7cec5 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -1,10 +1,10 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor from ..utils import ( + mimetype2ext, qualities, ) @@ -12,9 +12,9 @@ from ..utils import ( class ImdbIE(InfoExtractor): IE_NAME = 'imdb' IE_DESC = 'Internet Movie Database trailers' - _VALID_URL = r'https?://(?:www|m)\.imdb\.com/video/imdb/vi(?P<id>\d+)' + _VALID_URL = r'https?://(?:www|m)\.imdb\.com/video/[^/]+/vi(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.imdb.com/video/imdb/vi2524815897', 'info_dict': { 'id': '2524815897', @@ -22,7 +22,10 @@ class ImdbIE(InfoExtractor): 'title': 'Ice Age: Continental Drift Trailer (No. 2) - IMDb', 'description': 'md5:9061c2219254e5d14e03c25c98e96a81', } - } + }, { + 'url': 'http://www.imdb.com/video/_/vi2524815897', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -48,13 +51,27 @@ class ImdbIE(InfoExtractor): json_data = self._search_regex( r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>', format_page, 'json data', flags=re.DOTALL) - info = json.loads(json_data) - format_info = info['videoPlayerObject']['video'] - f_id = format_info['ffname'] + info = self._parse_json(json_data, video_id, fatal=False) + if not info: + continue + format_info = info.get('videoPlayerObject', {}).get('video', {}) + if not format_info: + continue + video_info_list = format_info.get('videoInfoList') + if not video_info_list or not isinstance(video_info_list, list): + continue + video_info = video_info_list[0] + if not video_info or not isinstance(video_info, dict): + continue + video_url = video_info.get('videoUrl') + if not video_url: + continue + format_id = format_info.get('ffname') formats.append({ - 'format_id': f_id, - 'url': format_info['videoInfoList'][0]['videoUrl'], - 'quality': quality(f_id), + 'format_id': format_id, + 'url': video_url, + 'ext': mimetype2ext(video_info.get('videoMimeType')), + 'quality': quality(format_id), }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index ffb8008ce..ddcb3c916 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -505,7 +505,10 @@ class IqiyiIE(InfoExtractor): 'enc': md5_text(enc_key + tail), 'qyid': _uuid, 'tn': random.random(), - 'um': 0, + # In iQiyi's flash player, um is set to 1 if there's a logged user + # Some 1080P formats are only available with a logged user. + # Here force um=1 to trick the iQiyi server + 'um': 1, 'authkey': md5_text(md5_text('') + tail), 'k_tag': 1, } diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index 8a5e562db..fa6f335e1 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -5,33 +5,50 @@ import re from .common import InfoExtractor from ..utils import ( + determine_ext, float_or_none, int_or_none, ) class JWPlatformBaseIE(InfoExtractor): - def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True): + def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True, m3u8_id=None, rtmp_params=None): video_data = jwplayer_data['playlist'][0] formats = [] for source in video_data['sources']: source_url = self._proto_relative_url(source['file']) source_type = source.get('type') or '' - if source_type in ('application/vnd.apple.mpegurl', 'hls'): + if source_type in ('application/vnd.apple.mpegurl', 'hls') or determine_ext(source_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( - source_url, video_id, 'mp4', 'm3u8_native', fatal=False)) + source_url, video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False)) elif source_type.startswith('audio'): formats.append({ 'url': source_url, 'vcodec': 'none', }) else: - formats.append({ + a_format = { 'url': source_url, 'width': int_or_none(source.get('width')), 'height': int_or_none(source.get('height')), - }) + } + if source_url.startswith('rtmp'): + a_format['ext'] = 'flv', + + # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as + # of jwplayer.flash.swf + rtmp_url_parts = re.split( + r'((?:mp4|mp3|flv):)', source_url, 1) + if len(rtmp_url_parts) == 3: + rtmp_url, prefix, play_path = rtmp_url_parts + a_format.update({ + 'url': rtmp_url, + 'play_path': prefix + play_path, + }) + if rtmp_params: + a_format.update(rtmp_params) + formats.append(a_format) self._sort_formats(formats) subtitles = {} diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 616ed19e1..11b31a699 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -266,6 +266,7 @@ class KuwoCategoryIE(InfoExtractor): 'info_dict': { 'id': '86375', 'title': '八十年代精选', + 'description': '这些都是属于八十年代的回忆!', }, 'playlist_mincount': 24, } diff --git a/youtube_dl/extractor/learnr.py b/youtube_dl/extractor/learnr.py new file mode 100644 index 000000000..1435e090e --- /dev/null +++ b/youtube_dl/extractor/learnr.py @@ -0,0 +1,33 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class LearnrIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?learnr\.pro/view/video/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.learnr.pro/view/video/51624-web-development-tutorial-for-beginners-1-how-to-build-webpages-with-html-css-javascript', + 'md5': '3719fdf0a68397f49899e82c308a89de', + 'info_dict': { + 'id': '51624', + 'ext': 'mp4', + 'title': 'Web Development Tutorial for Beginners (#1) - How to build webpages with HTML, CSS, Javascript', + 'description': 'md5:b36dbfa92350176cdf12b4d388485503', + 'uploader': 'LearnCode.academy', + 'uploader_id': 'learncodeacademy', + 'upload_date': '20131021', + }, + 'add_ie': ['Youtube'], + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + return { + '_type': 'url_transparent', + 'url': self._search_regex( + r"videoId\s*:\s*'([^']+)'", webpage, 'youtube id'), + 'id': video_id, + } diff --git a/youtube_dl/extractor/libraryofcongress.py b/youtube_dl/extractor/libraryofcongress.py new file mode 100644 index 000000000..0a94366fd --- /dev/null +++ b/youtube_dl/extractor/libraryofcongress.py @@ -0,0 +1,143 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + parse_filesize, +) + + +class LibraryOfCongressIE(InfoExtractor): + IE_NAME = 'loc' + IE_DESC = 'Library of Congress' + _VALID_URL = r'https?://(?:www\.)?loc\.gov/(?:item/|today/cyberlc/feature_wdesc\.php\?.*\brec=)(?P<id>[0-9]+)' + _TESTS = [{ + # embedded via <div class="media-player" + 'url': 'http://loc.gov/item/90716351/', + 'md5': '353917ff7f0255aa6d4b80a034833de8', + 'info_dict': { + 'id': '90716351', + 'ext': 'mp4', + 'title': "Pa's trip to Mars", + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 0, + 'view_count': int, + }, + }, { + # webcast embedded via mediaObjectId + 'url': 'https://www.loc.gov/today/cyberlc/feature_wdesc.php?rec=5578', + 'info_dict': { + 'id': '5578', + 'ext': 'mp4', + 'title': 'Help! Preservation Training Needs Here, There & Everywhere', + 'duration': 3765, + 'view_count': int, + 'subtitles': 'mincount:1', + }, + 'params': { + 'skip_download': True, + }, + }, { + # with direct download links + 'url': 'https://www.loc.gov/item/78710669/', + 'info_dict': { + 'id': '78710669', + 'ext': 'mp4', + 'title': 'La vie et la passion de Jesus-Christ', + 'duration': 0, + 'view_count': int, + 'formats': 'mincount:4', + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + media_id = self._search_regex( + (r'id=(["\'])media-player-(?P<id>.+?)\1', + r'<video[^>]+id=(["\'])uuid-(?P<id>.+?)\1', + r'<video[^>]+data-uuid=(["\'])(?P<id>.+?)\1', + r'mediaObjectId\s*:\s*(["\'])(?P<id>.+?)\1'), + webpage, 'media id', group='id') + + data = self._download_json( + 'https://media.loc.gov/services/v1/media?id=%s&context=json' % media_id, + video_id)['mediaObject'] + + derivative = data['derivatives'][0] + media_url = derivative['derivativeUrl'] + + title = derivative.get('shortName') or data.get('shortName') or self._og_search_title( + webpage) + + # Following algorithm was extracted from setAVSource js function + # found in webpage + media_url = media_url.replace('rtmp', 'https') + + is_video = data.get('mediaType', 'v').lower() == 'v' + ext = determine_ext(media_url) + if ext not in ('mp4', 'mp3'): + media_url += '.mp4' if is_video else '.mp3' + + if 'vod/mp4:' in media_url: + formats = [{ + 'url': media_url.replace('vod/mp4:', 'hls-vod/media/') + '.m3u8', + 'format_id': 'hls', + 'ext': 'mp4', + 'protocol': 'm3u8_native', + 'quality': 1, + }] + elif 'vod/mp3:' in media_url: + formats = [{ + 'url': media_url.replace('vod/mp3:', ''), + 'vcodec': 'none', + }] + + download_urls = set() + for m in re.finditer( + r'<option[^>]+value=(["\'])(?P<url>.+?)\1[^>]+data-file-download=[^>]+>\s*(?P<id>.+?)(?:(?: |\s+)\((?P<size>.+?)\))?\s*<', webpage): + format_id = m.group('id').lower() + if format_id == 'gif': + continue + download_url = m.group('url') + if download_url in download_urls: + continue + download_urls.add(download_url) + formats.append({ + 'url': download_url, + 'format_id': format_id, + 'filesize_approx': parse_filesize(m.group('size')), + }) + + self._sort_formats(formats) + + duration = float_or_none(data.get('duration')) + view_count = int_or_none(data.get('viewCount')) + + subtitles = {} + cc_url = data.get('ccUrl') + if cc_url: + subtitles.setdefault('en', []).append({ + 'url': cc_url, + 'ext': 'ttml', + }) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py index ba2f80a75..c2b4490c4 100644 --- a/youtube_dl/extractor/lifenews.py +++ b/youtube_dl/extractor/lifenews.py @@ -7,48 +7,53 @@ from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( determine_ext, + ExtractorError, int_or_none, + parse_iso8601, remove_end, - unified_strdate, - ExtractorError, ) class LifeNewsIE(InfoExtractor): - IE_NAME = 'lifenews' - IE_DESC = 'LIFE | NEWS' - _VALID_URL = r'https?://lifenews\.ru/(?:mobile/)?(?P<section>news|video)/(?P<id>\d+)' + IE_NAME = 'life' + IE_DESC = 'Life.ru' + _VALID_URL = r'https?://life\.ru/t/[^/]+/(?P<id>\d+)' _TESTS = [{ # single video embedded via video/source - 'url': 'http://lifenews.ru/news/98736', + 'url': 'https://life.ru/t/новости/98736', 'md5': '77c95eaefaca216e32a76a343ad89d23', 'info_dict': { 'id': '98736', 'ext': 'mp4', 'title': 'Мужчина нашел дома архив оборонного завода', 'description': 'md5:3b06b1b39b5e2bea548e403d99b8bf26', + 'timestamp': 1344154740, 'upload_date': '20120805', + 'view_count': int, } }, { # single video embedded via iframe - 'url': 'http://lifenews.ru/news/152125', + 'url': 'https://life.ru/t/новости/152125', 'md5': '77d19a6f0886cd76bdbf44b4d971a273', 'info_dict': { 'id': '152125', 'ext': 'mp4', 'title': 'В Сети появилось видео захвата «Правым сектором» колхозных полей ', 'description': 'Жители двух поселков Днепропетровской области не простили радикалам угрозу лишения плодородных земель и пошли в лобовую. ', + 'timestamp': 1427961840, 'upload_date': '20150402', + 'view_count': int, } }, { # two videos embedded via iframe - 'url': 'http://lifenews.ru/news/153461', + 'url': 'https://life.ru/t/новости/153461', 'info_dict': { 'id': '153461', 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве', 'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.', - 'upload_date': '20150505', + 'timestamp': 1430825520, + 'view_count': int, }, 'playlist': [{ 'md5': '9b6ef8bc0ffa25aebc8bdb40d89ab795', @@ -57,6 +62,7 @@ class LifeNewsIE(InfoExtractor): 'ext': 'mp4', 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 1)', 'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.', + 'timestamp': 1430825520, 'upload_date': '20150505', }, }, { @@ -66,22 +72,25 @@ class LifeNewsIE(InfoExtractor): 'ext': 'mp4', 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 2)', 'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.', + 'timestamp': 1430825520, 'upload_date': '20150505', }, }], }, { - 'url': 'http://lifenews.ru/video/13035', + 'url': 'https://life.ru/t/новости/213035', + 'only_matching': True, + }, { + 'url': 'https://life.ru/t/%D0%BD%D0%BE%D0%B2%D0%BE%D1%81%D1%82%D0%B8/153461', + 'only_matching': True, + }, { + 'url': 'https://life.ru/t/новости/411489/manuel_vals_nazval_frantsiiu_tsieliu_nomier_odin_dlia_ighil', 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - section = mobj.group('section') + video_id = self._match_id(url) - webpage = self._download_webpage( - 'http://lifenews.ru/%s/%s' % (section, video_id), - video_id, 'Downloading page') + webpage = self._download_webpage(url, video_id) video_urls = re.findall( r'<video[^>]+><source[^>]+src=["\'](.+?)["\']', webpage) @@ -95,26 +104,22 @@ class LifeNewsIE(InfoExtractor): title = remove_end( self._og_search_title(webpage), - ' - Первый по срочным новостям — LIFE | NEWS') + ' - Life.ru') description = self._og_search_description(webpage) view_count = self._html_search_regex( - r'<div class=\'views\'>\s*(\d+)\s*</div>', webpage, 'view count', fatal=False) - comment_count = self._html_search_regex( - r'=\'commentCount\'[^>]*>\s*(\d+)\s*<', - webpage, 'comment count', fatal=False) + r'<div[^>]+class=(["\']).*?\bhits-count\b.*?\1[^>]*>\s*(?P<value>\d+)\s*</div>', + webpage, 'view count', fatal=False, group='value') - upload_date = self._html_search_regex( - r'<time[^>]*datetime=\'([^\']+)\'', webpage, 'upload date', fatal=False) - if upload_date is not None: - upload_date = unified_strdate(upload_date) + timestamp = parse_iso8601(self._search_regex( + r'<time[^>]+datetime=(["\'])(?P<value>.+?)\1', + webpage, 'upload date', fatal=False, group='value')) common_info = { 'description': description, 'view_count': int_or_none(view_count), - 'comment_count': int_or_none(comment_count), - 'upload_date': upload_date, + 'timestamp': timestamp, } def make_entry(video_id, video_url, index=None): @@ -183,7 +188,8 @@ class LifeEmbedIE(InfoExtractor): ext = determine_ext(video_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id='m3u8')) + video_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='m3u8')) else: formats.append({ 'url': video_url, diff --git a/youtube_dl/extractor/litv.py b/youtube_dl/extractor/litv.py new file mode 100644 index 000000000..3356d015d --- /dev/null +++ b/youtube_dl/extractor/litv.py @@ -0,0 +1,137 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + smuggle_url, + unsmuggle_url, +) + + +class LiTVIE(InfoExtractor): + _VALID_URL = r'https?://www\.litv\.tv/vod/[^/]+/content\.do\?.*?\bid=(?P<id>[^&]+)' + + _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?id=%s' + + _TESTS = [{ + 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', + 'info_dict': { + 'id': 'VOD00041606', + 'title': '花千骨', + }, + 'playlist_count': 50, + }, { + 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', + 'info_dict': { + 'id': 'VOD00041610', + 'ext': 'mp4', + 'title': '花千骨第1集', + 'thumbnail': 're:https?://.*\.jpg$', + 'description': 'md5:c7017aa144c87467c4fb2909c4b05d6f', + 'episode_number': 1, + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, # m3u8 download + }, + 'skip': 'Georestricted to Taiwan', + }] + + def _extract_playlist(self, season_list, video_id, vod_data, view_data, prompt=True): + episode_title = view_data['title'] + content_id = season_list['contentId'] + + if prompt: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (content_id, video_id)) + + all_episodes = [ + self.url_result(smuggle_url( + self._URL_TEMPLATE % (view_data['contentType'], episode['contentId']), + {'force_noplaylist': True})) # To prevent infinite recursion + for episode in season_list['episode']] + + return self.playlist_result(all_episodes, content_id, episode_title) + + def _real_extract(self, url): + url, data = unsmuggle_url(url, {}) + + video_id = self._match_id(url) + + noplaylist = self._downloader.params.get('noplaylist') + noplaylist_prompt = True + if 'force_noplaylist' in data: + noplaylist = data['force_noplaylist'] + noplaylist_prompt = False + + webpage = self._download_webpage(url, video_id) + + view_data = dict(map(lambda t: (t[0], t[2]), re.findall( + r'viewData\.([a-zA-Z]+)\s*=\s*(["\'])([^"\']+)\2', + webpage))) + + vod_data = self._parse_json(self._search_regex( + 'var\s+vod\s*=\s*([^;]+)', webpage, 'VOD data', default='{}'), + video_id) + + season_list = list(vod_data.get('seasonList', {}).values()) + if season_list: + if not noplaylist: + return self._extract_playlist( + season_list[0], video_id, vod_data, view_data, + prompt=noplaylist_prompt) + + if noplaylist_prompt: + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + + # In browsers `getMainUrl` request is always issued. Usually this + # endpoint gives the same result as the data embedded in the webpage. + # If georestricted, there are no embedded data, so an extra request is + # necessary to get the error code + video_data = self._parse_json(self._search_regex( + r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);', + webpage, 'video data', default='{}'), video_id) + if not video_data: + payload = { + 'assetId': view_data['assetId'], + 'watchDevices': vod_data['watchDevices'], + 'contentType': view_data['contentType'], + } + video_data = self._download_json( + 'https://www.litv.tv/vod/getMainUrl', video_id, + data=json.dumps(payload).encode('utf-8'), + headers={'Content-Type': 'application/json'}) + + if not video_data.get('fullpath'): + error_msg = video_data.get('errorMessage') + if error_msg == 'vod.error.outsideregionerror': + self.raise_geo_restricted('This video is available in Taiwan only') + if error_msg: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_msg), expected=True) + raise ExtractorError('Unexpected result from %s' % self.IE_NAME) + + formats = self._extract_m3u8_formats( + video_data['fullpath'], video_id, ext='mp4', m3u8_id='hls') + for a_format in formats: + # LiTV HLS segments doesn't like compressions + a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = True + + title = view_data['title'] + view_data.get('secondaryMark', '') + description = view_data.get('description') + thumbnail = view_data.get('imageFile') + categories = [item['name'] for item in vod_data.get('category', [])] + episode = int_or_none(view_data.get('episode')) + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'categories': categories, + 'episode_number': episode, + } diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 29fba5f30..ea0565ac0 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -17,7 +17,8 @@ class LiveLeakIE(InfoExtractor): 'ext': 'flv', 'description': 'extremely bad day for this guy..!', 'uploader': 'ljfriel2', - 'title': 'Most unlucky car accident' + 'title': 'Most unlucky car accident', + 'thumbnail': 're:^https?://.*\.jpg$' } }, { 'url': 'http://www.liveleak.com/view?i=f93_1390833151', @@ -28,6 +29,7 @@ class LiveLeakIE(InfoExtractor): 'description': 'German Television Channel NDR does an exclusive interview with Edward Snowden.\r\nUploaded on LiveLeak cause German Television thinks the rest of the world isn\'t intereseted in Edward Snowden.', 'uploader': 'ARD_Stinkt', 'title': 'German Television does first Edward Snowden Interview (ENGLISH)', + 'thumbnail': 're:^https?://.*\.jpg$' } }, { 'url': 'http://www.liveleak.com/view?i=4f7_1392687779', @@ -49,7 +51,8 @@ class LiveLeakIE(InfoExtractor): 'ext': 'mp4', 'description': 'Happened on 27.7.2014. \r\nAt 0:53 you can see people still swimming at near beach.', 'uploader': 'bony333', - 'title': 'Crazy Hungarian tourist films close call waterspout in Croatia' + 'title': 'Crazy Hungarian tourist films close call waterspout in Croatia', + 'thumbnail': 're:^https?://.*\.jpg$' } }] @@ -72,6 +75,7 @@ class LiveLeakIE(InfoExtractor): age_limit = int_or_none(self._search_regex( r'you confirm that you are ([0-9]+) years and over.', webpage, 'age limit', default=None)) + video_thumbnail = self._og_search_thumbnail(webpage) sources_raw = self._search_regex( r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None) @@ -124,4 +128,5 @@ class LiveLeakIE(InfoExtractor): 'uploader': video_uploader, 'formats': formats, 'age_limit': age_limit, + 'thumbnail': video_thumbnail, } diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index eada7c299..bc7894bf1 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -150,7 +150,7 @@ class LivestreamIE(InfoExtractor): } def _extract_stream_info(self, stream_info): - broadcast_id = stream_info['broadcast_id'] + broadcast_id = compat_str(stream_info['broadcast_id']) is_live = stream_info.get('is_live') formats = [] @@ -203,9 +203,10 @@ class LivestreamIE(InfoExtractor): if not videos_info: break for v in videos_info: + v_id = compat_str(v['id']) entries.append(self.url_result( - 'http://livestream.com/accounts/%s/events/%s/videos/%s' % (account_id, event_id, v['id']), - 'Livestream', v['id'], v['caption'])) + 'http://livestream.com/accounts/%s/events/%s/videos/%s' % (account_id, event_id, v_id), + 'Livestream', v_id, v.get('caption'))) last_video = videos_info[-1]['id'] return self.playlist_result(entries, event_id, event_data['full_name']) diff --git a/youtube_dl/extractor/localnews8.py b/youtube_dl/extractor/localnews8.py new file mode 100644 index 000000000..aad396135 --- /dev/null +++ b/youtube_dl/extractor/localnews8.py @@ -0,0 +1,47 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class LocalNews8IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?localnews8\.com/(?:[^/]+/)*(?P<display_id>[^/]+)/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.localnews8.com/news/rexburg-business-turns-carbon-fiber-scraps-into-wedding-rings/35183304', + 'md5': 'be4d48aea61aa2bde7be2ee47691ad20', + 'info_dict': { + 'id': '35183304', + 'display_id': 'rexburg-business-turns-carbon-fiber-scraps-into-wedding-rings', + 'ext': 'mp4', + 'title': 'Rexburg business turns carbon fiber scraps into wedding ring', + 'description': 'The process was first invented by Lamborghini and less than a dozen companies around the world use it.', + 'duration': 153, + 'timestamp': 1441844822, + 'upload_date': '20150910', + 'uploader_id': 'api', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id) + + partner_id = self._search_regex( + r'partnerId\s*[:=]\s*(["\'])(?P<id>\d+)\1', + webpage, 'partner id', group='id') + kaltura_id = self._search_regex( + r'videoIdString\s*[:=]\s*(["\'])kaltura:(?P<id>[0-9a-z_]+)\1', + webpage, 'videl id', group='id') + + return { + '_type': 'url_transparent', + 'url': 'kaltura:%s:%s' % (partner_id, kaltura_id), + 'ie_key': 'Kaltura', + 'id': video_id, + 'display_id': display_id, + } diff --git a/youtube_dl/extractor/mgtv.py b/youtube_dl/extractor/mgtv.py index a14d176a5..9fbc74f5d 100644 --- a/youtube_dl/extractor/mgtv.py +++ b/youtube_dl/extractor/mgtv.py @@ -11,7 +11,7 @@ class MGTVIE(InfoExtractor): _TEST = { 'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html', - 'md5': '', + 'md5': '1bdadcf760a0b90946ca68ee9a2db41a', 'info_dict': { 'id': '3116640', 'ext': 'mp4', @@ -20,15 +20,6 @@ class MGTVIE(InfoExtractor): 'duration': 7461, 'thumbnail': 're:^https?://.*\.jpg$', }, - 'params': { - 'skip_download': True, # m3u8 download - }, - } - - _FORMAT_MAP = { - '标清': ('Standard', 0), - '高清': ('High', 1), - '超清': ('SuperHigh', 2), } def _real_extract(self, url): @@ -40,17 +31,27 @@ class MGTVIE(InfoExtractor): formats = [] for idx, stream in enumerate(api_data['stream']): - format_name = stream.get('name') - format_id, preference = self._FORMAT_MAP.get(format_name, (None, None)) - format_info = self._download_json( - stream['url'], video_id, - note='Download video info for format %s' % format_id or '#%d' % idx) - formats.append({ - 'format_id': format_id, - 'url': format_info['info'], - 'ext': 'mp4', # These are m3u8 playlists - 'preference': preference, - }) + stream_url = stream.get('url') + if not stream_url: + continue + tbr = int_or_none(self._search_regex( + r'(\d+)\.mp4', stream_url, 'tbr', default=None)) + + def extract_format(stream_url, format_id, idx, query={}): + format_info = self._download_json( + stream_url, video_id, + note='Download video info for format %s' % format_id or '#%d' % idx, query=query) + return { + 'format_id': format_id, + 'url': format_info['info'], + 'ext': 'mp4', + 'tbr': tbr, + } + + formats.append(extract_format( + stream_url, 'hls-%d' % tbr if tbr else None, idx * 2)) + formats.append(extract_format(stream_url.replace( + '/playlist.m3u8', ''), 'http-%d' % tbr if tbr else None, idx * 2 + 1, {'pno': 1031})) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/microsoftvirtualacademy.py b/youtube_dl/extractor/microsoftvirtualacademy.py new file mode 100644 index 000000000..afd3e98ec --- /dev/null +++ b/youtube_dl/extractor/microsoftvirtualacademy.py @@ -0,0 +1,192 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_xpath, +) +from ..utils import ( + int_or_none, + parse_duration, + smuggle_url, + unsmuggle_url, + xpath_text, +) + + +class MicrosoftVirtualAcademyBaseIE(InfoExtractor): + def _extract_base_url(self, course_id, display_id): + return self._download_json( + 'https://api-mlxprod.microsoft.com/services/products/anonymous/%s' % course_id, + display_id, 'Downloading course base URL') + + def _extract_chapter_and_title(self, title): + if not title: + return None, None + m = re.search(r'(?P<chapter>\d+)\s*\|\s*(?P<title>.+)', title) + return (int(m.group('chapter')), m.group('title')) if m else (None, title) + + +class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE): + IE_NAME = 'mva' + IE_DESC = 'Microsoft Virtual Academy videos' + _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME + + _TESTS = [{ + 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382', + 'md5': '7826c44fc31678b12ad8db11f6b5abb9', + 'info_dict': { + 'id': 'gfVXISmEB_6804984382', + 'ext': 'mp4', + 'title': 'Course Introduction', + 'formats': 'mincount:3', + 'subtitles': { + 'en': [{ + 'ext': 'ttml', + }], + }, + } + }, { + 'url': 'mva:11788:gfVXISmEB_6804984382', + 'only_matching': True, + }] + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + + mobj = re.match(self._VALID_URL, url) + course_id = mobj.group('course_id') + video_id = mobj.group('id') + + base_url = smuggled_data.get('base_url') or self._extract_base_url(course_id, video_id) + + settings = self._download_xml( + '%s/content/content_%s/videosettings.xml?v=1' % (base_url, video_id), + video_id, 'Downloading video settings XML') + + _, title = self._extract_chapter_and_title(xpath_text( + settings, './/Title', 'title', fatal=True)) + + formats = [] + + for sources in settings.findall(compat_xpath('.//MediaSources')): + if sources.get('videoType') == 'smoothstreaming': + continue + for source in sources.findall(compat_xpath('./MediaSource')): + video_url = source.text + if not video_url or not video_url.startswith('http'): + continue + video_mode = source.get('videoMode') + height = int_or_none(self._search_regex( + r'^(\d+)[pP]$', video_mode or '', 'height', default=None)) + codec = source.get('codec') + acodec, vcodec = [None] * 2 + if codec: + codecs = codec.split(',') + if len(codecs) == 2: + acodec, vcodec = codecs + elif len(codecs) == 1: + vcodec = codecs[0] + formats.append({ + 'url': video_url, + 'format_id': video_mode, + 'height': height, + 'acodec': acodec, + 'vcodec': vcodec, + }) + self._sort_formats(formats) + + subtitles = {} + for source in settings.findall(compat_xpath('.//MarkerResourceSource')): + subtitle_url = source.text + if not subtitle_url: + continue + subtitles.setdefault('en', []).append({ + 'url': '%s/%s' % (base_url, subtitle_url), + 'ext': source.get('type'), + }) + + return { + 'id': video_id, + 'title': title, + 'subtitles': subtitles, + 'formats': formats + } + + +class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE): + IE_NAME = 'mva:course' + IE_DESC = 'Microsoft Virtual Academy courses' + _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME + + _TESTS = [{ + 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', + 'info_dict': { + 'id': '11788', + 'title': 'Microsoft Azure Fundamentals: Virtual Machines', + }, + 'playlist_count': 36, + }, { + # with emphasized chapters + 'url': 'https://mva.microsoft.com/en-US/training-courses/developing-windows-10-games-with-construct-2-16335', + 'info_dict': { + 'id': '16335', + 'title': 'Developing Windows 10 Games with Construct 2', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.microsoftvirtualacademy.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', + 'only_matching': True, + }, { + 'url': 'mva:course:11788', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if MicrosoftVirtualAcademyIE.suitable(url) else super( + MicrosoftVirtualAcademyCourseIE, cls).suitable(url) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + course_id = mobj.group('id') + display_id = mobj.group('display_id') + + base_url = self._extract_base_url(course_id, display_id) + + manifest = self._download_json( + '%s/imsmanifestlite.json' % base_url, + display_id, 'Downloading course manifest JSON')['manifest'] + + organization = manifest['organizations']['organization'][0] + + entries = [] + for chapter in organization['item']: + chapter_number, chapter_title = self._extract_chapter_and_title(chapter.get('title')) + chapter_id = chapter.get('@identifier') + for item in chapter.get('item', []): + item_id = item.get('@identifier') + if not item_id: + continue + metadata = item.get('resource', {}).get('metadata') or {} + if metadata.get('learningresourcetype') != 'Video': + continue + _, title = self._extract_chapter_and_title(item.get('title')) + duration = parse_duration(metadata.get('duration')) + description = metadata.get('description') + entries.append({ + '_type': 'url_transparent', + 'url': smuggle_url( + 'mva:%s:%s' % (course_id, item_id), {'base_url': base_url}), + 'title': title, + 'description': description, + 'duration': duration, + 'chapter': chapter_title, + 'chapter_number': chapter_number, + 'chapter_id': chapter_id, + }) + + title = organization.get('title') or manifest.get('metadata', {}).get('title') + + return self.playlist_result(entries, course_id, title) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 46504cd5f..f27c7f139 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -67,6 +67,23 @@ class NBCIE(InfoExtractor): # This video has expired but with an escaped embedURL 'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515', 'only_matching': True, + }, + { + # HLS streams requires the 'hdnea3' cookie + 'url': 'http://www.nbc.com/Kings/video/goliath/n1806', + 'info_dict': { + 'id': 'n1806', + 'ext': 'mp4', + 'title': 'Goliath', + 'description': 'When an unknown soldier saves the life of the King\'s son in battle, he\'s thrust into the limelight and politics of the kingdom.', + 'timestamp': 1237100400, + 'upload_date': '20090315', + 'uploader': 'NBCU-COM', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Only works from US', } ] diff --git a/youtube_dl/extractor/ndtv.py b/youtube_dl/extractor/ndtv.py index 2a1ca80df..96528f649 100644 --- a/youtube_dl/extractor/ndtv.py +++ b/youtube_dl/extractor/ndtv.py @@ -1,19 +1,18 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( - month_by_name, int_or_none, + remove_end, + unified_strdate, ) class NDTVIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?ndtv\.com/video/player/[^/]*/[^/]*/(?P<id>[a-z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?ndtv\.com/video/(?:[^/]+/)+[^/?^&]+-(?P<id>\d+)' _TEST = { - 'url': 'http://www.ndtv.com/video/player/news/ndtv-exclusive-don-t-need-character-certificate-from-rahul-gandhi-says-arvind-kejriwal/300710', + 'url': 'http://www.ndtv.com/video/news/news/ndtv-exclusive-don-t-need-character-certificate-from-rahul-gandhi-says-arvind-kejriwal-300710', 'md5': '39f992dbe5fb531c395d8bbedb1e5e88', 'info_dict': { 'id': '300710', @@ -22,7 +21,7 @@ class NDTVIE(InfoExtractor): 'description': 'md5:ab2d4b4a6056c5cb4caa6d729deabf02', 'upload_date': '20131208', 'duration': 1327, - 'thumbnail': 'http://i.ndtvimg.com/video/images/vod/medium/2013-12/big_300710_1386518307.jpg', + 'thumbnail': 're:https?://.*\.jpg', }, } @@ -30,36 +29,19 @@ class NDTVIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + title = remove_end(self._og_search_title(webpage), ' - NDTV') + filename = self._search_regex( r"__filename='([^']+)'", webpage, 'video filename') - video_url = ('http://bitcast-b.bitgravity.com/ndtvod/23372/ndtv/%s' % - filename) + video_url = 'http://bitcast-b.bitgravity.com/ndtvod/23372/ndtv/%s' % filename duration = int_or_none(self._search_regex( r"__duration='([^']+)'", webpage, 'duration', fatal=False)) - date_m = re.search(r'''(?x) - <p\s+class="vod_dateline">\s* - Published\s+On:\s* - (?P<monthname>[A-Za-z]+)\s+(?P<day>[0-9]+),\s*(?P<year>[0-9]+) - ''', webpage) - upload_date = None - - if date_m is not None: - month = month_by_name(date_m.group('monthname')) - if month is not None: - upload_date = '%s%02d%02d' % ( - date_m.group('year'), month, int(date_m.group('day'))) - - description = self._og_search_description(webpage) - READ_MORE = ' (Read more)' - if description.endswith(READ_MORE): - description = description[:-len(READ_MORE)] + upload_date = unified_strdate(self._html_search_meta( + 'publish-date', webpage, 'upload date', fatal=False)) - title = self._og_search_title(webpage) - TITLE_SUFFIX = ' - NDTV' - if title.endswith(TITLE_SUFFIX): - title = title[:-len(TITLE_SUFFIX)] + description = remove_end(self._og_search_description(webpage), ' (Read more)') return { 'id': video_id, diff --git a/youtube_dl/extractor/nfb.py b/youtube_dl/extractor/nfb.py index 51e4a34f7..adcc636bc 100644 --- a/youtube_dl/extractor/nfb.py +++ b/youtube_dl/extractor/nfb.py @@ -2,8 +2,12 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - sanitized_Request, + clean_html, + determine_ext, + int_or_none, + qualities, urlencode_postdata, + xpath_text, ) @@ -16,12 +20,12 @@ class NFBIE(InfoExtractor): 'url': 'https://www.nfb.ca/film/qallunaat_why_white_people_are_funny', 'info_dict': { 'id': 'qallunaat_why_white_people_are_funny', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Qallunaat! Why White People Are Funny ', - 'description': 'md5:836d8aff55e087d04d9f6df554d4e038', + 'description': 'md5:6b8e32dde3abf91e58857b174916620c', 'duration': 3128, + 'creator': 'Mark Sandiford', 'uploader': 'Mark Sandiford', - 'uploader_id': 'mark-sandiford', }, 'params': { # rtmp download @@ -31,65 +35,78 @@ class NFBIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - page = self._download_webpage( - 'https://www.nfb.ca/film/%s' % video_id, video_id, - 'Downloading film page') - uploader_id = self._html_search_regex(r'<a class="director-link" href="/explore-all-directors/([^/]+)/"', - page, 'director id', fatal=False) - uploader = self._html_search_regex(r'<em class="director-name" itemprop="name">([^<]+)</em>', - page, 'director name', fatal=False) - - request = sanitized_Request( + config = self._download_xml( 'https://www.nfb.ca/film/%s/player_config' % video_id, - urlencode_postdata({'getConfig': 'true'})) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - request.add_header('X-NFB-Referer', 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf') - - config = self._download_xml(request, video_id, 'Downloading player config XML') + video_id, 'Downloading player config XML', + data=urlencode_postdata({'getConfig': 'true'}), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'X-NFB-Referer': 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf' + }) - title = None - description = None - thumbnail = None - duration = None - formats = [] - - def extract_thumbnail(media): - thumbnails = {} - for asset in media.findall('assets/asset'): - thumbnails[asset.get('quality')] = asset.find('default/url').text - if not thumbnails: - return None - if 'high' in thumbnails: - return thumbnails['high'] - return list(thumbnails.values())[0] + title, description, thumbnail, duration, uploader, author = [None] * 6 + thumbnails, formats = [[]] * 2 + subtitles = {} for media in config.findall('./player/stream/media'): if media.get('type') == 'posterImage': - thumbnail = extract_thumbnail(media) + quality_key = qualities(('low', 'high')) + thumbnails = [] + for asset in media.findall('assets/asset'): + asset_url = xpath_text(asset, 'default/url', default=None) + if not asset_url: + continue + quality = asset.get('quality') + thumbnails.append({ + 'url': asset_url, + 'id': quality, + 'preference': quality_key(quality), + }) elif media.get('type') == 'video': - duration = int(media.get('duration')) - title = media.find('title').text - description = media.find('description').text - # It seems assets always go from lower to better quality, so no need to sort + title = xpath_text(media, 'title', fatal=True) for asset in media.findall('assets/asset'): - for x in asset: + quality = asset.get('quality') + height = int_or_none(self._search_regex( + r'^(\d+)[pP]$', quality or '', 'height', default=None)) + for node in asset: + streamer = xpath_text(node, 'streamerURI', default=None) + if not streamer: + continue + play_path = xpath_text(node, 'url', default=None) + if not play_path: + continue formats.append({ - 'url': x.find('streamerURI').text, - 'app': x.find('streamerURI').text.split('/', 3)[3], - 'play_path': x.find('url').text, + 'url': streamer, + 'app': streamer.split('/', 3)[3], + 'play_path': play_path, 'rtmp_live': False, - 'ext': 'mp4', - 'format_id': '%s-%s' % (x.tag, asset.get('quality')), + 'ext': 'flv', + 'format_id': '%s-%s' % (node.tag, quality) if quality else node.tag, + 'height': height, }) + self._sort_formats(formats) + description = clean_html(xpath_text(media, 'description')) + uploader = xpath_text(media, 'author') + duration = int_or_none(media.get('duration')) + for subtitle in media.findall('./subtitles/subtitle'): + subtitle_url = xpath_text(subtitle, 'url', default=None) + if not subtitle_url: + continue + lang = xpath_text(subtitle, 'lang', default='en') + subtitles.setdefault(lang, []).append({ + 'url': subtitle_url, + 'ext': (subtitle.get('format') or determine_ext(subtitle_url)).lower(), + }) return { 'id': video_id, 'title': title, 'description': description, - 'thumbnail': thumbnail, + 'thumbnails': thumbnails, 'duration': duration, + 'creator': uploader, 'uploader': uploader, - 'uploader_id': uploader_id, 'formats': formats, + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 51dfc27ac..486e086bb 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -4,91 +4,219 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urlparse, - compat_urllib_parse_unquote, -) +from ..compat import compat_urllib_parse_unquote from ..utils import ( - determine_ext, ExtractorError, - float_or_none, + int_or_none, + parse_age_limit, parse_duration, - unified_strdate, ) -class NRKIE(InfoExtractor): - _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)' - - _TESTS = [ - { - 'url': 'http://www.nrk.no/video/PS*150533', - # MD5 is unstable - 'info_dict': { - 'id': '150533', - 'ext': 'flv', - 'title': 'Dompap og andre fugler i Piip-Show', - 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', - 'duration': 263, - } - }, - { - 'url': 'http://www.nrk.no/video/PS*154915', - # MD5 is unstable - 'info_dict': { - 'id': '154915', - 'ext': 'flv', - 'title': 'Slik høres internett ut når du er blind', - 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', - 'duration': 20, - } - }, - ] +class NRKBaseIE(InfoExtractor): + def _extract_formats(self, manifest_url, video_id, fatal=True): + formats = [] + formats.extend(self._extract_f4m_formats( + manifest_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', + video_id, f4m_id='hds', fatal=fatal)) + formats.extend(self._extract_m3u8_formats(manifest_url.replace( + 'akamaihd.net/z/', 'akamaihd.net/i/').replace('/manifest.f4m', '/master.m3u8'), + video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=fatal)) + return formats def _real_extract(self, url): video_id = self._match_id(url) data = self._download_json( - 'http://v8.psapi.nrk.no/mediaelement/%s' % video_id, - video_id, 'Downloading media JSON') + 'http://%s/mediaelement/%s' % (self._API_HOST, video_id), + video_id, 'Downloading mediaelement JSON') + + title = data.get('fullTitle') or data.get('mainTitle') or data['title'] + video_id = data.get('id') or video_id + + entries = [] + + media_assets = data.get('mediaAssets') + if media_assets and isinstance(media_assets, list): + def video_id_and_title(idx): + return ((video_id, title) if len(media_assets) == 1 + else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx))) + for num, asset in enumerate(media_assets, 1): + asset_url = asset.get('url') + if not asset_url: + continue + formats = self._extract_formats(asset_url, video_id, fatal=False) + if not formats: + continue + self._sort_formats(formats) + entry_id, entry_title = video_id_and_title(num) + duration = parse_duration(asset.get('duration')) + subtitles = {} + for subtitle in ('webVtt', 'timedText'): + subtitle_url = asset.get('%sSubtitlesUrl' % subtitle) + if subtitle_url: + subtitles.setdefault('no', []).append({ + 'url': compat_urllib_parse_unquote(subtitle_url) + }) + entries.append({ + 'id': asset.get('carrierId') or entry_id, + 'title': entry_title, + 'duration': duration, + 'subtitles': subtitles, + 'formats': formats, + }) - media_url = data.get('mediaUrl') + if not entries: + media_url = data.get('mediaUrl') + if media_url: + formats = self._extract_formats(media_url, video_id) + self._sort_formats(formats) + duration = parse_duration(data.get('duration')) + entries = [{ + 'id': video_id, + 'title': title, + 'duration': duration, + 'formats': formats, + }] - if not media_url: - if data['usageRights']['isGeoBlocked']: + if not entries: + if data.get('usageRights', {}).get('isGeoBlocked'): raise ExtractorError( 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', expected=True) - if determine_ext(media_url) == 'f4m': - formats = self._extract_f4m_formats( - media_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', video_id, f4m_id='hds') - self._sort_formats(formats) - else: - formats = [{ - 'url': media_url, - 'ext': 'flv', - }] - - duration = parse_duration(data.get('duration')) + conviva = data.get('convivaStatistics') or {} + series = conviva.get('seriesName') or data.get('seriesTitle') + episode = conviva.get('episodeName') or data.get('episodeNumberOrDate') + thumbnails = None images = data.get('images') - if images: - thumbnails = images['webImages'] - thumbnails.sort(key=lambda image: image['pixelWidth']) - thumbnail = thumbnails[-1]['imageUrl'] - else: - thumbnail = None - - return { - 'id': video_id, - 'title': data['title'], - 'description': data['description'], - 'duration': duration, - 'thumbnail': thumbnail, - 'formats': formats, + if images and isinstance(images, dict): + web_images = images.get('webImages') + if isinstance(web_images, list): + thumbnails = [{ + 'url': image['imageUrl'], + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } for image in web_images if image.get('imageUrl')] + + description = data.get('description') + + common_info = { + 'description': description, + 'series': series, + 'episode': episode, + 'age_limit': parse_age_limit(data.get('legalAge')), + 'thumbnails': thumbnails, } + vcodec = 'none' if data.get('mediaType') == 'Audio' else None + + # TODO: extract chapters when https://github.com/rg3/youtube-dl/pull/9409 is merged + + for entry in entries: + entry.update(common_info) + for f in entry['formats']: + f['vcodec'] = vcodec + + return self.playlist_result(entries, video_id, title, description) + + +class NRKIE(NRKBaseIE): + _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)' + _API_HOST = 'v8.psapi.nrk.no' + _TESTS = [{ + # video + 'url': 'http://www.nrk.no/video/PS*150533', + 'md5': '2f7f6eeb2aacdd99885f355428715cfa', + 'info_dict': { + 'id': '150533', + 'ext': 'mp4', + 'title': 'Dompap og andre fugler i Piip-Show', + 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', + 'duration': 263, + } + }, { + # audio + 'url': 'http://www.nrk.no/video/PS*154915', + # MD5 is unstable + 'info_dict': { + 'id': '154915', + 'ext': 'flv', + 'title': 'Slik høres internett ut når du er blind', + 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', + 'duration': 20, + } + }] + + +class NRKTVIE(NRKBaseIE): + IE_DESC = 'NRK TV and NRK Radio' + _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?' + _API_HOST = 'psapi-we.nrk.no' + + _TESTS = [{ + 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', + 'md5': '4e9ca6629f09e588ed240fb11619922a', + 'info_dict': { + 'id': 'MUHH48000314AA', + 'ext': 'mp4', + 'title': '20 spørsmål 23.05.2014', + 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', + 'duration': 1741.52, + }, + }, { + 'url': 'https://tv.nrk.no/program/mdfp15000514', + 'md5': '43d0be26663d380603a9cf0c24366531', + 'info_dict': { + 'id': 'MDFP15000514CA', + 'ext': 'mp4', + 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', + 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', + 'duration': 4605.08, + }, + }, { + # single playlist video + 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', + 'md5': 'adbd1dbd813edaf532b0a253780719c2', + 'info_dict': { + 'id': 'MSPO40010515-part2', + 'ext': 'flv', + 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', + 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + }, + 'skip': 'Only works from Norway', + }, { + 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', + 'playlist': [{ + 'md5': '9480285eff92d64f06e02a5367970a7a', + 'info_dict': { + 'id': 'MSPO40010515-part1', + 'ext': 'flv', + 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)', + 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + }, + }, { + 'md5': 'adbd1dbd813edaf532b0a253780719c2', + 'info_dict': { + 'id': 'MSPO40010515-part2', + 'ext': 'flv', + 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', + 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + }, + }], + 'info_dict': { + 'id': 'MSPO40010515', + 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn', + 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'duration': 6947.52, + }, + 'skip': 'Only works from Norway', + }, { + 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', + 'only_matching': True, + }] + class NRKPlaylistIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video|skole)(?:[^/]+/)+(?P<id>[^/]+)' @@ -159,179 +287,3 @@ class NRKSkoleIE(InfoExtractor): nrk_id = self._search_regex(r'data-nrk-id=["\'](\d+)', webpage, 'nrk id') return self.url_result('nrk:%s' % nrk_id) - - -class NRKTVIE(InfoExtractor): - IE_DESC = 'NRK TV and NRK Radio' - _VALID_URL = r'(?P<baseurl>https?://(?:tv|radio)\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?' - - _TESTS = [ - { - 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', - 'info_dict': { - 'id': 'MUHH48000314', - 'ext': 'mp4', - 'title': '20 spørsmål', - 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', - 'upload_date': '20140523', - 'duration': 1741.52, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - 'url': 'https://tv.nrk.no/program/mdfp15000514', - 'info_dict': { - 'id': 'mdfp15000514', - 'ext': 'mp4', - 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting', - 'description': 'md5:654c12511f035aed1e42bdf5db3b206a', - 'upload_date': '20140524', - 'duration': 4605.08, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - # single playlist video - 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', - 'md5': 'adbd1dbd813edaf532b0a253780719c2', - 'info_dict': { - 'id': 'MSPO40010515-part2', - 'ext': 'flv', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', - 'upload_date': '20150106', - }, - 'skip': 'Only works from Norway', - }, - { - 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', - 'playlist': [ - { - 'md5': '9480285eff92d64f06e02a5367970a7a', - 'info_dict': { - 'id': 'MSPO40010515-part1', - 'ext': 'flv', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', - 'upload_date': '20150106', - }, - }, - { - 'md5': 'adbd1dbd813edaf532b0a253780719c2', - 'info_dict': { - 'id': 'MSPO40010515-part2', - 'ext': 'flv', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', - 'upload_date': '20150106', - }, - }, - ], - 'info_dict': { - 'id': 'MSPO40010515', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', - 'upload_date': '20150106', - 'duration': 6947.5199999999995, - }, - 'skip': 'Only works from Norway', - }, - { - 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', - 'only_matching': True, - } - ] - - def _extract_f4m(self, manifest_url, video_id): - return self._extract_f4m_formats( - manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id, f4m_id='hds') - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - part_id = mobj.group('part_id') - base_url = mobj.group('baseurl') - - webpage = self._download_webpage(url, video_id) - - title = self._html_search_meta( - 'title', webpage, 'title') - description = self._html_search_meta( - 'description', webpage, 'description') - - thumbnail = self._html_search_regex( - r'data-posterimage="([^"]+)"', - webpage, 'thumbnail', fatal=False) - upload_date = unified_strdate(self._html_search_meta( - 'rightsfrom', webpage, 'upload date', fatal=False)) - duration = float_or_none(self._html_search_regex( - r'data-duration="([^"]+)"', - webpage, 'duration', fatal=False)) - - # playlist - parts = re.findall( - r'<a href="#del=(\d+)"[^>]+data-argument="([^"]+)">([^<]+)</a>', webpage) - if parts: - entries = [] - for current_part_id, stream_url, part_title in parts: - if part_id and current_part_id != part_id: - continue - video_part_id = '%s-part%s' % (video_id, current_part_id) - formats = self._extract_f4m(stream_url, video_part_id) - entries.append({ - 'id': video_part_id, - 'title': part_title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'formats': formats, - }) - if part_id: - if entries: - return entries[0] - else: - playlist = self.playlist_result(entries, video_id, title, description) - playlist.update({ - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'duration': duration, - }) - return playlist - - formats = [] - - f4m_url = re.search(r'data-media="([^"]+)"', webpage) - if f4m_url: - formats.extend(self._extract_f4m(f4m_url.group(1), video_id)) - - m3u8_url = re.search(r'data-hls-media="([^"]+)"', webpage) - if m3u8_url: - formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4', m3u8_id='hls')) - self._sort_formats(formats) - - subtitles_url = self._html_search_regex( - r'data-subtitlesurl\s*=\s*(["\'])(?P<url>.+?)\1', - webpage, 'subtitle URL', default=None, group='url') - subtitles = {} - if subtitles_url: - subtitles['no'] = [{ - 'ext': 'ttml', - 'url': compat_urlparse.urljoin(base_url, subtitles_url), - }] - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 95e982897..2038a6ba5 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -8,6 +8,7 @@ from ..utils import ( float_or_none, ExtractorError, unsmuggle_url, + determine_ext, ) from ..compat import compat_urllib_parse_urlencode @@ -15,71 +16,80 @@ from ..compat import compat_urllib_parse_urlencode class OoyalaBaseIE(InfoExtractor): _PLAYER_BASE = 'http://player.ooyala.com/' _CONTENT_TREE_BASE = _PLAYER_BASE + 'player_api/v1/content_tree/' - _AUTHORIZATION_URL_TEMPLATE = _PLAYER_BASE + 'sas/player_api/v1/authorization/embed_code/%s/%s?' + _AUTHORIZATION_URL_TEMPLATE = _PLAYER_BASE + 'sas/player_api/v2/authorization/embed_code/%s/%s?' def _extract(self, content_tree_url, video_id, domain='example.org'): content_tree = self._download_json(content_tree_url, video_id)['content_tree'] metadata = content_tree[list(content_tree)[0]] embed_code = metadata['embed_code'] pcode = metadata.get('asset_pcode') or embed_code - video_info = { - 'id': embed_code, - 'title': metadata['title'], - 'description': metadata.get('description'), - 'thumbnail': metadata.get('thumbnail_image') or metadata.get('promo_image'), - 'duration': float_or_none(metadata.get('duration'), 1000), - } + title = metadata['title'] + + auth_data = self._download_json( + self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code) + + compat_urllib_parse_urlencode({ + 'domain': domain, + 'supportedFormats': 'mp4,rtmp,m3u8,hds', + }), video_id) + + cur_auth_data = auth_data['authorization_data'][embed_code] urls = [] formats = [] - for supported_format in ('mp4', 'm3u8', 'hds', 'rtmp'): - auth_data = self._download_json( - self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code) + - compat_urllib_parse_urlencode({ - 'domain': domain, - 'supportedFormats': supported_format - }), - video_id, 'Downloading %s JSON' % supported_format) - - cur_auth_data = auth_data['authorization_data'][embed_code] - - if cur_auth_data['authorized']: - for stream in cur_auth_data['streams']: - url = base64.b64decode( - stream['url']['data'].encode('ascii')).decode('utf-8') - if url in urls: - continue - urls.append(url) - delivery_type = stream['delivery_type'] - if delivery_type == 'hls' or '.m3u8' in url: - formats.extend(self._extract_m3u8_formats( - url, embed_code, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif delivery_type == 'hds' or '.f4m' in url: - formats.extend(self._extract_f4m_formats( - url + '?hdcore=3.7.0', embed_code, f4m_id='hds', fatal=False)) - elif '.smil' in url: - formats.extend(self._extract_smil_formats( - url, embed_code, fatal=False)) - else: - formats.append({ - 'url': url, - 'ext': stream.get('delivery_type'), - 'vcodec': stream.get('video_codec'), - 'format_id': delivery_type, - 'width': int_or_none(stream.get('width')), - 'height': int_or_none(stream.get('height')), - 'abr': int_or_none(stream.get('audio_bitrate')), - 'vbr': int_or_none(stream.get('video_bitrate')), - 'fps': float_or_none(stream.get('framerate')), - }) - else: - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, cur_auth_data['message']), expected=True) + if cur_auth_data['authorized']: + for stream in cur_auth_data['streams']: + s_url = base64.b64decode( + stream['url']['data'].encode('ascii')).decode('utf-8') + if s_url in urls: + continue + urls.append(s_url) + ext = determine_ext(s_url, None) + delivery_type = stream['delivery_type'] + if delivery_type == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + s_url, embed_code, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif delivery_type == 'hds' or ext == 'f4m': + formats.extend(self._extract_f4m_formats( + s_url + '?hdcore=3.7.0', embed_code, f4m_id='hds', fatal=False)) + elif ext == 'smil': + formats.extend(self._extract_smil_formats( + s_url, embed_code, fatal=False)) + else: + formats.append({ + 'url': s_url, + 'ext': ext or stream.get('delivery_type'), + 'vcodec': stream.get('video_codec'), + 'format_id': delivery_type, + 'width': int_or_none(stream.get('width')), + 'height': int_or_none(stream.get('height')), + 'abr': int_or_none(stream.get('audio_bitrate')), + 'vbr': int_or_none(stream.get('video_bitrate')), + 'fps': float_or_none(stream.get('framerate')), + }) + else: + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, cur_auth_data['message']), expected=True) self._sort_formats(formats) - video_info['formats'] = formats - return video_info + subtitles = {} + for lang, sub in metadata.get('closed_captions_vtt', {}).get('captions', {}).items(): + sub_url = sub.get('url') + if not sub_url: + continue + subtitles[lang] = [{ + 'url': sub_url, + }] + + return { + 'id': embed_code, + 'title': title, + 'description': metadata.get('description'), + 'thumbnail': metadata.get('thumbnail_image') or metadata.get('promo_image'), + 'duration': float_or_none(metadata.get('duration'), 1000), + 'subtitles': subtitles, + 'formats': formats, + } class OoyalaIE(OoyalaBaseIE): diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 456561bcc..5049b870e 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -100,7 +100,7 @@ class OpenloadIE(InfoExtractor): raise ExtractorError('File not found', expected=True) code = self._search_regex( - r'<video[^>]+>\s*<script[^>]+>([^<]+)</script>', + r'</video>\s*</div>\s*<script[^>]+>([^<]+)</script>', webpage, 'JS code') decoded = self.openload_decode(code) diff --git a/youtube_dl/extractor/ora.py b/youtube_dl/extractor/ora.py index 8545fb1b8..1d42be39b 100644 --- a/youtube_dl/extractor/ora.py +++ b/youtube_dl/extractor/ora.py @@ -12,8 +12,8 @@ from ..utils import ( class OraTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ora\.tv/([^/]+/)*(?P<id>[^/\?#]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:ora\.tv|unsafespeech\.com)/([^/]+/)*(?P<id>[^/\?#]+)' + _TESTS = [{ 'url': 'https://www.ora.tv/larrykingnow/2015/12/16/vine-youtube-stars-zach-king-king-bach-on-their-viral-videos-0_36jupg6090pq', 'md5': 'fa33717591c631ec93b04b0e330df786', 'info_dict': { @@ -22,7 +22,10 @@ class OraTVIE(InfoExtractor): 'title': 'Vine & YouTube Stars Zach King & King Bach On Their Viral Videos!', 'description': 'md5:ebbc5b1424dd5dba7be7538148287ac1', } - } + }, { + 'url': 'http://www.unsafespeech.com/video/2016/5/10/student-self-censorship-and-the-thought-police-on-university-campuses-0_6622bnkppw4d', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 514e9b433..c23b314e7 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -2,11 +2,15 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import parse_iso8601 +from ..utils import ( + parse_iso8601, + unescapeHTML, +) class PeriscopeIE(InfoExtractor): IE_DESC = 'Periscope' + IE_NAME = 'periscope' _VALID_URL = r'https?://(?:www\.)?periscope\.tv/[^/]+/(?P<id>[^/?#]+)' # Alive example URLs can be found here http://onperiscope.com/ _TESTS = [{ @@ -41,8 +45,11 @@ class PeriscopeIE(InfoExtractor): broadcast = broadcast_data['broadcast'] status = broadcast['status'] - uploader = broadcast.get('user_display_name') or broadcast_data.get('user', {}).get('display_name') - uploader_id = broadcast.get('user_id') or broadcast_data.get('user', {}).get('id') + user = broadcast_data.get('user', {}) + + uploader = broadcast.get('user_display_name') or user.get('display_name') + uploader_id = (broadcast.get('username') or user.get('username') or + broadcast.get('user_id') or user.get('id')) title = '%s - %s' % (uploader, status) if uploader else status state = broadcast.get('state').lower() @@ -79,3 +86,43 @@ class PeriscopeIE(InfoExtractor): 'thumbnails': thumbnails, 'formats': formats, } + + +class PeriscopeUserIE(InfoExtractor): + _VALID_URL = r'https?://www\.periscope\.tv/(?P<id>[^/]+)/?$' + IE_DESC = 'Periscope user videos' + IE_NAME = 'periscope:user' + + _TEST = { + 'url': 'https://www.periscope.tv/LularoeHusbandMike/', + 'info_dict': { + 'id': 'LularoeHusbandMike', + 'title': 'LULAROE HUSBAND MIKE', + 'description': 'md5:6cf4ec8047768098da58e446e82c82f0', + }, + # Periscope only shows videos in the last 24 hours, so it's possible to + # get 0 videos + 'playlist_mincount': 0, + } + + def _real_extract(self, url): + user_id = self._match_id(url) + + webpage = self._download_webpage(url, user_id) + + data_store = self._parse_json( + unescapeHTML(self._search_regex( + r'data-store=(["\'])(?P<data>.+?)\1', + webpage, 'data store', default='{}', group='data')), + user_id) + + user = data_store.get('User', {}).get('user', {}) + title = user.get('display_name') or user.get('username') + description = user.get('description') + + entries = [ + self.url_result( + 'https://www.periscope.tv/%s/%s' % (user_id, broadcast['id'])) + for broadcast in data_store.get('UserBroadcastHistory', {}).get('broadcasts', [])] + + return self.playlist_result(entries, user_id, title, description) diff --git a/youtube_dl/extractor/playwire.py b/youtube_dl/extractor/playwire.py index 6d138ef25..0bc743118 100644 --- a/youtube_dl/extractor/playwire.py +++ b/youtube_dl/extractor/playwire.py @@ -4,9 +4,8 @@ import re from .common import InfoExtractor from ..utils import ( - xpath_text, + dict_get, float_or_none, - int_or_none, ) @@ -23,6 +22,19 @@ class PlaywireIE(InfoExtractor): 'duration': 145.94, }, }, { + # m3u8 in f4m + 'url': 'http://config.playwire.com/21772/videos/v2/4840492/zeus.json', + 'info_dict': { + 'id': '4840492', + 'ext': 'mp4', + 'title': 'ITV EL SHOW FULL', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # Multiple resolutions while bitrates missing 'url': 'http://cdn.playwire.com/11625/embed/85228.html', 'only_matching': True, }, { @@ -48,25 +60,10 @@ class PlaywireIE(InfoExtractor): thumbnail = content.get('poster') src = content['media']['f4m'] - f4m = self._download_xml(src, video_id) - base_url = xpath_text(f4m, './{http://ns.adobe.com/f4m/1.0}baseURL', 'base url', fatal=True) - formats = [] - for media in f4m.findall('./{http://ns.adobe.com/f4m/1.0}media'): - media_url = media.get('url') - if not media_url: - continue - tbr = int_or_none(media.get('bitrate')) - width = int_or_none(media.get('width')) - height = int_or_none(media.get('height')) - f = { - 'url': '%s/%s' % (base_url, media.attrib['url']), - 'tbr': tbr, - 'width': width, - 'height': height, - } - if not (tbr or width or height): - f['quality'] = 1 if '-hd.' in media_url else 0 - formats.append(f) + formats = self._extract_f4m_formats(src, video_id, m3u8_id='hls') + for a_format in formats: + if not dict_get(a_format, ['tbr', 'width', 'height']): + a_format['quality'] = 1 if '-hd.' in a_format['url'] else 0 self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py new file mode 100644 index 000000000..4f05bbddc --- /dev/null +++ b/youtube_dl/extractor/radiocanada.py @@ -0,0 +1,130 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + xpath_text, + find_xpath_attr, + determine_ext, + int_or_none, + unified_strdate, + xpath_element, + ExtractorError, +) + + +class RadioCanadaIE(InfoExtractor): + IE_NAME = 'radiocanada' + _VALID_URL = r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/mediaconsole/)(?P<app_code>[^:/]+)[:/](?P<id>[0-9]+)' + _TEST = { + 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272', + 'info_dict': { + 'id': '7184272', + 'ext': 'flv', + 'title': 'Le parcours du tireur capté sur vidéo', + 'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa', + 'upload_date': '20141023', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + app_code, video_id = re.match(self._VALID_URL, url).groups() + + formats = [] + # TODO: extract m3u8 and f4m formats + # m3u8 formats can be extracted using ipad device_type return 403 error code when ffmpeg try to download segements + # f4m formats can be extracted using flashhd device_type but they produce unplayable file + for device_type in ('flash',): + v_data = self._download_xml( + 'http://api.radio-canada.ca/validationMedia/v1/Validation.ashx', + video_id, note='Downloading %s XML' % device_type, query={ + 'appCode': app_code, + 'idMedia': video_id, + 'connectionType': 'broadband', + 'multibitrate': 'true', + 'deviceType': device_type, + # paysJ391wsHjbOJwvCs26toz and bypasslock are used to bypass geo-restriction + 'paysJ391wsHjbOJwvCs26toz': 'CA', + 'bypasslock': 'NZt5K62gRqfc', + }) + v_url = xpath_text(v_data, 'url') + if not v_url: + continue + if v_url == 'null': + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, xpath_text(v_data, 'message')), expected=True) + ext = determine_ext(v_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + v_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats(v_url, video_id, f4m_id='hds', fatal=False)) + else: + ext = determine_ext(v_url) + bitrates = xpath_element(v_data, 'bitrates') + for url_e in bitrates.findall('url'): + tbr = int_or_none(url_e.get('bitrate')) + if not tbr: + continue + formats.append({ + 'format_id': 'rtmp-%d' % tbr, + 'url': re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url), + 'ext': 'flv', + 'protocol': 'rtmp', + 'width': int_or_none(url_e.get('width')), + 'height': int_or_none(url_e.get('height')), + 'tbr': tbr, + }) + self._sort_formats(formats) + + metadata = self._download_xml( + 'http://api.radio-canada.ca/metaMedia/v1/index.ashx', + video_id, note='Downloading metadata XML', query={ + 'appCode': app_code, + 'idMedia': video_id, + }) + + def get_meta(name): + el = find_xpath_attr(metadata, './/Meta', 'name', name) + return el.text if el is not None else None + + return { + 'id': video_id, + 'title': get_meta('Title'), + 'description': get_meta('Description') or get_meta('ShortDescription'), + 'thumbnail': get_meta('imageHR') or get_meta('imageMR') or get_meta('imageBR'), + 'duration': int_or_none(get_meta('length')), + 'series': get_meta('Emission'), + 'season_number': int_or_none('SrcSaison'), + 'episode_number': int_or_none('SrcEpisode'), + 'upload_date': unified_strdate(get_meta('Date')), + 'formats': formats, + } + + +class RadioCanadaAudioVideoIE(InfoExtractor): + 'radiocanada:audiovideo' + _VALID_URL = r'https?://ici\.radio-canada\.ca/audio-video/media-(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam', + 'info_dict': { + 'id': '7527184', + 'ext': 'flv', + 'title': 'Barack Obama au Vietnam', + 'description': 'Les États-Unis lèvent l\'embargo sur la vente d\'armes qui datait de la guerre du Vietnam', + 'upload_date': '20160523', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + return self.url_result('radiocanada:medianet:%s' % self._match_id(url)) diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 7ba41ba59..721fc3a9e 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -1,7 +1,12 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + int_or_none, + str_to_int, + unified_strdate, +) class RedTubeIE(InfoExtractor): @@ -13,6 +18,9 @@ class RedTubeIE(InfoExtractor): 'id': '66418', 'ext': 'mp4', 'title': 'Sucked on a toilet', + 'upload_date': '20120831', + 'duration': 596, + 'view_count': int, 'age_limit': 18, } } @@ -24,12 +32,39 @@ class RedTubeIE(InfoExtractor): if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']): raise ExtractorError('Video %s has been removed' % video_id, expected=True) - video_url = self._html_search_regex( - r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL') - video_title = self._html_search_regex( - r'<h1 class="videoTitle[^"]*">(.+?)</h1>', - webpage, 'title') - video_thumbnail = self._og_search_thumbnail(webpage) + title = self._html_search_regex( + (r'<h1 class="videoTitle[^"]*">(?P<title>.+?)</h1>', + r'videoTitle\s*:\s*(["\'])(?P<title>)\1'), + webpage, 'title', group='title') + + formats = [] + sources = self._parse_json( + self._search_regex( + r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'), + video_id, fatal=False) + if sources and isinstance(sources, dict): + for format_id, format_url in sources.items(): + if format_url: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + else: + video_url = self._html_search_regex( + r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL') + formats.append({'url': video_url}) + self._sort_formats(formats) + + thumbnail = self._og_search_thumbnail(webpage) + upload_date = unified_strdate(self._search_regex( + r'<span[^>]+class="added-time"[^>]*>ADDED ([^<]+)<', + webpage, 'upload date', fatal=False)) + duration = int_or_none(self._search_regex( + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) + view_count = str_to_int(self._search_regex( + r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)', + webpage, 'view count', fatal=False)) # No self-labeling, but they describe themselves as # "Home of Videos Porno" @@ -37,9 +72,12 @@ class RedTubeIE(InfoExtractor): return { 'id': video_id, - 'url': video_url, 'ext': 'mp4', - 'title': video_title, - 'thumbnail': video_thumbnail, + 'title': title, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, 'age_limit': age_limit, + 'formats': formats, } diff --git a/youtube_dl/extractor/reuters.py b/youtube_dl/extractor/reuters.py new file mode 100644 index 000000000..961d504eb --- /dev/null +++ b/youtube_dl/extractor/reuters.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + js_to_json, + int_or_none, + unescapeHTML, +) + + +class ReutersIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?reuters\.com/.*?\?.*?videoId=(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.reuters.com/video/2016/05/20/san-francisco-police-chief-resigns?videoId=368575562', + 'md5': '8015113643a0b12838f160b0b81cc2ee', + 'info_dict': { + 'id': '368575562', + 'ext': 'mp4', + 'title': 'San Francisco police chief resigns', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://www.reuters.com/assets/iframe/yovideo?videoId=%s' % video_id, video_id) + video_data = js_to_json(self._search_regex( + r'(?s)Reuters\.yovideo\.drawPlayer\(({.*?})\);', + webpage, 'video data')) + + def get_json_value(key, fatal=False): + return self._search_regex('"%s"\s*:\s*"([^"]+)"' % key, video_data, key, fatal=fatal) + + title = unescapeHTML(get_json_value('title', fatal=True)) + mmid, fid = re.search(r',/(\d+)\?f=(\d+)', get_json_value('flv', fatal=True)).groups() + + mas_data = self._download_json( + 'http://mas-e.cds1.yospace.com/mas/%s/%s?trans=json' % (mmid, fid), + video_id, transform_source=js_to_json) + formats = [] + for f in mas_data: + f_url = f.get('url') + if not f_url: + continue + method = f.get('method') + if method == 'hls': + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + container = f.get('container') + ext = '3gp' if method == 'mobile' else container + formats.append({ + 'format_id': ext, + 'url': f_url, + 'ext': ext, + 'container': container if method != 'mobile' else None, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': get_json_value('thumb'), + 'duration': int_or_none(get_json_value('seconds')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/revision3.py b/youtube_dl/extractor/revision3.py index 99979ebe1..833d8a2f0 100644 --- a/youtube_dl/extractor/revision3.py +++ b/youtube_dl/extractor/revision3.py @@ -13,8 +13,64 @@ from ..utils import ( ) +class Revision3EmbedIE(InfoExtractor): + IE_NAME = 'revision3:embed' + _VALID_URL = r'(?:revision3:(?:(?P<playlist_type>[^:]+):)?|https?://(?:(?:(?:www|embed)\.)?(?:revision3|animalist)|(?:(?:api|embed)\.)?seekernetwork)\.com/player/embed\?videoId=)(?P<playlist_id>\d+)' + _TEST = { + 'url': 'http://api.seekernetwork.com/player/embed?videoId=67558', + 'md5': '83bcd157cab89ad7318dd7b8c9cf1306', + 'info_dict': { + 'id': '67558', + 'ext': 'mp4', + 'title': 'The Pros & Cons Of Zoos', + 'description': 'Zoos are often depicted as a terrible place for animals to live, but is there any truth to this?', + 'uploader_id': 'dnews', + 'uploader': 'DNews', + } + } + _API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('playlist_id') + playlist_type = mobj.group('playlist_type') or 'video_id' + video_data = self._download_json( + 'http://revision3.com/api/getPlaylist.json', playlist_id, query={ + 'api_key': self._API_KEY, + 'codecs': 'h264,vp8,theora', + playlist_type: playlist_id, + })['items'][0] + + formats = [] + for vcodec, media in video_data['media'].items(): + for quality_id, quality in media.items(): + if quality_id == 'hls': + formats.extend(self._extract_m3u8_formats( + quality['url'], playlist_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': quality['url'], + 'format_id': '%s-%s' % (vcodec, quality_id), + 'tbr': int_or_none(quality.get('bitrate')), + 'vcodec': vcodec, + }) + self._sort_formats(formats) + + return { + 'id': playlist_id, + 'title': unescapeHTML(video_data['title']), + 'description': unescapeHTML(video_data.get('summary')), + 'uploader': video_data.get('show', {}).get('name'), + 'uploader_id': video_data.get('show', {}).get('slug'), + 'duration': int_or_none(video_data.get('duration')), + 'formats': formats, + } + + class Revision3IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:revision3|testtube|animalist)\.com)/(?P<id>[^/]+(?:/[^/?#]+)?)' + IE_NAME = 'revision' + _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:revision3|animalist)\.com)/(?P<id>[^/]+(?:/[^/?#]+)?)' _TESTS = [{ 'url': 'http://www.revision3.com/technobuffalo/5-google-predictions-for-2016', 'md5': 'd94a72d85d0a829766de4deb8daaf7df', @@ -32,52 +88,14 @@ class Revision3IE(InfoExtractor): } }, { # Show - 'url': 'http://testtube.com/brainstuff', - 'info_dict': { - 'id': '251', - 'title': 'BrainStuff', - 'description': 'Whether the topic is popcorn or particle physics, you can count on the HowStuffWorks team to explore-and explain-the everyday science in the world around us on BrainStuff.', - }, - 'playlist_mincount': 93, - }, { - 'url': 'https://testtube.com/dnews/5-weird-ways-plants-can-eat-animals?utm_source=FB&utm_medium=DNews&utm_campaign=DNewsSocial', - 'info_dict': { - 'id': '58227', - 'display_id': 'dnews/5-weird-ways-plants-can-eat-animals', - 'duration': 275, - 'ext': 'webm', - 'title': '5 Weird Ways Plants Can Eat Animals', - 'description': 'Why have some plants evolved to eat meat?', - 'upload_date': '20150120', - 'timestamp': 1421763300, - 'uploader': 'DNews', - 'uploader_id': 'dnews', - }, - }, { - 'url': 'http://testtube.com/tt-editors-picks/the-israel-palestine-conflict-explained-in-ten-min', - 'info_dict': { - 'id': '71618', - 'ext': 'mp4', - 'display_id': 'tt-editors-picks/the-israel-palestine-conflict-explained-in-ten-min', - 'title': 'The Israel-Palestine Conflict Explained in Ten Minutes', - 'description': 'If you\'d like to learn about the struggle between Israelis and Palestinians, this video is a great place to start', - 'uploader': 'Editors\' Picks', - 'uploader_id': 'tt-editors-picks', - 'timestamp': 1453309200, - 'upload_date': '20160120', - }, - 'add_ie': ['Youtube'], + 'url': 'http://revision3.com/variant', + 'only_matching': True, }, { # Tag - 'url': 'http://testtube.com/tech-news', - 'info_dict': { - 'id': '21018', - 'title': 'tech news', - }, - 'playlist_mincount': 9, + 'url': 'http://revision3.com/vr', + 'only_matching': True, }] _PAGE_DATA_TEMPLATE = 'http://www.%s/apiProxy/ddn/%s?domain=%s' - _API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62' def _real_extract(self, url): domain, display_id = re.match(self._VALID_URL, url).groups() @@ -119,33 +137,9 @@ class Revision3IE(InfoExtractor): }) return info - video_data = self._download_json( - 'http://revision3.com/api/getPlaylist.json?api_key=%s&codecs=h264,vp8,theora&video_id=%s' % (self._API_KEY, video_id), - video_id)['items'][0] - - formats = [] - for vcodec, media in video_data['media'].items(): - for quality_id, quality in media.items(): - if quality_id == 'hls': - formats.extend(self._extract_m3u8_formats( - quality['url'], video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'url': quality['url'], - 'format_id': '%s-%s' % (vcodec, quality_id), - 'tbr': int_or_none(quality.get('bitrate')), - 'vcodec': vcodec, - }) - self._sort_formats(formats) - info.update({ - 'title': unescapeHTML(video_data['title']), - 'description': unescapeHTML(video_data.get('summary')), - 'uploader': video_data.get('show', {}).get('name'), - 'uploader_id': video_data.get('show', {}).get('slug'), - 'duration': int_or_none(video_data.get('duration')), - 'formats': formats, + '_type': 'url_transparent', + 'url': 'revision3:%s' % video_id, }) return info else: diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 79af47715..f11e3588b 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -6,6 +6,9 @@ import re import time from .common import InfoExtractor +from ..compat import ( + compat_struct_unpack, +) from ..utils import ( ExtractorError, float_or_none, @@ -13,7 +16,6 @@ from ..utils import ( remove_start, sanitized_Request, std_headers, - struct_unpack, ) @@ -21,7 +23,7 @@ def _decrypt_url(png): encrypted_data = base64.b64decode(png.encode('utf-8')) text_index = encrypted_data.find(b'tEXt') text_chunk = encrypted_data[text_index - 4:] - length = struct_unpack('!I', text_chunk[:4])[0] + length = compat_struct_unpack('!I', text_chunk[:4])[0] # Use bytearray to get integers when iterating in both python 2.x and 3.x data = bytearray(text_chunk[8:8 + length]) data = [chr(b) for b in data if b != 0] @@ -62,7 +64,7 @@ def _decrypt_url(png): class RTVEALaCartaIE(InfoExtractor): IE_NAME = 'rtve.es:alacarta' IE_DESC = 'RTVE a la carta' - _VALID_URL = r'https?://www\.rtve\.es/(m/)?alacarta/videos/[^/]+/[^/]+/(?P<id>\d+)' + _VALID_URL = r'https?://www\.rtve\.es/(m/)?(alacarta/videos|filmoteca)/[^/]+/[^/]+/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/', @@ -85,6 +87,9 @@ class RTVEALaCartaIE(InfoExtractor): }, { 'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve', 'only_matching': True, + }, { + 'url': 'http://www.rtve.es/filmoteca/no-do/not-1-introduccion-primer-noticiario-espanol/1465256/', + 'only_matching': True, }] def _real_initialize(self): diff --git a/youtube_dl/extractor/seeker.py b/youtube_dl/extractor/seeker.py new file mode 100644 index 000000000..3b9c65e7e --- /dev/null +++ b/youtube_dl/extractor/seeker.py @@ -0,0 +1,57 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class SeekerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?seeker\.com/(?P<display_id>.*)-(?P<article_id>\d+)\.html' + _TESTS = [{ + # player.loadRevision3Item + 'url': 'http://www.seeker.com/should-trump-be-required-to-release-his-tax-returns-1833805621.html', + 'md5': '30c1dc4030cc715cf05b423d0947ac18', + 'info_dict': { + 'id': '76243', + 'ext': 'webm', + 'title': 'Should Trump Be Required To Release His Tax Returns?', + 'description': 'Donald Trump has been secretive about his "big," "beautiful" tax returns. So what can we learn if he decides to release them?', + 'uploader': 'Seeker Daily', + 'uploader_id': 'seekerdaily', + } + }, { + 'url': 'http://www.seeker.com/changes-expected-at-zoos-following-recent-gorilla-lion-shootings-1834116536.html', + 'playlist': [ + { + 'md5': '83bcd157cab89ad7318dd7b8c9cf1306', + 'info_dict': { + 'id': '67558', + 'ext': 'mp4', + 'title': 'The Pros & Cons Of Zoos', + 'description': 'Zoos are often depicted as a terrible place for animals to live, but is there any truth to this?', + 'uploader': 'DNews', + 'uploader_id': 'dnews', + }, + } + ], + 'info_dict': { + 'id': '1834116536', + 'title': 'After Gorilla Killing, Changes Ahead for Zoos', + 'description': 'The largest association of zoos and others are hoping to learn from recent incidents that led to the shooting deaths of a gorilla and two lions.', + }, + }] + + def _real_extract(self, url): + display_id, article_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, display_id) + mobj = re.search(r"player\.loadRevision3Item\('([^']+)'\s*,\s*(\d+)\);", webpage) + if mobj: + playlist_type, playlist_id = mobj.groups() + return self.url_result( + 'revision3:%s:%s' % (playlist_type, playlist_id), 'Revision3Embed', playlist_id) + else: + entries = [self.url_result('revision3:video_id:%s' % video_id, 'Revision3Embed', video_id) for video_id in re.findall( + r'<iframe[^>]+src=[\'"](?:https?:)?//api\.seekernetwork\.com/player/embed\?videoId=(\d+)', webpage)] + return self.playlist_result( + entries, article_id, self._og_search_title(webpage), self._og_search_description(webpage)) diff --git a/youtube_dl/extractor/sendtonews.py b/youtube_dl/extractor/sendtonews.py new file mode 100644 index 000000000..1c636f672 --- /dev/null +++ b/youtube_dl/extractor/sendtonews.py @@ -0,0 +1,86 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .jwplatform import JWPlatformBaseIE +from ..compat import compat_parse_qs +from ..utils import ( + ExtractorError, + parse_duration, +) + + +class SendtoNewsIE(JWPlatformBaseIE): + _VALID_URL = r'https?://embed\.sendtonews\.com/player/embed\.php\?(?P<query>[^#]+)' + + _TEST = { + # From http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/ + 'url': 'http://embed.sendtonews.com/player/embed.php?SK=GxfCe0Zo7D&MK=175909&PK=5588&autoplay=on&sound=yes', + 'info_dict': { + 'id': 'GxfCe0Zo7D-175909-5588', + 'ext': 'mp4', + 'title': 'Recap: CLE 15, CIN 6', + 'description': '5/16/16: Indians\' bats explode for 15 runs in a win', + 'duration': 49, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + _URL_TEMPLATE = '//embed.sendtonews.com/player/embed.php?SK=%s&MK=%s&PK=%s' + + @classmethod + def _extract_url(cls, webpage): + mobj = re.search(r'''(?x)<script[^>]+src=([\'"]) + (?:https?:)?//embed\.sendtonews\.com/player/responsiveembed\.php\? + .*\bSC=(?P<SC>[0-9a-zA-Z-]+).* + \1>''', webpage) + if mobj: + sk, mk, pk = mobj.group('SC').split('-') + return cls._URL_TEMPLATE % (sk, mk, pk) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + params = compat_parse_qs(mobj.group('query')) + + if 'SK' not in params or 'MK' not in params or 'PK' not in params: + raise ExtractorError('Invalid URL', expected=True) + + video_id = '-'.join([params['SK'][0], params['MK'][0], params['PK'][0]]) + + webpage = self._download_webpage(url, video_id) + + jwplayer_data_str = self._search_regex( + r'jwplayer\("[^"]+"\)\.setup\((.+?)\);', webpage, 'JWPlayer data') + js_vars = { + 'w': 1024, + 'h': 768, + 'modeVar': 'html5', + } + for name, val in js_vars.items(): + js_val = '%d' % val if isinstance(val, int) else '"%s"' % val + jwplayer_data_str = jwplayer_data_str.replace(':%s,' % name, ':%s,' % js_val) + + info_dict = self._parse_jwplayer_data( + self._parse_json(jwplayer_data_str, video_id), + video_id, require_title=False, rtmp_params={'no_resume': True}) + + title = self._html_search_regex( + r'<div[^>]+class="embedTitle">([^<]+)</div>', webpage, 'title') + description = self._html_search_regex( + r'<div[^>]+class="embedSubTitle">([^<]+)</div>', webpage, + 'description', fatal=False) + duration = parse_duration(self._html_search_regex( + r'<div[^>]+class="embedDetails">([0-9:]+)', webpage, + 'duration', fatal=False)) + + info_dict.update({ + 'title': title, + 'description': description, + 'duration': duration, + }) + + return info_dict diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py index d03f1b1d4..8fc66732a 100644 --- a/youtube_dl/extractor/sina.py +++ b/youtube_dl/extractor/sina.py @@ -4,28 +4,35 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlencode -from ..utils import sanitized_Request +from ..utils import ( + HEADRequest, + ExtractorError, + int_or_none, + update_url_query, + qualities, + get_element_by_attribute, + clean_html, +) class SinaIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(.*?\.)?video\.sina\.com\.cn/ - ( - (.+?/(((?P<pseudo_id>\d+).html)|(.*?(\#|(vid=)|b/)(?P<id>\d+?)($|&|\-)))) - | + _VALID_URL = r'''(?x)https?://(?:.*?\.)?video\.sina\.com\.cn/ + (?: + (?:view/|.*\#)(?P<video_id>\d+)| + .+?/(?P<pseudo_id>[^/?#]+)(?:\.s?html)| # This is used by external sites like Weibo - (api/sinawebApi/outplay.php/(?P<token>.+?)\.swf) + api/sinawebApi/outplay.php/(?P<token>.+?)\.swf ) ''' _TESTS = [ { - 'url': 'http://video.sina.com.cn/news/vlist/zt/chczlj2013/?opsubject_id=top12#110028898', - 'md5': 'd65dd22ddcf44e38ce2bf58a10c3e71f', + 'url': 'http://video.sina.com.cn/news/spj/topvideoes20160504/?opsubject_id=top1#250576622', + 'md5': 'd38433e2fc886007729735650ae4b3e9', 'info_dict': { - 'id': '110028898', - 'ext': 'flv', - 'title': '《中国新闻》 朝鲜要求巴拿马立即释放被扣船员', + 'id': '250576622', + 'ext': 'mp4', + 'title': '现场:克鲁兹宣布退选 特朗普将稳获提名', } }, { @@ -35,37 +42,74 @@ class SinaIE(InfoExtractor): 'ext': 'flv', 'title': '军方提高对朝情报监视级别', }, + 'skip': 'the page does not exist or has been deleted', + }, + { + 'url': 'http://video.sina.com.cn/view/250587748.html', + 'md5': '3d1807a25c775092aab3bc157fff49b4', + 'info_dict': { + 'id': '250587748', + 'ext': 'mp4', + 'title': '瞬间泪目:8年前汶川地震珍贵视频首曝光', + }, }, ] - def _extract_video(self, video_id): - data = compat_urllib_parse_urlencode({'vid': video_id}) - url_doc = self._download_xml('http://v.iask.com/v_play.php?%s' % data, - video_id, 'Downloading video url') - image_page = self._download_webpage( - 'http://interface.video.sina.com.cn/interface/common/getVideoImage.php?%s' % data, - video_id, 'Downloading thumbnail info') - - return {'id': video_id, - 'url': url_doc.find('./durl/url').text, - 'ext': 'flv', - 'title': url_doc.find('./vname').text, - 'thumbnail': image_page.split('=')[1], - } - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - if mobj.group('token') is not None: - # The video id is in the redirected url - self.to_screen('Getting video id') - request = sanitized_Request(url) - request.get_method = lambda: 'HEAD' - (_, urlh) = self._download_webpage_handle(request, 'NA', False) - return self._real_extract(urlh.geturl()) - elif video_id is None: - pseudo_id = mobj.group('pseudo_id') - webpage = self._download_webpage(url, pseudo_id) - video_id = self._search_regex(r'vid:\'(\d+?)\'', webpage, 'video id') - return self._extract_video(video_id) + video_id = mobj.group('video_id') + if not video_id: + if mobj.group('token') is not None: + # The video id is in the redirected url + self.to_screen('Getting video id') + request = HEADRequest(url) + (_, urlh) = self._download_webpage_handle(request, 'NA', False) + return self._real_extract(urlh.geturl()) + else: + pseudo_id = mobj.group('pseudo_id') + webpage = self._download_webpage(url, pseudo_id) + error = get_element_by_attribute('class', 'errtitle', webpage) + if error: + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, clean_html(error)), expected=True) + video_id = self._search_regex( + r"video_id\s*:\s*'(\d+)'", webpage, 'video id') + + video_data = self._download_json( + 'http://s.video.sina.com.cn/video/h5play', + video_id, query={'video_id': video_id}) + if video_data['code'] != 1: + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, video_data['message']), expected=True) + else: + video_data = video_data['data'] + title = video_data['title'] + description = video_data.get('description') + if description: + description = description.strip() + + preference = qualities(['cif', 'sd', 'hd', 'fhd', 'ffd']) + formats = [] + for quality_id, quality in video_data.get('videos', {}).get('mp4', {}).items(): + file_api = quality.get('file_api') + file_id = quality.get('file_id') + if not file_api or not file_id: + continue + formats.append({ + 'format_id': quality_id, + 'url': update_url_query(file_api, {'vid': file_id}), + 'preference': preference(quality_id), + 'ext': 'mp4', + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': video_data.get('image'), + 'duration': int_or_none(video_data.get('length')), + 'timestamp': int_or_none(video_data.get('create_time')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 692fd78e8..92a7120a3 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -96,20 +96,18 @@ class SpankwireIE(InfoExtractor): formats = [] for height, video_url in zip(heights, video_urls): path = compat_urllib_parse_urlparse(video_url).path - _, quality = path.split('/')[4].split('_')[:2] - f = { + m = re.search(r'/(?P<height>\d+)[pP]_(?P<tbr>\d+)[kK]', path) + if m: + tbr = int(m.group('tbr')) + height = int(m.group('height')) + else: + tbr = None + formats.append({ 'url': video_url, + 'format_id': '%dp' % height, 'height': height, - } - tbr = self._search_regex(r'^(\d+)[Kk]$', quality, 'tbr', default=None) - if tbr: - f.update({ - 'tbr': int(tbr), - 'format_id': '%dp' % height, - }) - else: - f['format_id'] = quality - formats.append(f) + 'tbr': tbr, + }) self._sort_formats(formats) age_limit = self._rta_search(webpage) diff --git a/youtube_dl/extractor/teachingchannel.py b/youtube_dl/extractor/teachingchannel.py index e0477382c..d14d93e3a 100644 --- a/youtube_dl/extractor/teachingchannel.py +++ b/youtube_dl/extractor/teachingchannel.py @@ -11,6 +11,7 @@ class TeachingChannelIE(InfoExtractor): _TEST = { 'url': 'https://www.teachingchannel.org/videos/teacher-teaming-evolution', + 'md5': '3d6361864d7cac20b57c8784da17166f', 'info_dict': { 'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM', 'ext': 'mp4', @@ -19,9 +20,9 @@ class TeachingChannelIE(InfoExtractor): 'duration': 422.255, }, 'params': { - # m3u8 download 'skip_download': True, }, + 'add_ie': ['Ooyala'], } def _real_extract(self, url): diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index b49ab5f5b..79a778920 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -88,7 +88,7 @@ class TeamcocoIE(InfoExtractor): preload_codes = self._html_search_regex( r'(function.+)setTimeout\(function\(\)\{playlist', webpage, 'preload codes') - base64_fragments = re.findall(r'"([a-zA-z0-9+/=]+)"', preload_codes) + base64_fragments = re.findall(r'"([a-zA-Z0-9+/=]+)"', preload_codes) base64_fragments.remove('init') def _check_sequence(cur_fragments): diff --git a/youtube_dl/extractor/telegraaf.py b/youtube_dl/extractor/telegraaf.py index 6f8333cfc..9092e9b85 100644 --- a/youtube_dl/extractor/telegraaf.py +++ b/youtube_dl/extractor/telegraaf.py @@ -2,14 +2,16 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import remove_end +from ..utils import ( + determine_ext, + remove_end, +) class TelegraafIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?telegraaf\.nl/tv/(?:[^/]+/)+(?P<id>\d+)/[^/]+\.html' _TEST = { 'url': 'http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html', - 'md5': '83245a9779bcc4a24454bfd53c65b6dc', 'info_dict': { 'id': '24353229', 'ext': 'mp4', @@ -18,18 +20,60 @@ class TelegraafIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 33, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): - playlist_id = self._match_id(url) + video_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) + webpage = self._download_webpage(url, video_id) + player_url = self._html_search_regex( + r'<iframe[^>]+src="([^"]+")', webpage, 'player URL') + player_page = self._download_webpage( + player_url, video_id, note='Download player webpage') playlist_url = self._search_regex( - r"iframe\.loadPlayer\('([^']+)'", webpage, 'player') + r'playlist\s*:\s*"([^"]+)"', player_page, 'playlist URL') + playlist_data = self._download_json(playlist_url, video_id) + + item = playlist_data['items'][0] + formats = [] + locations = item['locations'] + for location in locations.get('adaptive', []): + manifest_url = location['src'] + ext = determine_ext(manifest_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, ext='mp4', m3u8_id='hls')) + elif ext == 'mpd': + # TODO: Current DASH formats are broken - $Time$ pattern in + # <SegmentTemplate> not implemented yet + continue + else: + self.report_warning('Unknown adaptive format %s' % ext) + for location in locations.get('progressive', []): + formats.append({ + 'url': location['sources'][0]['src'], + 'width': location.get('width'), + 'height': location.get('height'), + 'format_id': 'http-%s' % location['label'], + }) + + self._sort_formats(formats) - entries = self._extract_xspf_playlist(playlist_url, playlist_id) title = remove_end(self._og_search_title(webpage), ' - VIDEO') description = self._og_search_description(webpage) + duration = item.get('duration') + thumbnail = item.get('poster') - return self.playlist_result(entries, playlist_id, title, description) + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + 'duration': duration, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 3f54b2744..6c848dc6f 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class TF1IE(InfoExtractor): """TF1 uses the wat.tv player.""" - _VALID_URL = r'https?://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/(?:[^/]+/)*(?P<id>.+?)\.html' + _VALID_URL = r'https?://(?:(?:videos|www|lci)\.tf1|(?:www\.)?(?:tfou|ushuaiatv|histoire|tvbreizh))\.fr/(?:[^/]+/)*(?P<id>[^/?#.]+)' _TESTS = [{ 'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', 'info_dict': { @@ -48,6 +48,6 @@ class TF1IE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) wat_id = self._html_search_regex( - r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})(?:#.*?)?\1', + r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8}).*?\1', webpage, 'wat id', group='id') return self.url_result('wat:%s' % wat_id, 'Wat') diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index a25417f94..5793ec6ef 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -14,11 +14,13 @@ from ..compat import ( compat_urllib_parse_urlparse, ) from ..utils import ( + determine_ext, ExtractorError, float_or_none, int_or_none, sanitized_Request, unsmuggle_url, + update_url_query, xpath_with_ns, mimetype2ext, find_xpath_attr, @@ -48,6 +50,12 @@ class ThePlatformBaseIE(OnceIE): if OnceIE.suitable(_format['url']): formats.extend(self._extract_once_formats(_format['url'])) else: + media_url = _format['url'] + if determine_ext(media_url) == 'm3u8': + hdnea2 = self._get_cookies(media_url).get('hdnea2') + if hdnea2: + _format['url'] = update_url_query(media_url, {'hdnea3': hdnea2.value}) + formats.append(_format) subtitles = self._parse_smil_subtitles(meta, default_ns) @@ -151,6 +159,22 @@ class ThePlatformIE(ThePlatformBaseIE): 'only_matching': True, }] + @classmethod + def _extract_urls(cls, webpage): + m = re.search( + r'''(?x) + <meta\s+ + property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+ + content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2 + ''', webpage) + if m: + return [m.group('url')] + + matches = re.findall( + r'<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage) + if matches: + return list(zip(*matches))[1] + @staticmethod def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False): flags = '10' if include_qs else '00' diff --git a/youtube_dl/extractor/thesixtyone.py b/youtube_dl/extractor/thesixtyone.py index d8b1fd281..d63aef5de 100644 --- a/youtube_dl/extractor/thesixtyone.py +++ b/youtube_dl/extractor/thesixtyone.py @@ -12,7 +12,7 @@ class TheSixtyOneIE(InfoExtractor): s| song/comments/list| song - )/(?P<id>[A-Za-z0-9]+)/?$''' + )/(?:[^/]+/)?(?P<id>[A-Za-z0-9]+)/?$''' _SONG_URL_TEMPLATE = 'http://thesixtyone.com/s/{0:}' _SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}/thesixtyone_production/audio/{0:}_stream' _THUMBNAIL_URL_TEMPLATE = '{photo_base_url:}_desktop' @@ -45,6 +45,10 @@ class TheSixtyOneIE(InfoExtractor): 'url': 'http://www.thesixtyone.com/song/SrE3zD7s1jt/', 'only_matching': True, }, + { + 'url': 'http://www.thesixtyone.com/maryatmidnight/song/StrawberriesandCream/yvWtLp0c4GQ/', + 'only_matching': True, + }, ] _DECODE_MAP = { diff --git a/youtube_dl/extractor/threeqsdn.py b/youtube_dl/extractor/threeqsdn.py new file mode 100644 index 000000000..c77a07989 --- /dev/null +++ b/youtube_dl/extractor/threeqsdn.py @@ -0,0 +1,139 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + js_to_json, + mimetype2ext, +) + + +class ThreeQSDNIE(InfoExtractor): + IE_NAME = '3qsdn' + IE_DESC = '3Q SDN' + _VALID_URL = r'https?://playout\.3qsdn\.com/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TESTS = [{ + # ondemand from http://www.philharmonie.tv/veranstaltung/26/ + 'url': 'http://playout.3qsdn.com/0280d6b9-1215-11e6-b427-0cc47a188158?protocol=http', + 'md5': 'ab040e37bcfa2e0c079f92cb1dd7f6cd', + 'info_dict': { + 'id': '0280d6b9-1215-11e6-b427-0cc47a188158', + 'ext': 'mp4', + 'title': '0280d6b9-1215-11e6-b427-0cc47a188158', + 'is_live': False, + }, + 'expected_warnings': ['Failed to download MPD manifest'], + }, { + # live video stream + 'url': 'https://playout.3qsdn.com/d755d94b-4ab9-11e3-9162-0025907ad44f?js=true', + 'info_dict': { + 'id': 'd755d94b-4ab9-11e3-9162-0025907ad44f', + 'ext': 'mp4', + 'title': 'd755d94b-4ab9-11e3-9162-0025907ad44f', + 'is_live': False, + }, + }, { + # live audio stream + 'url': 'http://playout.3qsdn.com/9edf36e0-6bf2-11e2-a16a-9acf09e2db48', + 'only_matching': True, + }, { + # live audio stream with some 404 URLs + 'url': 'http://playout.3qsdn.com/ac5c3186-777a-11e2-9c30-9acf09e2db48', + 'only_matching': True, + }, { + # geo restricted with 'This content is not available in your country' + 'url': 'http://playout.3qsdn.com/d63a3ffe-75e8-11e2-9c30-9acf09e2db48', + 'only_matching': True, + }, { + # geo restricted with 'playout.3qsdn.com/forbidden' + 'url': 'http://playout.3qsdn.com/8e330f26-6ae2-11e2-a16a-9acf09e2db48', + 'only_matching': True, + }, { + # live video with rtmp link + 'url': 'https://playout.3qsdn.com/6092bb9e-8f72-11e4-a173-002590c750be', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+\b(?:data-)?src=(["\'])(?P<url>%s.*?)\1' % ThreeQSDNIE._VALID_URL, webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + + js = self._download_webpage( + 'http://playout.3qsdn.com/%s' % video_id, video_id, + query={'js': 'true'}) + + if any(p in js for p in ( + '>This content is not available in your country', + 'playout.3qsdn.com/forbidden')): + self.raise_geo_restricted() + + stream_content = self._search_regex( + r'streamContent\s*:\s*(["\'])(?P<content>.+?)\1', js, + 'stream content', default='demand', group='content') + + live = stream_content == 'live' + + stream_type = self._search_regex( + r'streamType\s*:\s*(["\'])(?P<type>audio|video)\1', js, + 'stream type', default='video', group='type') + + formats = [] + urls = set() + + def extract_formats(item_url, item={}): + if not item_url or item_url in urls: + return + urls.add(item_url) + type_ = item.get('type') + ext = determine_ext(item_url, default_ext=None) + if type_ == 'application/dash+xml' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + item_url, video_id, mpd_id='mpd', fatal=False)) + elif type_ in ('application/vnd.apple.mpegURL', 'application/x-mpegurl') or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + item_url, video_id, 'mp4', + entry_protocol='m3u8' if live else 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + item_url, video_id, f4m_id='hds', fatal=False)) + else: + if not self._is_valid_url(item_url, video_id): + return + formats.append({ + 'url': item_url, + 'format_id': item.get('quality'), + 'ext': 'mp4' if item_url.startswith('rtsp') else mimetype2ext(type_) or ext, + 'vcodec': 'none' if stream_type == 'audio' else None, + }) + + for item_js in re.findall(r'({.*?\b(?:src|source)\s*:\s*["\'].+?})', js): + f = self._parse_json( + item_js, video_id, transform_source=js_to_json, fatal=False) + if not f: + continue + extract_formats(f.get('src'), f) + + # More relaxed version to collect additional URLs and acting + # as a future-proof fallback + for _, src in re.findall(r'\b(?:src|source)\s*:\s*(["\'])((?:https?|rtsp)://.+?)\1', js): + extract_formats(src) + + self._sort_formats(formats) + + title = self._live_title(video_id) if live else video_id + + return { + 'id': video_id, + 'title': title, + 'is_live': live, + 'formats': formats, + } diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index f57d609d4..a4997cb89 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import re @@ -6,20 +6,13 @@ import re from .common import InfoExtractor -class TvpIE(InfoExtractor): - IE_NAME = 'tvp.pl' - _VALID_URL = r'https?://(?:vod|www)\.tvp\.pl/.*/(?P<id>\d+)$' +class TVPIE(InfoExtractor): + IE_NAME = 'tvp' + IE_DESC = 'Telewizja Polska' + _VALID_URL = r'https?://[^/]+\.tvp\.(?:pl|info)/(?:(?!\d+/)[^/]+/)*(?P<id>\d+)' _TESTS = [{ - 'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem/wideo/odc-2/4278035', - 'md5': 'cdd98303338b8a7f7abab5cd14092bf2', - 'info_dict': { - 'id': '4278035', - 'ext': 'wmv', - 'title': 'Ogniem i mieczem, odc. 2', - }, - }, { - 'url': 'http://vod.tvp.pl/seriale/obyczajowe/czas-honoru/sezon-1-1-13/i-seria-odc-13/194536', + 'url': 'http://vod.tvp.pl/194536/i-seria-odc-13', 'md5': '8aa518c15e5cc32dfe8db400dc921fbb', 'info_dict': { 'id': '194536', @@ -36,12 +29,22 @@ class TvpIE(InfoExtractor): }, }, { 'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272', - 'md5': 'c3b15ed1af288131115ff17a17c19dda', - 'info_dict': { - 'id': '17834272', - 'ext': 'mp4', - 'title': 'Na sygnale, odc. 39', - }, + 'only_matching': True, + }, { + 'url': 'http://wiadomosci.tvp.pl/25169746/24052016-1200', + 'only_matching': True, + }, { + 'url': 'http://krakow.tvp.pl/25511623/25lecie-mck-wyjatkowe-miejsce-na-mapie-krakowa', + 'only_matching': True, + }, { + 'url': 'http://teleexpress.tvp.pl/25522307/wierni-wzieli-udzial-w-procesjach', + 'only_matching': True, + }, { + 'url': 'http://sport.tvp.pl/25522165/krychowiak-uspokaja-w-sprawie-kontuzji-dwa-tygodnie-to-maksimum', + 'only_matching': True, + }, { + 'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji', + 'only_matching': True, }] def _real_extract(self, url): @@ -92,8 +95,8 @@ class TvpIE(InfoExtractor): } -class TvpSeriesIE(InfoExtractor): - IE_NAME = 'tvp.pl:Series' +class TVPSeriesIE(InfoExtractor): + IE_NAME = 'tvp:series' _VALID_URL = r'https?://vod\.tvp\.pl/(?:[^/]+/){2}(?P<id>[^/]+)/?$' _TESTS = [{ @@ -127,7 +130,7 @@ class TvpSeriesIE(InfoExtractor): videos_paths = re.findall( '(?s)class="shortTitle">.*?href="(/[^"]+)', playlist) entries = [ - self.url_result('http://vod.tvp.pl%s' % v_path, ie=TvpIE.ie_key()) + self.url_result('http://vod.tvp.pl%s' % v_path, ie=TVPIE.ie_key()) for v_path in videos_paths] return { diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index e03e2dbaa..4025edf02 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -47,7 +47,8 @@ class TwentyFourVideoIE(InfoExtractor): title = self._og_search_title(webpage) description = self._html_search_regex( - r'<span itemprop="description">([^<]+)</span>', webpage, 'description', fatal=False) + r'<(p|span)[^>]+itemprop="description"[^>]*>(?P<description>[^<]+)</\1>', + webpage, 'description', fatal=False, group='description') thumbnail = self._og_search_thumbnail(webpage) duration = int_or_none(self._og_search_property( 'duration', webpage, 'duration', fatal=False)) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 36ee1adff..d898f14c3 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -171,6 +171,7 @@ class TwitchVideoIE(TwitchItemBaseIE): 'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG', }, 'playlist_mincount': 12, + 'skip': 'HTTP Error 404: Not Found', } @@ -187,6 +188,7 @@ class TwitchChapterIE(TwitchItemBaseIE): 'title': 'ACRL Off Season - Sports Cars @ Nordschleife', }, 'playlist_mincount': 3, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://www.twitch.tv/tsm_theoddone/c/2349361', 'only_matching': True, @@ -258,7 +260,7 @@ class TwitchVodIE(TwitchItemBaseIE): 'nauth': access_token['token'], 'nauthsig': access_token['sig'], })), - item_id, 'mp4') + item_id, 'mp4', entry_protocol='m3u8_native') self._prefer_source(formats) info['formats'] = formats @@ -355,31 +357,6 @@ class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE): } -class TwitchBookmarksIE(TwitchPlaylistBaseIE): - IE_NAME = 'twitch:bookmarks' - _VALID_URL = r'%s/(?P<id>[^/]+)/profile/bookmarks/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE - _PLAYLIST_URL = '%s/api/bookmark/?user=%%s&offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE - _PLAYLIST_TYPE = 'bookmarks' - - _TEST = { - 'url': 'http://www.twitch.tv/ognos/profile/bookmarks', - 'info_dict': { - 'id': 'ognos', - 'title': 'Ognos', - }, - 'playlist_mincount': 3, - } - - def _extract_playlist_page(self, response): - entries = [] - for bookmark in response.get('bookmarks', []): - video = bookmark.get('video') - if not video: - continue - entries.append(video['url']) - return entries - - class TwitchStreamIE(TwitchBaseIE): IE_NAME = 'twitch:stream' _VALID_URL = r'%s/(?P<id>[^/#?]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index ea673054f..b73842986 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..utils import ( + determine_ext, float_or_none, xpath_text, remove_end, @@ -52,7 +53,7 @@ class TwitterCardIE(TwitterBaseIE): 'id': 'dq4Oj5quskI', 'ext': 'mp4', 'title': 'Ubuntu 11.10 Overview', - 'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10-things-to-do-after-installing-ubuntu-11-10/', + 'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10...', 'upload_date': '20111013', 'uploader': 'OMG! Ubuntu!', 'uploader_id': 'omgubuntu', @@ -116,13 +117,16 @@ class TwitterCardIE(TwitterBaseIE): video_url = config.get('video_url') or config.get('playlist', [{}])[0].get('source') if video_url: - f = { - 'url': video_url, - } + if determine_ext(video_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats(video_url, video_id, ext='mp4', m3u8_id='hls')) + else: + f = { + 'url': video_url, + } - _search_dimensions_in_video_url(f, video_url) + _search_dimensions_in_video_url(f, video_url) - formats.append(f) + formats.append(f) vmap_url = config.get('vmapUrl') or config.get('vmap_url') if vmap_url: @@ -207,6 +211,7 @@ class TwitterIE(InfoExtractor): 'uploader_id': 'giphz', }, 'expected_warnings': ['height', 'width'], + 'skip': 'Account suspended', }, { 'url': 'https://twitter.com/starwars/status/665052190608723968', 'md5': '39b7199856dee6cd4432e72c74bc69d4', @@ -239,10 +244,10 @@ class TwitterIE(InfoExtractor): 'info_dict': { 'id': '700207533655363584', 'ext': 'mp4', - 'title': 'jay - BEAT PROD: @suhmeduh #Damndaniel', - 'description': 'jay on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', + 'title': 'Donte The Dumbass - BEAT PROD: @suhmeduh #Damndaniel', + 'description': 'Donte The Dumbass on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', 'thumbnail': 're:^https?://.*\.jpg', - 'uploader': 'jay', + 'uploader': 'Donte The Dumbass', 'uploader_id': 'jaydingeer', }, 'params': { @@ -262,7 +267,6 @@ class TwitterIE(InfoExtractor): 'add_ie': ['Vine'], }, { 'url': 'https://twitter.com/captainamerica/status/719944021058060289', - # md5 constantly changes 'info_dict': { 'id': '719944021058060289', 'ext': 'mp4', @@ -271,6 +275,9 @@ class TwitterIE(InfoExtractor): 'uploader_id': 'captainamerica', 'uploader': 'Captain America', }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, }] def _real_extract(self, url): @@ -278,7 +285,11 @@ class TwitterIE(InfoExtractor): user_id = mobj.group('user_id') twid = mobj.group('id') - webpage = self._download_webpage(self._TEMPLATE_URL % (user_id, twid), twid) + webpage, urlh = self._download_webpage_handle( + self._TEMPLATE_URL % (user_id, twid), twid) + + if 'twitter.com/account/suspended' in urlh.geturl(): + raise ExtractorError('Account suspended by Twitter.', expected=True) username = remove_end(self._og_search_title(webpage), ' on Twitter') diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index d1e6f2703..89b869559 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..compat import ( compat_HTTPError, - compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, ) @@ -84,18 +83,19 @@ class UdemyIE(InfoExtractor): if enroll_url: webpage = self._download_webpage( combine_url(base_url, enroll_url), - course_id, 'Enrolling in the course') + course_id, 'Enrolling in the course', + headers={'Referer': base_url}) if '>You have enrolled in' in webpage: self.to_screen('%s: Successfully enrolled in the course' % course_id) def _download_lecture(self, course_id, lecture_id): return self._download_json( - 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?%s' % ( - course_id, lecture_id, compat_urllib_parse_urlencode({ - 'fields[lecture]': 'title,description,view_html,asset', - 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data', - })), - lecture_id, 'Downloading lecture JSON') + 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?' + % (course_id, lecture_id), + lecture_id, 'Downloading lecture JSON', query={ + 'fields[lecture]': 'title,description,view_html,asset', + 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data', + }) def _handle_error(self, response): if not isinstance(response, dict): @@ -142,7 +142,9 @@ class UdemyIE(InfoExtractor): self._LOGIN_URL, None, 'Downloading login popup') def is_logged(webpage): - return any(p in webpage for p in ['href="https://www.udemy.com/user/logout/', '>Logout<']) + return any(re.search(p, webpage) for p in ( + r'href=["\'](?:https://www\.udemy\.com)?/user/logout/', + r'>Logout<')) # already logged in if is_logged(login_popup): @@ -155,13 +157,13 @@ class UdemyIE(InfoExtractor): 'password': password, }) - request = sanitized_Request( - self._LOGIN_URL, urlencode_postdata(login_form)) - request.add_header('Referer', self._ORIGIN_URL) - request.add_header('Origin', self._ORIGIN_URL) - response = self._download_webpage( - request, None, 'Logging in as %s' % username) + self._LOGIN_URL, None, 'Logging in as %s' % username, + data=urlencode_postdata(login_form), + headers={ + 'Referer': self._ORIGIN_URL, + 'Origin': self._ORIGIN_URL, + }) if not is_logged(response): error = self._html_search_regex( diff --git a/youtube_dl/extractor/udn.py b/youtube_dl/extractor/udn.py index ee35b7227..57dd73aef 100644 --- a/youtube_dl/extractor/udn.py +++ b/youtube_dl/extractor/udn.py @@ -2,10 +2,13 @@ from __future__ import unicode_literals import json +import re + from .common import InfoExtractor from ..utils import ( + determine_ext, + int_or_none, js_to_json, - ExtractorError, ) from ..compat import compat_urlparse @@ -16,13 +19,16 @@ class UDNEmbedIE(InfoExtractor): _VALID_URL = r'https?:' + _PROTOCOL_RELATIVE_VALID_URL _TESTS = [{ 'url': 'http://video.udn.com/embed/news/300040', - 'md5': 'de06b4c90b042c128395a88f0384817e', 'info_dict': { 'id': '300040', 'ext': 'mp4', 'title': '生物老師男變女 全校挺"做自己"', 'thumbnail': 're:^https?://.*\.jpg$', - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'https://video.udn.com/embed/news/300040', 'only_matching': True, @@ -38,39 +44,53 @@ class UDNEmbedIE(InfoExtractor): page = self._download_webpage(url, video_id) options = json.loads(js_to_json(self._html_search_regex( - r'var options\s*=\s*([^;]+);', page, 'video urls dictionary'))) + r'var\s+options\s*=\s*([^;]+);', page, 'video urls dictionary'))) video_urls = options['video'] if video_urls.get('youtube'): return self.url_result(video_urls.get('youtube'), 'Youtube') - try: - del video_urls['youtube'] - except KeyError: - pass + formats = [] + for video_type, api_url in video_urls.items(): + if not api_url: + continue - formats = [{ - 'url': self._download_webpage( + video_url = self._download_webpage( compat_urlparse.urljoin(url, api_url), video_id, - 'retrieve url for %s video' % video_type), - 'format_id': video_type, - 'preference': 0 if video_type == 'mp4' else -1, - } for video_type, api_url in video_urls.items() if api_url] + note='retrieve url for %s video' % video_type) - if not formats: - raise ExtractorError('No videos found', expected=True) + ext = determine_ext(video_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, ext='mp4', m3u8_id='hls')) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + video_url, video_id, f4m_id='hds')) + else: + mobj = re.search(r'_(?P<height>\d+)p_(?P<tbr>\d+).mp4', video_url) + a_format = { + 'url': video_url, + # video_type may be 'mp4', which confuses YoutubeDL + 'format_id': 'http-' + video_type, + } + if mobj: + a_format.update({ + 'height': int_or_none(mobj.group('height')), + 'tbr': int_or_none(mobj.group('tbr')), + }) + formats.append(a_format) self._sort_formats(formats) - thumbnail = None - - if options.get('gallery') and len(options['gallery']): - thumbnail = options['gallery'][0].get('original') + thumbnails = [{ + 'url': img_url, + 'id': img_type, + } for img_type, img_url in options.get('gallery', [{}])[0].items() if img_url] return { 'id': video_id, 'formats': formats, 'title': options['title'], - 'thumbnail': thumbnail + 'thumbnails': thumbnails, } diff --git a/youtube_dl/extractor/ustudio.py b/youtube_dl/extractor/ustudio.py index cafc082b6..3484a2046 100644 --- a/youtube_dl/extractor/ustudio.py +++ b/youtube_dl/extractor/ustudio.py @@ -6,10 +6,12 @@ from .common import InfoExtractor from ..utils import ( int_or_none, unified_strdate, + unescapeHTML, ) class UstudioIE(InfoExtractor): + IE_NAME = 'ustudio' _VALID_URL = r'https?://(?:(?:www|v1)\.)?ustudio\.com/video/(?P<id>[^/]+)/(?P<display_id>[^/?#&]+)' _TEST = { 'url': 'http://ustudio.com/video/Uxu2my9bgSph/san_francisco_golden_gate_bridge', @@ -27,9 +29,7 @@ class UstudioIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') + video_id, display_id = re.match(self._VALID_URL, url).groups() config = self._download_xml( 'http://v1.ustudio.com/embed/%s/ustudio/config.xml' % video_id, @@ -37,7 +37,7 @@ class UstudioIE(InfoExtractor): def extract(kind): return [{ - 'url': item.attrib['url'], + 'url': unescapeHTML(item.attrib['url']), 'width': int_or_none(item.get('width')), 'height': int_or_none(item.get('height')), } for item in config.findall('./qualities/quality/%s' % kind) if item.get('url')] @@ -65,3 +65,61 @@ class UstudioIE(InfoExtractor): 'uploader': uploader, 'formats': formats, } + + +class UstudioEmbedIE(InfoExtractor): + IE_NAME = 'ustudio:embed' + _VALID_URL = r'https?://(?:(?:app|embed)\.)?ustudio\.com/embed/(?P<uid>[^/]+)/(?P<id>[^/]+)' + _TEST = { + 'url': 'http://app.ustudio.com/embed/DeN7VdYRDKhP/Uw7G1kMCe65T', + 'md5': '47c0be52a09b23a7f40de9469cec58f4', + 'info_dict': { + 'id': 'Uw7G1kMCe65T', + 'ext': 'mp4', + 'title': '5 Things IT Should Know About Video', + 'description': 'md5:93d32650884b500115e158c5677d25ad', + 'uploader_id': 'DeN7VdYRDKhP', + } + } + + def _real_extract(self, url): + uploader_id, video_id = re.match(self._VALID_URL, url).groups() + video_data = self._download_json( + 'http://app.ustudio.com/embed/%s/%s/config.json' % (uploader_id, video_id), + video_id)['videos'][0] + title = video_data['name'] + + formats = [] + for ext, qualities in video_data.get('transcodes', {}).items(): + for quality in qualities: + quality_url = quality.get('url') + if not quality_url: + continue + height = int_or_none(quality.get('height')) + formats.append({ + 'format_id': '%s-%dp' % (ext, height) if height else ext, + 'url': quality_url, + 'width': int_or_none(quality.get('width')), + 'height': height, + }) + self._sort_formats(formats) + + thumbnails = [] + for image in video_data.get('images', []): + image_url = image.get('url') + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + }) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'duration': int_or_none(video_data.get('duration')), + 'uploader_id': uploader_id, + 'tags': video_data.get('keywords'), + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py index 23ce0a0d1..0f5d68738 100644 --- a/youtube_dl/extractor/veoh.py +++ b/youtube_dl/extractor/veoh.py @@ -37,6 +37,7 @@ class VeohIE(InfoExtractor): 'uploader': 'afp-news', 'duration': 123, }, + 'skip': 'This video has been deleted.', }, { 'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX', diff --git a/youtube_dl/extractor/vessel.py b/youtube_dl/extractor/vessel.py index 1a0ff3395..2cd617b91 100644 --- a/youtube_dl/extractor/vessel.py +++ b/youtube_dl/extractor/vessel.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import json +import re from .common import InfoExtractor from ..utils import ( @@ -12,11 +13,11 @@ from ..utils import ( class VesselIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vessel\.com/videos/(?P<id>[0-9a-zA-Z]+)' + _VALID_URL = r'https?://(?:www\.)?vessel\.com/(?:videos|embed)/(?P<id>[0-9a-zA-Z]+)' _API_URL_TEMPLATE = 'https://www.vessel.com/api/view/items/%s' _LOGIN_URL = 'https://www.vessel.com/api/account/login' _NETRC_MACHINE = 'vessel' - _TEST = { + _TESTS = [{ 'url': 'https://www.vessel.com/videos/HDN7G5UMs', 'md5': '455cdf8beb71c6dd797fd2f3818d05c4', 'info_dict': { @@ -28,7 +29,16 @@ class VesselIE(InfoExtractor): 'description': 'Did Nvidia pull out all the stops on the Titan X, or does its performance leave something to be desired?', 'timestamp': int, }, - } + }, { + 'url': 'https://www.vessel.com/embed/G4U7gUJ6a?w=615&h=346', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [url for _, url in re.findall( + r'<iframe[^>]+src=(["\'])((?:https?:)?//(?:www\.)?vessel\.com/embed/[0-9a-zA-Z]+.*?)\1', + webpage)] @staticmethod def make_json_request(url, data): @@ -98,16 +108,24 @@ class VesselIE(InfoExtractor): formats = [] for f in video_asset.get('sources', []): - if f['name'] == 'hls-index': + location = f.get('location') + if not location: + continue + name = f.get('name') + if name == 'hls-index': formats.extend(self._extract_m3u8_formats( - f['location'], video_id, ext='mp4', m3u8_id='m3u8')) + location, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='m3u8', fatal=False)) + elif name == 'dash-index': + formats.extend(self._extract_mpd_formats( + location, video_id, mpd_id='dash', fatal=False)) else: formats.append({ - 'format_id': f['name'], + 'format_id': name, 'tbr': f.get('bitrate'), 'height': f.get('height'), 'width': f.get('width'), - 'url': f['location'], + 'url': location, }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index c0ef08c02..388b4debe 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -203,7 +203,8 @@ class VevoIE(VevoBaseIE): json_url = 'http://api.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id response = self._download_json( - json_url, video_id, 'Downloading video info', 'Unable to download info') + json_url, video_id, 'Downloading video info', + 'Unable to download info', fatal=False) or {} video_info = response.get('video') or {} artist = None featured_artist = None @@ -212,19 +213,17 @@ class VevoIE(VevoBaseIE): formats = [] if not video_info: - if response.get('statusCode') != 909: + try: + self._initialize_api(video_id) + except ExtractorError: ytid = response.get('errorInfo', {}).get('ytid') if ytid: self.report_warning( 'Video is geoblocked, trying with the YouTube video %s' % ytid) return self.url_result(ytid, 'Youtube', ytid) - if 'statusMessage' in response: - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, response['statusMessage']), expected=True) - raise ExtractorError('Unable to extract videos') + raise - self._initialize_api(video_id) video_info = self._call_api( 'video/%s' % video_id, video_id, 'Downloading api video info', 'Failed to download video info') diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index 95daf4dfd..e2b2ce098 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -11,12 +11,14 @@ class ViceIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.vice.com/video/cowboy-capitalists-part-1', + 'md5': 'e9d77741f9e42ba583e683cd170660f7', 'info_dict': { 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', 'ext': 'flv', 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', 'duration': 725.983, }, + 'add_ie': ['Ooyala'], }, { 'url': 'http://www.vice.com/video/how-to-hack-a-car', 'md5': '6fb2989a3fed069fb8eab3401fc2d3c9', @@ -29,6 +31,7 @@ class ViceIE(InfoExtractor): 'uploader': 'Motherboard', 'upload_date': '20140529', }, + 'add_ie': ['Youtube'], }, { 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', 'only_matching': True, diff --git a/youtube_dl/extractor/vidio.py b/youtube_dl/extractor/vidio.py new file mode 100644 index 000000000..6898042de --- /dev/null +++ b/youtube_dl/extractor/vidio.py @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class VidioIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vidio\.com/watch/(?P<id>\d+)-(?P<display_id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015', + 'md5': 'cd2801394afc164e9775db6a140b91fe', + 'info_dict': { + 'id': '165683', + 'display_id': 'dj_ambred-booyah-live-2015', + 'ext': 'mp4', + 'title': 'DJ_AMBRED - Booyah (Live 2015)', + 'description': 'md5:27dc15f819b6a78a626490881adbadf8', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 149, + 'like_count': int, + }, + }, { + 'url': 'https://www.vidio.com/watch/77949-south-korea-test-fires-missile-that-can-strike-all-of-the-north', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id, display_id = mobj.group('id', 'display_id') + + webpage = self._download_webpage(url, display_id) + + title = self._og_search_title(webpage) + + m3u8_url, duration, thumbnail = [None] * 3 + + clips = self._parse_json( + self._html_search_regex( + r'data-json-clips\s*=\s*(["\'])(?P<data>\[.+?\])\1', + webpage, 'video data', default='[]', group='data'), + display_id, fatal=False) + if clips: + clip = clips[0] + m3u8_url = clip.get('sources', [{}])[0].get('file') + duration = clip.get('clip_duration') + thumbnail = clip.get('image') + + m3u8_url = m3u8_url or self._search_regex( + r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>.+?)\1', webpage, 'hls url') + formats = self._extract_m3u8_formats(m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native') + + duration = int_or_none(duration or self._search_regex( + r'data-video-duration=(["\'])(?P<duartion>\d+)\1', webpage, 'duration')) + thumbnail = thumbnail or self._og_search_thumbnail(webpage) + + like_count = int_or_none(self._search_regex( + (r'<span[^>]+data-comment-vote-count=["\'](\d+)', + r'<span[^>]+class=["\'].*?\blike(?:__|-)count\b.*?["\'][^>]*>\s*(\d+)'), + webpage, 'like count', fatal=False)) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'thumbnail': thumbnail, + 'duration': duration, + 'like_count': like_count, + 'formats': formats, + } diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py index dd4a13a4a..19500eba8 100644 --- a/youtube_dl/extractor/viewlift.py +++ b/youtube_dl/extractor/viewlift.py @@ -141,6 +141,10 @@ class ViewLiftIE(ViewLiftBaseIE): }, { 'url': 'http://www.kesari.tv/news/video/1461919076414', 'only_matching': True, + }, { + # Was once Kaltura embed + 'url': 'https://www.monumentalsportsnetwork.com/videos/john-carlson-postgame-2-25-15', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 67220f1b7..79c819bc3 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -26,12 +26,16 @@ class VKIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:m\.)?vk\.com/video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)| + (?: + (?:m\.)?vk\.com/video_| + (?:www\.)?daxab.com/ + ) + ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)| (?: (?:m\.)?vk\.com/(?:.+?\?.*?z=)?video| - (?:www\.)?biqle\.ru/watch/ + (?:www\.)?daxab.com/embed/ ) - (?P<videoid>[^s].*?)(?:\?(?:.*\blist=(?P<list_id>[\da-f]+))?|%2F|$) + (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))? ) ''' _NETRC_MACHINE = 'vk' @@ -75,7 +79,8 @@ class VKIE(InfoExtractor): 'duration': 101, 'upload_date': '20120730', 'view_count': int, - } + }, + 'skip': 'This video has been removed from public access.', }, { # VIDEO NOW REMOVED @@ -142,7 +147,7 @@ class VKIE(InfoExtractor): 'id': 'V3K4mi0SYkc', 'ext': 'webm', 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate", - 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a', + 'description': 'md5:d9903938abdc74c738af77f527ca0596', 'duration': 178, 'upload_date': '20130116', 'uploader': "Children's Joy Foundation", @@ -174,11 +179,6 @@ class VKIE(InfoExtractor): 'only_matching': True, }, { - # vk wrapper - 'url': 'http://www.biqle.ru/watch/847655_160197695', - 'only_matching': True, - }, - { # pladform embed 'url': 'https://vk.com/video-76116461_171554880', 'only_matching': True, @@ -217,20 +217,21 @@ class VKIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') - if not video_id: + if video_id: + info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id + # Some videos (removed?) can only be downloaded with list id specified + list_id = mobj.group('list_id') + if list_id: + info_url += '&list=%s' % list_id + else: + info_url = 'http://vk.com/video_ext.php?' + mobj.group('embed_query') video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id')) - info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id - - # Some videos (removed?) can only be downloaded with list id specified - list_id = mobj.group('list_id') - if list_id: - info_url += '&list=%s' % list_id - info_page = self._download_webpage(info_url, video_id) error_message = self._html_search_regex( - r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>', + [r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>', + r'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'], info_page, 'error message', default=None) if error_message: raise ExtractorError(error_message, expected=True) @@ -305,17 +306,17 @@ class VKIE(InfoExtractor): view_count = None views = self._html_search_regex( r'"mv_views_count_number"[^>]*>(.+?\bviews?)<', - info_page, 'view count', fatal=False) + info_page, 'view count', default=None) if views: view_count = str_to_int(self._search_regex( r'([\d,.]+)', views, 'view count', fatal=False)) formats = [] for k, v in data.items(): - if not k.startswith('url') and k != 'extra_data' or not v: + if not k.startswith('url') and not k.startswith('cache') and k != 'extra_data' or not v: continue height = int_or_none(self._search_regex( - r'^url(\d+)', k, 'height', default=None)) + r'^(?:url|cache)(\d+)', k, 'height', default=None)) formats.append({ 'format_id': k, 'url': v, diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index a672ea9c5..8d671cca7 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -1,8 +1,7 @@ # coding: utf-8 -from __future__ import division, unicode_literals +from __future__ import unicode_literals import re -import time from .common import InfoExtractor from ..utils import ( @@ -10,6 +9,7 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + remove_start, ) from ..compat import compat_urllib_parse_urlencode @@ -23,7 +23,7 @@ class VLiveIE(InfoExtractor): 'info_dict': { 'id': '1326', 'ext': 'mp4', - 'title': "[V] Girl's Day's Broadcast", + 'title': "[V LIVE] Girl's Day's Broadcast", 'creator': "Girl's Day", 'view_count': int, }, @@ -35,24 +35,12 @@ class VLiveIE(InfoExtractor): webpage = self._download_webpage( 'http://www.vlive.tv/video/%s' % video_id, video_id) - # UTC+x - UTC+9 (KST) - tz = time.altzone if time.localtime().tm_isdst == 1 else time.timezone - tz_offset = -tz // 60 - 9 * 60 - self._set_cookie('vlive.tv', 'timezoneOffset', '%d' % tz_offset) - - status_params = self._download_json( - 'http://www.vlive.tv/video/status?videoSeq=%s' % video_id, - video_id, 'Downloading JSON status', - headers={'Referer': url.encode('utf-8')}) - status = status_params.get('status') - air_start = status_params.get('onAirStartAt', '') - is_live = status_params.get('isLive') - video_params = self._search_regex( - r'vlive\.tv\.video\.ajax\.request\.handler\.init\((.+)\)', + r'\bvlive\.video\.init\(([^)]+)\)', webpage, 'video params') - live_params, long_video_id, key = re.split( - r'"\s*,\s*"', video_params)[1:4] + status, _, _, live_params, long_video_id, key = re.split( + r'"\s*,\s*"', video_params)[2:8] + status = remove_start(status, 'PRODUCT_') if status == 'LIVE_ON_AIR' or status == 'BIG_EVENT_ON_AIR': live_params = self._parse_json('"%s"' % live_params, video_id) @@ -61,8 +49,6 @@ class VLiveIE(InfoExtractor): elif status == 'VOD_ON_AIR' or status == 'BIG_EVENT_INTRO': if long_video_id and key: return self._replay(video_id, webpage, long_video_id, key) - elif is_live: - status = 'LIVE_END' else: status = 'COMING_SOON' @@ -70,7 +56,7 @@ class VLiveIE(InfoExtractor): raise ExtractorError('Uploading for replay. Please wait...', expected=True) elif status == 'COMING_SOON': - raise ExtractorError('Coming soon! %s' % air_start, expected=True) + raise ExtractorError('Coming soon!', expected=True) elif status == 'CANCELED': raise ExtractorError('We are sorry, ' 'but the live broadcast has been canceled.', diff --git a/youtube_dl/extractor/voxmedia.py b/youtube_dl/extractor/voxmedia.py index 0c6b1f030..b1b32ad44 100644 --- a/youtube_dl/extractor/voxmedia.py +++ b/youtube_dl/extractor/voxmedia.py @@ -15,7 +15,8 @@ class VoxMediaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Google\'s new material design direction', 'description': 'md5:2f44f74c4d14a1f800ea73e1c6832ad2', - } + }, + 'add_ie': ['Ooyala'], }, { # data-ooyala-id 'url': 'http://www.theverge.com/2014/10/21/7025853/google-nexus-6-hands-on-photos-video-android-phablet', @@ -25,7 +26,8 @@ class VoxMediaIE(InfoExtractor): 'ext': 'mp4', 'title': 'The Nexus 6: hands-on with Google\'s phablet', 'description': 'md5:87a51fe95ff8cea8b5bdb9ac7ae6a6af', - } + }, + 'add_ie': ['Ooyala'], }, { # volume embed 'url': 'http://www.vox.com/2016/3/31/11336640/mississippi-lgbt-religious-freedom-bill', @@ -35,7 +37,8 @@ class VoxMediaIE(InfoExtractor): 'ext': 'mp4', 'title': 'The new frontier of LGBTQ civil rights, explained', 'description': 'md5:0dc58e94a465cbe91d02950f770eb93f', - } + }, + 'add_ie': ['Ooyala'], }, { # youtube embed 'url': 'http://www.vox.com/2016/3/24/11291692/robot-dance', @@ -48,7 +51,8 @@ class VoxMediaIE(InfoExtractor): 'upload_date': '20160324', 'uploader_id': 'voxdotcom', 'uploader': 'Vox', - } + }, + 'add_ie': ['Youtube'], }, { # SBN.VideoLinkset.entryGroup multiple ooyala embeds 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok', @@ -117,7 +121,7 @@ class VoxMediaIE(InfoExtractor): volume_webpage = self._download_webpage( 'http://volume.vox-cdn.com/embed/%s' % volume_uuid, volume_uuid) video_data = self._parse_json(self._search_regex( - r'Volume\.createVideo\(({.+})\s*,\s*{.*}\);', volume_webpage, 'video data'), volume_uuid) + r'Volume\.createVideo\(({.+})\s*,\s*{.*}\s*,\s*\[.*\]\s*,\s*{.*}\);', volume_webpage, 'video data'), volume_uuid) for provider_video_type in ('ooyala', 'youtube'): provider_video_id = video_data.get('%s_id' % provider_video_type) if provider_video_id: diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py index ec8b99998..839cad986 100644 --- a/youtube_dl/extractor/washingtonpost.py +++ b/youtube_dl/extractor/washingtonpost.py @@ -11,7 +11,96 @@ from ..utils import ( class WashingtonPostIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])' + IE_NAME = 'washingtonpost' + _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TEST = { + 'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d', + 'md5': '6f537e1334b714eb15f9563bd4b9cdfa', + 'info_dict': { + 'id': '480ba4ee-1ec7-11e6-82c2-a7dcb313287d', + 'ext': 'mp4', + 'title': 'Egypt finds belongings, debris from plane crash', + 'description': 'md5:a17ceee432f215a5371388c1f680bd86', + 'upload_date': '20160520', + 'uploader': 'Reuters', + 'timestamp': 1463778452, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json( + 'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % video_id, + video_id, transform_source=strip_jsonp)[0]['contentConfig'] + title = video_data['title'] + + urls = [] + formats = [] + for s in video_data.get('streams', []): + s_url = s.get('url') + if not s_url or s_url in urls: + continue + urls.append(s_url) + video_type = s.get('type') + if video_type == 'smil': + continue + elif video_type in ('ts', 'hls') and ('_master.m3u8' in s_url or '_mobile.m3u8' in s_url): + m3u8_formats = self._extract_m3u8_formats( + s_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + for m3u8_format in m3u8_formats: + width = m3u8_format.get('width') + if not width: + continue + vbr = self._search_regex( + r'%d_%d_(\d+)' % (width, m3u8_format['height']), m3u8_format['url'], 'vbr', default=None) + if vbr: + m3u8_format.update({ + 'vbr': int_or_none(vbr), + }) + formats.extend(m3u8_formats) + else: + width = int_or_none(s.get('width')) + vbr = int_or_none(s.get('bitrate')) + has_width = width != 0 + formats.append({ + 'format_id': ( + '%s-%d-%d' % (video_type, width, vbr) + if width + else video_type), + 'vbr': vbr if has_width else None, + 'width': width, + 'height': int_or_none(s.get('height')), + 'acodec': s.get('audioCodec'), + 'vcodec': s.get('videoCodec') if has_width else 'none', + 'filesize': int_or_none(s.get('fileSize')), + 'url': s_url, + 'ext': 'mp4', + 'protocol': 'm3u8_native' if video_type in ('ts', 'hls') else None, + }) + source_media_url = video_data.get('sourceMediaURL') + if source_media_url: + formats.append({ + 'format_id': 'source_media', + 'url': source_media_url, + }) + self._sort_formats( + formats, ('width', 'height', 'vbr', 'filesize', 'tbr', 'format_id')) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('blurb'), + 'uploader': video_data.get('credits', {}).get('source'), + 'formats': formats, + 'duration': int_or_none(video_data.get('videoDuration'), 100), + 'timestamp': int_or_none( + video_data.get('dateConfig', {}).get('dateFirstPublished'), 1000), + } + + +class WashingtonPostArticleIE(InfoExtractor): + IE_NAME = 'washingtonpost:article' + _VALID_URL = r'https?://(?:www\.)?washingtonpost\.com/(?:[^/]+/)*(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/', 'info_dict': { @@ -63,6 +152,10 @@ class WashingtonPostIE(InfoExtractor): }] }] + @classmethod + def suitable(cls, url): + return False if WashingtonPostIE.suitable(url) else super(WashingtonPostArticleIE, cls).suitable(url) + def _real_extract(self, url): page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) @@ -74,54 +167,7 @@ class WashingtonPostIE(InfoExtractor): <div\s+class="posttv-video-embed[^>]*?data-uuid=| data-video-uuid= )"([^"]+)"''', webpage) - entries = [] - for i, uuid in enumerate(uuids, start=1): - vinfo_all = self._download_json( - 'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % uuid, - page_id, - transform_source=strip_jsonp, - note='Downloading information of video %d/%d' % (i, len(uuids)) - ) - vinfo = vinfo_all[0]['contentConfig'] - uploader = vinfo.get('credits', {}).get('source') - timestamp = int_or_none( - vinfo.get('dateConfig', {}).get('dateFirstPublished'), 1000) - - formats = [{ - 'format_id': ( - '%s-%s-%s' % (s.get('type'), s.get('width'), s.get('bitrate')) - if s.get('width') - else s.get('type')), - 'vbr': s.get('bitrate') if s.get('width') != 0 else None, - 'width': s.get('width'), - 'height': s.get('height'), - 'acodec': s.get('audioCodec'), - 'vcodec': s.get('videoCodec') if s.get('width') != 0 else 'none', - 'filesize': s.get('fileSize'), - 'url': s.get('url'), - 'ext': 'mp4', - 'preference': -100 if s.get('type') == 'smil' else None, - 'protocol': { - 'MP4': 'http', - 'F4F': 'f4m', - }.get(s.get('type')), - } for s in vinfo.get('streams', [])] - source_media_url = vinfo.get('sourceMediaURL') - if source_media_url: - formats.append({ - 'format_id': 'source_media', - 'url': source_media_url, - }) - self._sort_formats(formats) - entries.append({ - 'id': uuid, - 'title': vinfo['title'], - 'description': vinfo.get('blurb'), - 'uploader': uploader, - 'formats': formats, - 'duration': int_or_none(vinfo.get('videoDuration'), 100), - 'timestamp': timestamp, - }) + entries = [self.url_result('washingtonpost:%s' % uuid, 'WashingtonPost', uuid) for uuid in uuids] return { '_type': 'playlist', diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 5227bb5ad..de7d6b559 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -2,25 +2,26 @@ from __future__ import unicode_literals import re -import hashlib from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, unified_strdate, + HEADRequest, + float_or_none, ) class WatIE(InfoExtractor): - _VALID_URL = r'(?:wat:(?P<real_id>\d{8})|https?://www\.wat\.tv/video/(?P<display_id>.*)-(?P<short_id>.*?)_.*?\.html)' + _VALID_URL = r'(?:wat:|https?://(?:www\.)?wat\.tv/video/.*-)(?P<id>[0-9a-z]+)' IE_NAME = 'wat.tv' _TESTS = [ { 'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html', - 'md5': 'ce70e9223945ed26a8056d413ca55dc9', + 'md5': '83d882d9de5c9d97f0bb2c6273cde56a', 'info_dict': { 'id': '11713067', - 'display_id': 'soupe-figues-l-orange-aux-epices', 'ext': 'mp4', 'title': 'Soupe de figues à l\'orange et aux épices', 'description': 'Retrouvez l\'émission "Petits plats en équilibre", diffusée le 18 août 2014.', @@ -33,7 +34,6 @@ class WatIE(InfoExtractor): 'md5': 'fbc84e4378165278e743956d9c1bf16b', 'info_dict': { 'id': '11713075', - 'display_id': 'gregory-lemarchal-voix-ange', 'ext': 'mp4', 'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)', 'description': 'md5:b7a849cf16a2b733d9cd10c52906dee3', @@ -44,96 +44,85 @@ class WatIE(InfoExtractor): }, ] - def download_video_info(self, real_id): - # 'contentv4' is used in the website, but it also returns the related - # videos, we don't need them - info = self._download_json('http://www.wat.tv/interface/contentv3/' + real_id, real_id) - return info['media'] - def _real_extract(self, url): - def real_id_for_chapter(chapter): - return chapter['tc_start'].split('-')[0] - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') - real_id = mobj.group('real_id') - if not real_id: - short_id = mobj.group('short_id') - webpage = self._download_webpage(url, display_id or short_id) - real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id') + video_id = self._match_id(url) + video_id = video_id if video_id.isdigit() and len(video_id) > 6 else compat_str(int(video_id, 36)) - video_info = self.download_video_info(real_id) + # 'contentv4' is used in the website, but it also returns the related + # videos, we don't need them + video_info = self._download_json( + 'http://www.wat.tv/interface/contentv3/' + video_id, video_id)['media'] error_desc = video_info.get('error_desc') if error_desc: raise ExtractorError( '%s returned error: %s' % (self.IE_NAME, error_desc), expected=True) - geo_list = video_info.get('geoList') - country = geo_list[0] if geo_list else '' - chapters = video_info['chapters'] first_chapter = chapters[0] - files = video_info['files'] - first_file = files[0] - if real_id_for_chapter(first_chapter) != real_id: - self.to_screen('Multipart video detected') - chapter_urls = [] - for chapter in chapters: - chapter_id = real_id_for_chapter(chapter) - # Yes, when we this chapter is processed by WatIE, - # it will download the info again - chapter_info = self.download_video_info(chapter_id) - chapter_urls.append(chapter_info['url']) - entries = [self.url_result(chapter_url) for chapter_url in chapter_urls] - return self.playlist_result(entries, real_id, video_info['title']) + def video_id_for_chapter(chapter): + return chapter['tc_start'].split('-')[0] - upload_date = None - if 'date_diffusion' in first_chapter: - upload_date = unified_strdate(first_chapter['date_diffusion']) + if video_id_for_chapter(first_chapter) != video_id: + self.to_screen('Multipart video detected') + entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters] + return self.playlist_result(entries, video_id, video_info['title']) # Otherwise we can continue and extract just one part, we have to use - # the short id for getting the video url - - formats = [{ - 'url': 'http://wat.tv/get/android5/%s.mp4' % real_id, - 'format_id': 'Mobile', - }] - - fmts = [('SD', 'web')] - if first_file.get('hasHD'): - fmts.append(('HD', 'webhd')) - - def compute_token(param): - timestamp = '%08x' % int(self._download_webpage( - 'http://www.wat.tv/servertime', real_id, - 'Downloading server time').split('|')[0]) - magic = '9b673b13fa4682ed14c3cfa5af5310274b514c4133e9b3a81e6e3aba009l2564' - return '%s/%s' % (hashlib.md5((magic + param + timestamp).encode('ascii')).hexdigest(), timestamp) - - for fmt in fmts: - webid = '/%s/%s' % (fmt[1], real_id) - video_url = self._download_webpage( - 'http://www.wat.tv/get%s?token=%s&getURL=1&country=%s' % (webid, compute_token(webid), country), - real_id, - 'Downloading %s video URL' % fmt[0], - 'Failed to download %s video URL' % fmt[0], - False) - if not video_url: + # the video id for getting the video url + + date_diffusion = first_chapter.get('date_diffusion') + upload_date = unified_strdate(date_diffusion) if date_diffusion else None + + def extract_url(path_template, url_type): + req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id) + head = self._request_webpage(HEADRequest(req_url), video_id, 'Extracting %s url' % url_type) + red_url = head.geturl() + if req_url == red_url: + raise ExtractorError( + '%s said: Sorry, this video is not available from your country.' % self.IE_NAME, + expected=True) + return red_url + + m3u8_url = extract_url('ipad/%s.m3u8', 'm3u8') + http_url = extract_url('android5/%s.mp4', 'http') + + formats = [] + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') + formats.extend(m3u8_formats) + formats.extend(self._extract_f4m_formats( + m3u8_url.replace('ios.', 'web.').replace('.m3u8', '.f4m'), + video_id, f4m_id='hds', fatal=False)) + for m3u8_format in m3u8_formats: + mobj = re.search( + r'audio.*?%3D(\d+)(?:-video.*?%3D(\d+))?', m3u8_format['url']) + if not mobj: continue - formats.append({ - 'url': video_url, - 'ext': 'mp4', - 'format_id': fmt[0], + abr, vbr = mobj.groups() + abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000) + m3u8_format.update({ + 'vbr': vbr, + 'abr': abr, + }) + if not vbr or not abr: + continue + f = m3u8_format.copy() + f.update({ + 'url': re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url), + 'format_id': f['format_id'].replace('hls', 'http'), + 'protocol': 'http', }) + formats.append(f) + self._sort_formats(formats) return { - 'id': real_id, - 'display_id': display_id, + 'id': video_id, 'title': first_chapter['title'], 'thumbnail': first_chapter['preview'], 'description': first_chapter['description'], 'view_count': video_info['views'], 'upload_date': upload_date, - 'duration': first_file['duration'], + 'duration': video_info['files'][0]['duration'], 'formats': formats, } diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 31c904303..a9238cbeb 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -1,214 +1,222 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -import itertools import re from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urlparse, -) from ..utils import ( + determine_ext, + ExtractorError, + js_to_json, + strip_jsonp, unified_strdate, - qualities, + update_url_query, + urlhandle_detect_ext, ) class WDRIE(InfoExtractor): - _PLAYER_REGEX = '-(?:video|audio)player(?:_size-[LMS])?' - _VALID_URL = r'(?P<url>https?://www\d?\.(?:wdr\d?|funkhauseuropa)\.de/)(?P<id>.+?)(?P<player>%s)?\.html' % _PLAYER_REGEX + _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5' + _PAGE_REGEX = r'/(?:mediathek/)?[^/]+/(?P<type>[^/]+)/(?P<display_id>.+)\.html' + _VALID_URL = r'(?P<page_url>https?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL _TESTS = [ { - 'url': 'http://www1.wdr.de/mediathek/video/sendungen/servicezeit/videoservicezeit560-videoplayer_size-L.html', + 'url': 'http://www1.wdr.de/mediathek/video/sendungen/doku-am-freitag/video-geheimnis-aachener-dom-100.html', + # HDS download, MD5 is unstable 'info_dict': { - 'id': 'mdb-362427', + 'id': 'mdb-1058683', 'ext': 'flv', - 'title': 'Servicezeit', - 'description': 'md5:c8f43e5e815eeb54d0b96df2fba906cb', - 'upload_date': '20140310', - 'is_live': False + 'display_id': 'doku-am-freitag/video-geheimnis-aachener-dom-100', + 'title': 'Geheimnis Aachener Dom', + 'alt_title': 'Doku am Freitag', + 'upload_date': '20160304', + 'description': 'md5:87be8ff14d8dfd7a7ee46f0299b52318', + 'is_live': False, + 'subtitles': {'de': [{ + 'url': 'http://ondemand-ww.wdr.de/medp/fsk0/105/1058683/1058683_12220974.xml' + }]}, }, - 'params': { - 'skip_download': True, + }, + { + 'url': 'http://www1.wdr.de/mediathek/audio/wdr3/wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100.html', + 'md5': 'f4c1f96d01cf285240f53ea4309663d8', + 'info_dict': { + 'id': 'mdb-1072000', + 'ext': 'mp3', + 'display_id': 'wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100', + 'title': 'Schriftstellerin Juli Zeh', + 'alt_title': 'WDR 3 Gespräch am Samstag', + 'upload_date': '20160312', + 'description': 'md5:e127d320bc2b1f149be697ce044a3dd7', + 'is_live': False, + 'subtitles': {} }, - 'skip': 'Page Not Found', }, { - 'url': 'http://www1.wdr.de/themen/av/videomargaspiegelisttot101-videoplayer.html', + 'url': 'http://www1.wdr.de/mediathek/video/live/index.html', 'info_dict': { - 'id': 'mdb-363194', - 'ext': 'flv', - 'title': 'Marga Spiegel ist tot', - 'description': 'md5:2309992a6716c347891c045be50992e4', - 'upload_date': '20140311', - 'is_live': False + 'id': 'mdb-103364', + 'ext': 'mp4', + 'display_id': 'index', + 'title': r're:^WDR Fernsehen im Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'alt_title': 'WDR Fernsehen Live', + 'upload_date': None, + 'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9', + 'is_live': True, + 'subtitles': {} }, 'params': { - 'skip_download': True, + 'skip_download': True, # m3u8 download }, - 'skip': 'Page Not Found', }, { - 'url': 'http://www1.wdr.de/themen/kultur/audioerlebtegeschichtenmargaspiegel100-audioplayer.html', - 'md5': '83e9e8fefad36f357278759870805898', + 'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html', + 'playlist_mincount': 8, 'info_dict': { - 'id': 'mdb-194332', - 'ext': 'mp3', - 'title': 'Erlebte Geschichten: Marga Spiegel (29.11.2009)', - 'description': 'md5:2309992a6716c347891c045be50992e4', - 'upload_date': '20091129', - 'is_live': False + 'id': 'aktuelle-stunde/aktuelle-stunde-120', }, }, { - 'url': 'http://www.funkhauseuropa.de/av/audioflaviacoelhoamaramar100-audioplayer.html', - 'md5': '99a1443ff29af19f6c52cf6f4dc1f4aa', + 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5', 'info_dict': { - 'id': 'mdb-478135', - 'ext': 'mp3', - 'title': 'Flavia Coelho: Amar é Amar', - 'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a', - 'upload_date': '20140717', - 'is_live': False + 'id': 'mdb-1096487', + 'ext': 'flv', + 'upload_date': 're:^[0-9]{8}$', + 'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$', + 'description': '- Die Sendung mit der Maus -', }, - 'skip': 'Page Not Found', + 'skip': 'The id changes from week to week because of the new episode' }, { - 'url': 'http://www1.wdr.de/mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100.html', - 'playlist_mincount': 146, + 'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/achterbahn.php5', + # HDS download, MD5 is unstable 'info_dict': { - 'id': 'mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100', - } + 'id': 'mdb-186083', + 'ext': 'flv', + 'upload_date': '20130919', + 'title': 'Sachgeschichte - Achterbahn ', + 'description': '- Die Sendung mit der Maus -', + }, }, { - 'url': 'http://www1.wdr.de/mediathek/video/livestream/index.html', + 'url': 'http://www1.wdr.de/radio/player/radioplayer116~_layout-popupVersion.html', + # Live stream, MD5 unstable 'info_dict': { - 'id': 'mdb-103364', - 'title': 're:^WDR Fernsehen Live [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9', + 'id': 'mdb-869971', 'ext': 'flv', - 'upload_date': '20150101', - 'is_live': True - }, - 'params': { - 'skip_download': True, + 'title': 'Funkhaus Europa Livestream', + 'description': 'md5:2309992a6716c347891c045be50992e4', + 'upload_date': '20160101', }, } ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - page_url = mobj.group('url') - page_id = mobj.group('id') - - webpage = self._download_webpage(url, page_id) - - if mobj.group('player') is None: + url_type = mobj.group('type') + page_url = mobj.group('page_url') + display_id = mobj.group('display_id') + webpage = self._download_webpage(url, display_id) + + # for wdr.de the data-extension is in a tag with the class "mediaLink" + # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn" + # for wdrmaus its in a link to the page in a multiline "videoLink"-tag + json_metadata = self._html_search_regex( + r'class=(?:"(?:mediaLink|wdrrPlayerPlayBtn)\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"', + webpage, 'media link', default=None, flags=re.MULTILINE) + + if not json_metadata: entries = [ - self.url_result(page_url + href, 'WDR') + self.url_result(page_url + href[0], 'WDR') for href in re.findall( - r'<a href="/?(.+?%s\.html)" rel="nofollow"' % self._PLAYER_REGEX, + r'<a href="(%s)"[^>]+data-extension=' % self._PAGE_REGEX, webpage) ] if entries: # Playlist page - return self.playlist_result(entries, page_id) - - # Overview page - entries = [] - for page_num in itertools.count(2): - hrefs = re.findall( - r'<li class="mediathekvideo"\s*>\s*<img[^>]*>\s*<a href="(/mediathek/video/[^"]+)"', - webpage) - entries.extend( - self.url_result(page_url + href, 'WDR') - for href in hrefs) - next_url_m = re.search( - r'<li class="nextToLast">\s*<a href="([^"]+)"', webpage) - if not next_url_m: - break - next_url = page_url + next_url_m.group(1) - webpage = self._download_webpage( - next_url, page_id, - note='Downloading playlist page %d' % page_num) - return self.playlist_result(entries, page_id) + return self.playlist_result(entries, playlist_id=display_id) - flashvars = compat_parse_qs(self._html_search_regex( - r'<param name="flashvars" value="([^"]+)"', webpage, 'flashvars')) + raise ExtractorError('No downloadable streams found', expected=True) - page_id = flashvars['trackerClipId'][0] - video_url = flashvars['dslSrc'][0] - title = flashvars['trackerClipTitle'][0] - thumbnail = flashvars['startPicture'][0] if 'startPicture' in flashvars else None - is_live = flashvars.get('isLive', ['0'])[0] == '1' + media_link_obj = self._parse_json(json_metadata, display_id, + transform_source=js_to_json) + jsonp_url = media_link_obj['mediaObj']['url'] - if is_live: - title = self._live_title(title) - - if 'trackerClipAirTime' in flashvars: - upload_date = flashvars['trackerClipAirTime'][0] - else: - upload_date = self._html_search_meta( - 'DC.Date', webpage, 'upload date') + metadata = self._download_json( + jsonp_url, 'metadata', transform_source=strip_jsonp) - if upload_date: - upload_date = unified_strdate(upload_date) + metadata_tracker_data = metadata['trackerData'] + metadata_media_resource = metadata['mediaResource'] formats = [] - preference = qualities(['S', 'M', 'L', 'XL']) - if video_url.endswith('.f4m'): - formats.extend(self._extract_f4m_formats( - video_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', page_id, - f4m_id='hds', fatal=False)) - elif video_url.endswith('.smil'): - formats.extend(self._extract_smil_formats( - video_url, page_id, False, { - 'hdcore': '3.3.0', - 'plugin': 'aasp-3.3.0.99.43', - })) - else: - formats.append({ - 'url': video_url, - 'http_headers': { - 'User-Agent': 'mobile', - }, - }) + # check if the metadata contains a direct URL to a file + for kind, media_resource in metadata_media_resource.items(): + if kind not in ('dflt', 'alt'): + continue + + for tag_name, medium_url in media_resource.items(): + if tag_name not in ('videoURL', 'audioURL'): + continue + + ext = determine_ext(medium_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + medium_url, display_id, 'mp4', 'm3u8_native', + m3u8_id='hls')) + elif ext == 'f4m': + manifest_url = update_url_query( + medium_url, {'hdcore': '3.2.0', 'plugin': 'aasp-3.2.0.77.18'}) + formats.extend(self._extract_f4m_formats( + manifest_url, display_id, f4m_id='hds', fatal=False)) + elif ext == 'smil': + formats.extend(self._extract_smil_formats( + medium_url, 'stream', fatal=False)) + else: + a_format = { + 'url': medium_url + } + if ext == 'unknown_video': + urlh = self._request_webpage( + medium_url, display_id, note='Determining extension') + ext = urlhandle_detect_ext(urlh) + a_format['ext'] = ext + formats.append(a_format) - m3u8_url = self._search_regex( - r'rel="adaptiv"[^>]+href="([^"]+)"', - webpage, 'm3u8 url', default=None) - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, page_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) + self._sort_formats(formats) - direct_urls = re.findall( - r'rel="web(S|M|L|XL)"[^>]+href="([^"]+)"', webpage) - if direct_urls: - for quality, video_url in direct_urls: - formats.append({ - 'url': video_url, - 'preference': preference(quality), - 'http_headers': { - 'User-Agent': 'mobile', - }, - }) + subtitles = {} + caption_url = metadata_media_resource.get('captionURL') + if caption_url: + subtitles['de'] = [{ + 'url': caption_url + }] - self._sort_formats(formats) + title = metadata_tracker_data.get('trackerClipTitle') + is_live = url_type == 'live' + + if is_live: + title = self._live_title(title) + upload_date = None + elif 'trackerClipAirTime' in metadata_tracker_data: + upload_date = metadata_tracker_data['trackerClipAirTime'] + else: + upload_date = self._html_search_meta('DC.Date', webpage, 'upload date') - description = self._html_search_meta('Description', webpage, 'description') + if upload_date: + upload_date = unified_strdate(upload_date) return { - 'id': page_id, - 'formats': formats, + 'id': metadata_tracker_data.get('trackerClipId', display_id), + 'display_id': display_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'alt_title': metadata_tracker_data.get('trackerClipSubcategory'), + 'formats': formats, 'upload_date': upload_date, - 'is_live': is_live + 'description': self._html_search_meta('Description', webpage), + 'is_live': is_live, + 'subtitles': subtitles, } @@ -241,81 +249,3 @@ class WDRMobileIE(InfoExtractor): 'User-Agent': 'mobile', }, } - - -class WDRMausIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P<id>[^/?#]+)(?:/index\.php5|(?<!index)\.php5|/(?:$|[?#]))' - IE_DESC = 'Sendung mit der Maus' - _TESTS = [{ - 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5', - 'info_dict': { - 'id': 'aktuelle-sendung', - 'ext': 'mp4', - 'thumbnail': 're:^http://.+\.jpg', - 'upload_date': 're:^[0-9]{8}$', - 'title': 're:^[0-9.]{10} - Aktuelle Sendung$', - } - }, { - 'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/40_jahre_maus.php5', - 'md5': '3b1227ca3ed28d73ec5737c65743b2a3', - 'info_dict': { - 'id': '40_jahre_maus', - 'ext': 'mp4', - 'thumbnail': 're:^http://.+\.jpg', - 'upload_date': '20131007', - 'title': '12.03.2011 - 40 Jahre Maus', - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - param_code = self._html_search_regex( - r'<a href="\?startVideo=1&([^"]+)"', webpage, 'parameters') - - title_date = self._search_regex( - r'<div class="sendedatum"><p>Sendedatum:\s*([0-9\.]+)</p>', - webpage, 'air date') - title_str = self._html_search_regex( - r'<h1>(.*?)</h1>', webpage, 'title') - title = '%s - %s' % (title_date, title_str) - upload_date = unified_strdate( - self._html_search_meta('dc.date', webpage)) - - fields = compat_parse_qs(param_code) - video_url = fields['firstVideo'][0] - thumbnail = compat_urlparse.urljoin(url, fields['startPicture'][0]) - - formats = [{ - 'format_id': 'rtmp', - 'url': video_url, - }] - - jscode = self._download_webpage( - 'http://www.wdrmaus.de/codebase/js/extended-medien.min.js', - video_id, fatal=False, - note='Downloading URL translation table', - errnote='Could not download URL translation table') - if jscode: - for m in re.finditer( - r"stream:\s*'dslSrc=(?P<stream>[^']+)',\s*download:\s*'(?P<dl>[^']+)'\s*\}", - jscode): - if video_url.startswith(m.group('stream')): - http_url = video_url.replace( - m.group('stream'), m.group('dl')) - formats.append({ - 'format_id': 'http', - 'url': http_url, - }) - break - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - } diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index 8b14840a2..c634b8dec 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -3,16 +3,17 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( ExtractorError, - sanitized_Request, int_or_none, + float_or_none, ) class WistiaIE(InfoExtractor): - _VALID_URL = r'https?://(?:fast\.)?wistia\.net/embed/iframe/(?P<id>[a-z0-9]+)' - _API_URL = 'http://fast.wistia.com/embed/medias/{0:}.json' + _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.net/embed/iframe/)(?P<id>[a-z0-9]+)' + _API_URL = 'http://fast.wistia.com/embed/medias/%s.json' + _IFRAME_URL = 'http://fast.wistia.net/embed/iframe/%s' - _TEST = { + _TESTS = [{ 'url': 'http://fast.wistia.net/embed/iframe/sh7fpupwlt', 'md5': 'cafeb56ec0c53c18c97405eecb3133df', 'info_dict': { @@ -24,36 +25,54 @@ class WistiaIE(InfoExtractor): 'timestamp': 1386185018, 'duration': 117, }, - } + }, { + 'url': 'wistia:sh7fpupwlt', + 'only_matching': True, + }, { + # with hls video + 'url': 'wistia:807fafadvk', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - request = sanitized_Request(self._API_URL.format(video_id)) - request.add_header('Referer', url) # Some videos require this. - data_json = self._download_json(request, video_id) + data_json = self._download_json( + self._API_URL % video_id, video_id, + # Some videos require this. + headers={ + 'Referer': url if url.startswith('http') else self._IFRAME_URL % video_id, + }) + if data_json.get('error'): - raise ExtractorError('Error while getting the playlist', - expected=True) + raise ExtractorError( + 'Error while getting the playlist', expected=True) + data = data_json['media'] title = data['name'] formats = [] thumbnails = [] for a in data['assets']: + aurl = a.get('url') + if not aurl: + continue astatus = a.get('status') atype = a.get('type') - if (astatus is not None and astatus != 2) or atype == 'preview': + if (astatus is not None and astatus != 2) or atype in ('preview', 'storyboard'): continue elif atype in ('still', 'still_image'): thumbnails.append({ - 'url': a['url'], - 'resolution': '%dx%d' % (a['width'], a['height']), + 'url': aurl, + 'width': int_or_none(a.get('width')), + 'height': int_or_none(a.get('height')), }) else: + aext = a.get('ext') + is_m3u8 = a.get('container') == 'm3u8' or aext == 'm3u8' formats.append({ 'format_id': atype, - 'url': a['url'], + 'url': aurl, 'tbr': int_or_none(a.get('bitrate')), 'vbr': int_or_none(a.get('opt_vbitrate')), 'width': int_or_none(a.get('width')), @@ -61,7 +80,8 @@ class WistiaIE(InfoExtractor): 'filesize': int_or_none(a.get('size')), 'vcodec': a.get('codec'), 'container': a.get('container'), - 'ext': a.get('ext'), + 'ext': 'mp4' if is_m3u8 else aext, + 'protocol': 'm3u8' if is_m3u8 else None, 'preference': 1 if atype == 'original' else None, }) @@ -73,6 +93,6 @@ class WistiaIE(InfoExtractor): 'description': data.get('seoDescription'), 'formats': formats, 'thumbnails': thumbnails, - 'duration': int_or_none(data.get('duration')), + 'duration': float_or_none(data.get('duration')), 'timestamp': int_or_none(data.get('createdAt')), } diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index b3547174d..bd8e1af2e 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -12,37 +12,52 @@ from ..utils import ( class XHamsterIE(InfoExtractor): - _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?' - _TESTS = [ - { - 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', - 'info_dict': { - 'id': '1509445', - 'ext': 'mp4', - 'title': 'FemaleAgent Shy beauty takes the bait', - 'upload_date': '20121014', - 'uploader': 'Ruseful2011', - 'duration': 893.52, - 'age_limit': 18, - } + _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.*?)\.html(?:\?.*)?' + _TESTS = [{ + 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', + 'md5': '8281348b8d3c53d39fffb377d24eac4e', + 'info_dict': { + 'id': '1509445', + 'ext': 'mp4', + 'title': 'FemaleAgent Shy beauty takes the bait', + 'upload_date': '20121014', + 'uploader': 'Ruseful2011', + 'duration': 893.52, + 'age_limit': 18, }, - { - 'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', - 'info_dict': { - 'id': '2221348', - 'ext': 'mp4', - 'title': 'Britney Spears Sexy Booty', - 'upload_date': '20130914', - 'uploader': 'jojo747400', - 'duration': 200.48, - 'age_limit': 18, - } + }, { + 'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', + 'info_dict': { + 'id': '2221348', + 'ext': 'mp4', + 'title': 'Britney Spears Sexy Booty', + 'upload_date': '20130914', + 'uploader': 'jojo747400', + 'duration': 200.48, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + # empty seo + 'url': 'http://xhamster.com/movies/5667973/.html', + 'info_dict': { + 'id': '5667973', + 'ext': 'mp4', + 'title': '....', + 'upload_date': '20160208', + 'uploader': 'parejafree', + 'duration': 72.0, + 'age_limit': 18, }, - { - 'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html', - 'only_matching': True, + 'params': { + 'skip_download': True, }, - ] + }, { + 'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html', + 'only_matching': True, + }] def _real_extract(self, url): def extract_video_url(webpage, name): @@ -170,7 +185,7 @@ class XHamsterEmbedIE(InfoExtractor): webpage = self._download_webpage(url, video_id) video_url = self._search_regex( - r'href="(https?://xhamster\.com/movies/%s/[^"]+\.html[^"]*)"' % video_id, + r'href="(https?://xhamster\.com/movies/%s/[^"]*\.html[^"]*)"' % video_id, webpage, 'xhamster url', default=None) if not video_url: diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 710ad5041..1dfe031ca 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -8,7 +8,6 @@ from ..utils import ( clean_html, ExtractorError, determine_ext, - sanitized_Request, ) @@ -25,8 +24,6 @@ class XVideosIE(InfoExtractor): } } - _ANDROID_USER_AGENT = 'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19' - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -35,31 +32,34 @@ class XVideosIE(InfoExtractor): if mobj: raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), expected=True) - video_url = compat_urllib_parse_unquote( - self._search_regex(r'flv_url=(.+?)&', webpage, 'video URL')) video_title = self._html_search_regex( r'<title>(.*?)\s+-\s+XVID', webpage, 'title') video_thumbnail = self._search_regex( r'url_bigthumb=(.+?)&', webpage, 'thumbnail', fatal=False) - formats = [{ - 'url': video_url, - }] + formats = [] - android_req = sanitized_Request(url) - android_req.add_header('User-Agent', self._ANDROID_USER_AGENT) - android_webpage = self._download_webpage(android_req, video_id, fatal=False) + video_url = compat_urllib_parse_unquote(self._search_regex( + r'flv_url=(.+?)&', webpage, 'video URL', default='')) + if video_url: + formats.append({'url': video_url}) - if android_webpage is not None: - player_params_str = self._search_regex( - 'mobileReplacePlayerDivTwoQual\(([^)]+)\)', - android_webpage, 'player parameters', default='') - player_params = list(map(lambda s: s.strip(' \''), player_params_str.split(','))) - if player_params: - formats.extend([{ - 'url': param, - 'preference': -10, - } for param in player_params if determine_ext(param) == 'mp4']) + player_args = self._search_regex( + r'(?s)new\s+HTML5Player\((.+?)\)', webpage, ' html5 player', default=None) + if player_args: + for arg in player_args.split(','): + format_url = self._search_regex( + r'(["\'])(?P<url>https?://.+?)\1', arg, 'url', + default=None, group='url') + if not format_url: + continue + ext = determine_ext(format_url) + if ext == 'mp4': + formats.append({'url': format_url}) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) self._sort_formats(formats) @@ -67,7 +67,6 @@ class XVideosIE(InfoExtractor): 'id': video_id, 'formats': formats, 'title': video_title, - 'ext': 'flv', 'thumbnail': video_thumbnail, 'age_limit': 18, } diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index ce3723b55..b37d0eab6 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -10,8 +10,6 @@ from ..utils import ( ExtractorError, int_or_none, float_or_none, - sanitized_Request, - urlencode_postdata, ) @@ -22,18 +20,24 @@ class YandexMusicBaseIE(InfoExtractor): error = response.get('error') if error: raise ExtractorError(error, expected=True) + if response.get('type') == 'captcha' or 'captcha' in response: + YandexMusicBaseIE._raise_captcha() + + @staticmethod + def _raise_captcha(): + raise ExtractorError( + 'YandexMusic has considered youtube-dl requests automated and ' + 'asks you to solve a CAPTCHA. You can either wait for some ' + 'time until unblocked and optionally use --sleep-interval ' + 'in future or alternatively you can go to https://music.yandex.ru/ ' + 'solve CAPTCHA, then export cookies and pass cookie file to ' + 'youtube-dl with --cookies', + expected=True) def _download_webpage(self, *args, **kwargs): webpage = super(YandexMusicBaseIE, self)._download_webpage(*args, **kwargs) if 'Нам очень жаль, но запросы, поступившие с вашего IP-адреса, похожи на автоматические.' in webpage: - raise ExtractorError( - 'YandexMusic has considered youtube-dl requests automated and ' - 'asks you to solve a CAPTCHA. You can either wait for some ' - 'time until unblocked and optionally use --sleep-interval ' - 'in future or alternatively you can go to https://music.yandex.ru/ ' - 'solve CAPTCHA, then export cookies and pass cookie file to ' - 'youtube-dl with --cookies', - expected=True) + self._raise_captcha() return webpage def _download_json(self, *args, **kwargs): @@ -177,7 +181,7 @@ class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE): class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): IE_NAME = 'yandexmusic:playlist' IE_DESC = 'Яндекс.Музыка - Плейлист' - _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/users/[^/]+/playlists/(?P<id>\d+)' + _VALID_URL = r'https?://music\.yandex\.(?P<tld>ru|kz|ua|by)/users/(?P<user>[^/]+)/playlists/(?P<id>\d+)' _TESTS = [{ 'url': 'http://music.yandex.ru/users/music.partners/playlists/1245', @@ -196,47 +200,64 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'id': '1036', 'title': 'Музыка 90-х', }, - 'playlist_count': 310, + 'playlist_mincount': 300, 'skip': 'Travis CI servers blocked by YandexMusic', }] def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - - mu = self._parse_json( - self._search_regex( - r'var\s+Mu\s*=\s*({.+?});\s*</script>', webpage, 'player'), - playlist_id) - - playlist = mu['pageData']['playlist'] - tracks, track_ids = playlist['tracks'], playlist['trackIds'] - - # tracks dictionary shipped with webpage is limited to 150 tracks, + mobj = re.match(self._VALID_URL, url) + tld = mobj.group('tld') + user = mobj.group('user') + playlist_id = mobj.group('id') + + playlist = self._download_json( + 'https://music.yandex.%s/handlers/playlist.jsx' % tld, + playlist_id, 'Downloading missing tracks JSON', + fatal=False, + headers={ + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + 'X-Retpath-Y': url, + }, + query={ + 'owner': user, + 'kinds': playlist_id, + 'light': 'true', + 'lang': tld, + 'external-domain': 'music.yandex.%s' % tld, + 'overembed': 'false', + })['playlist'] + + tracks, track_ids = playlist['tracks'], map(compat_str, playlist['trackIds']) + + # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks, # missing tracks should be retrieved manually. if len(tracks) < len(track_ids): - present_track_ids = set([compat_str(track['id']) for track in tracks if track.get('id')]) - missing_track_ids = set(map(compat_str, track_ids)) - set(present_track_ids) - request = sanitized_Request( - 'https://music.yandex.ru/handlers/track-entries.jsx', - urlencode_postdata({ + present_track_ids = set([ + compat_str(track['id']) + for track in tracks if track.get('id')]) + missing_track_ids = [ + track_id for track_id in track_ids + if track_id not in present_track_ids] + missing_tracks = self._download_json( + 'https://music.yandex.%s/handlers/track-entries.jsx' % tld, + playlist_id, 'Downloading missing tracks JSON', + fatal=False, + headers={ + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + }, + query={ 'entries': ','.join(missing_track_ids), - 'lang': mu.get('settings', {}).get('lang', 'en'), - 'external-domain': 'music.yandex.ru', + 'lang': tld, + 'external-domain': 'music.yandex.%s' % tld, 'overembed': 'false', - 'sign': mu.get('authData', {}).get('user', {}).get('sign'), 'strict': 'true', - })) - request.add_header('Referer', url) - request.add_header('X-Requested-With', 'XMLHttpRequest') - - missing_tracks = self._download_json( - request, playlist_id, 'Downloading missing tracks JSON', fatal=False) + }) if missing_tracks: tracks.extend(missing_tracks) return self.playlist_result( self._build_playlist(tracks), compat_str(playlist_id), - playlist['title'], playlist.get('description')) + playlist.get('title'), playlist.get('description')) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 349ce0941..147608ebe 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -2,7 +2,9 @@ from __future__ import unicode_literals import base64 +import itertools import random +import re import string import time @@ -13,6 +15,7 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + get_element_by_attribute, sanitized_Request, ) @@ -275,6 +278,8 @@ class YoukuIE(InfoExtractor): 'format_id': self.get_format_name(fm), 'ext': self.parse_ext_l(fm), 'filesize': int(seg['size']), + 'width': stream.get('width'), + 'height': stream.get('height'), }) return { @@ -283,3 +288,52 @@ class YoukuIE(InfoExtractor): 'title': title, 'entries': entries, } + + +class YoukuShowIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?youku\.com/show_page/id_(?P<id>[0-9a-z]+)\.html' + IE_NAME = 'youku:show' + + _TEST = { + 'url': 'http://www.youku.com/show_page/id_zc7c670be07ff11e48b3f.html', + 'info_dict': { + 'id': 'zc7c670be07ff11e48b3f', + 'title': '花千骨 未删减版', + 'description': 'md5:578d4f2145ae3f9128d9d4d863312910', + }, + 'playlist_count': 50, + } + + _PAGE_SIZE = 40 + + def _find_videos_in_page(self, webpage): + videos = re.findall( + r'<li><a[^>]+href="(?P<url>https?://v\.youku\.com/[^"]+)"[^>]+title="(?P<title>[^"]+)"', webpage) + return [ + self.url_result(video_url, YoukuIE.ie_key(), title) + for video_url, title in videos] + + def _real_extract(self, url): + show_id = self._match_id(url) + webpage = self._download_webpage(url, show_id) + + entries = self._find_videos_in_page(webpage) + + playlist_title = self._html_search_regex( + r'<span[^>]+class="name">([^<]+)</span>', webpage, 'playlist title', fatal=False) + detail_div = get_element_by_attribute('class', 'detail', webpage) or '' + playlist_description = self._html_search_regex( + r'<span[^>]+style="display:none"[^>]*>([^<]+)</span>', + detail_div, 'playlist description', fatal=False) + + for idx in itertools.count(1): + episodes_page = self._download_webpage( + 'http://www.youku.com/show_episode/id_%s.html' % show_id, + show_id, query={'divid': 'reload_%d' % (idx * self._PAGE_SIZE + 1)}, + note='Downloading episodes page %d' % idx) + new_entries = self._find_videos_in_page(episodes_page) + entries.extend(new_entries) + if len(new_entries) < self._PAGE_SIZE: + break + + return self.playlist_result(entries, show_id, playlist_title, playlist_description) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b7c3cb63f..6c9f77d95 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -344,6 +344,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'}, '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'}, '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'}, + '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'}, + '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'}, # Dash webm '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, @@ -1326,9 +1328,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if video_description: video_description = re.sub(r'''(?x) <a\s+ - (?:[a-zA-Z-]+="[^"]+"\s+)*? + (?:[a-zA-Z-]+="[^"]*"\s+)*? (?:title|href)="([^"]+)"\s+ - (?:[a-zA-Z-]+="[^"]+"\s+)*? + (?:[a-zA-Z-]+="[^"]*"\s+)*? class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)"[^>]*> [^<]+\.{3}\s* </a> |