-rw-r--r--  README.md                                    |    4
-rw-r--r--  docs/supportedsites.md                       |    9
-rwxr-xr-x  youtube_dl/YoutubeDL.py                      |   14
-rw-r--r--  youtube_dl/extractor/__init__.py             |    7
-rw-r--r--  youtube_dl/extractor/crunchyroll.py          |   30
-rw-r--r--  youtube_dl/extractor/facebook.py             |    2
-rw-r--r--  youtube_dl/extractor/nowtv.py                |  192
-rw-r--r--  youtube_dl/extractor/rtlnow.py               |  174
-rw-r--r--  youtube_dl/extractor/senateisvp.py           |    8
-rw-r--r--  youtube_dl/extractor/soompi.py               |  146
-rw-r--r--  youtube_dl/extractor/spiegeltv.py            |    9
-rw-r--r--  youtube_dl/extractor/tf1.py                  |    5
-rw-r--r--  youtube_dl/extractor/tube8.py                |   14
-rw-r--r--  youtube_dl/extractor/tubitv.py               |   84
-rw-r--r--  youtube_dl/extractor/vgtv.py                 |   28
-rw-r--r--  youtube_dl/extractor/youtube.py              |   20
-rw-r--r--  youtube_dl/postprocessor/embedthumbnail.py   |    4
-rw-r--r--  youtube_dl/version.py                        |    2
18 files changed, 526 insertions, 226 deletions
@@ -168,7 +168,7 @@ which means you can modify it, redistribute it or use it however you like.
     --no-progress                    Do not print progress bar
     --console-title                  Display progress in console titlebar
     -v, --verbose                    Print various debugging information
-    --dump-pages                     Print downloaded pages to debug problems (very verbose)
+    --dump-pages                     Print downloaded pages encoded using base64 to debug problems (very verbose)
     --write-pages                    Write downloaded intermediary pages to files in the current directory to debug problems
     --print-traffic                  Display sent and read HTTP traffic
     -C, --call-home                  Contact the youtube-dl server for debugging
@@ -220,7 +220,7 @@ which means you can modify it, redistribute it or use it however you like.
     --embed-thumbnail                Embed thumbnail in the audio as cover art
     --add-metadata                   Write metadata to the video file
     --metadata-from-title FORMAT     Parse additional metadata like song title / artist from the video title. The format syntax is the same as --output, the parsed
-                                     parameters replace existing values. Additional templates: %(album), %(artist). Example: --metadata-from-title "%(artist)s -
+                                     parameters replace existing values. Additional templates: %(album)s, %(artist)s. Example: --metadata-from-title "%(artist)s -
                                      %(title)s" matches a title like "Coldplay - Paradise"
     --xattrs                         Write metadata to the video file's xattrs (using dublin core and xdg standards)
     --fixup POLICY                   Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn(the default;
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index a4879bd9a..a421ae62b 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -26,8 +26,7 @@
  - **anitube.se**
  - **AnySex**
  - **Aparat**
- - **AppleDailyAnimationNews**
- - **AppleDailyRealtimeNews**
+ - **AppleDaily**
  - **AppleTrailers**
  - **archive.org**: archive.org videos
  - **ARD**
@@ -152,7 +151,6 @@
  - **fc2**
  - **fernsehkritik.tv**
  - **fernsehkritik.tv:postecke**
- - **Firedrive**
  - **Firstpost**
  - **Flickr**
  - **Folketinget**: Folketinget (ft.dk; Danish parliament)
@@ -230,6 +228,7 @@
  - **KanalPlay**: Kanal 5/9/11 Play
  - **Kankan**
  - **Karaoketv**
+ - **KarriereVideos**
  - **keek**
  - **KeezMovies**
  - **KhanAcademy**
@@ -322,6 +321,7 @@
  - **NosVideo**
  - **novamov**: NovaMov
  - **Nowness**
+ - **NowTV**
  - **nowvideo**: NowVideo
  - **npo.nl**
  - **npo.nl:live**
@@ -393,7 +393,6 @@
  - **Rte**
  - **rtl.nl**: rtl.nl and rtlxl.nl
  - **RTL2**
- - **RTLnow**
  - **RTP**
  - **RTS**: RTS.ch
  - **rtve.es:alacarta**: RTVE a la carta
@@ -431,7 +430,6 @@
  - **smotri:community**: Smotri.com community videos
  - **smotri:user**: Smotri.com user videos
  - **Snotr**
- - **Sockshare**
  - **Sohu**
  - **soundcloud**
  - **soundcloud:playlist**
@@ -564,6 +562,7 @@
  - **vier:videos**
  - **Viewster**
  - **viki**
+ - **viki:channel**
  - **vimeo**
  - **vimeo:album**
  - **vimeo:channel**
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index d1953c18f..21d247f23 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -1047,6 +1047,8 @@ class YoutubeDL(object):
         if not formats:
             raise ExtractorError('No video formats found!')

+        formats_dict = {}
+
         # We check that all the formats have the format and format_id fields
         for i, format in enumerate(formats):
             if 'url' not in format:
@@ -1054,6 +1056,18 @@
             if format.get('format_id') is None:
                 format['format_id'] = compat_str(i)

+            format_id = format['format_id']
+            if format_id not in formats_dict:
+                formats_dict[format_id] = []
+            formats_dict[format_id].append(format)
+
+        # Make sure all formats have unique format_id
+        for format_id, ambiguous_formats in formats_dict.items():
+            if len(ambiguous_formats) > 1:
+                for i, format in enumerate(ambiguous_formats):
+                    format['format_id'] = '%s-%d' % (format_id, i)
+
+        for i, format in enumerate(formats):
             if format.get('format') is None:
                 format['format'] = '{id} - {res}{note}'.format(
                     id=format['format_id'],
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index d20ad286d..4dc07efe0 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -354,6 +354,7 @@ from .normalboots import NormalbootsIE
 from .nosvideo import NosVideoIE
 from .novamov import NovaMovIE
 from .nowness import NownessIE
+from .nowtv import NowTVIE
 from .nowvideo import NowVideoIE
 from .npo import (
     NPOIE,
@@ -438,7 +439,6 @@ from .roxwel import RoxwelIE
 from .rtbf import RTBFIE
 from .rte import RteIE
 from .rtlnl import RtlNlIE
-from .rtlnow import RTLnowIE
 from .rtl2 import RTL2IE
 from .rtp import RTPIE
 from .rts import RTSIE
@@ -481,6 +481,10 @@ from .smotri import (
 )
 from .snotr import SnotrIE
 from .sohu import SohuIE
+from .soompi import (
+    SoompiIE,
+    SoompiShowIE,
+)
 from .soundcloud import (
     SoundcloudIE,
     SoundcloudSetIE,
@@ -566,6 +570,7 @@ from .traileraddict import TrailerAddictIE
 from .trilulilu import TriluliluIE
 from .trutube import TruTubeIE
 from .tube8 import Tube8IE
+from .tubitv import TubiTvIE
 from .tudou import TudouIE
 from .tumblr import TumblrIE
 from .tunein import TuneInIE
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index 1c77df47e..41f0c736d 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -76,8 +76,8 @@ class CrunchyrollIE(InfoExtractor):
         self._login()

     def _decrypt_subtitles(self, data, iv, id):
-        data = bytes_to_intlist(data)
-        iv = bytes_to_intlist(iv)
+        data = bytes_to_intlist(base64.b64decode(data.encode('utf-8')))
+        iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8')))
         id = int(id)

         def obfuscate_key_aux(count, modulo, start):
@@ -179,6 +179,16 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text

         return output

+    def _extract_subtitles(self, subtitle):
+        sub_root = xml.etree.ElementTree.fromstring(subtitle)
+        return [{
+            'ext': 'srt',
+            'data': self._convert_subtitles_to_srt(sub_root),
+        }, {
+            'ext': 'ass',
+            'data': self._convert_subtitles_to_ass(sub_root),
+        }]
+
     def _get_subtitles(self, video_id, webpage):
         subtitles = {}
         for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage):
@@ -190,25 +200,11 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
             data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False)
             if not id or not iv or not data:
                 continue
-            id = int(id)
-            iv = base64.b64decode(iv)
-            data = base64.b64decode(data)
-
             subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8')
             lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
             if not lang_code:
                 continue
-            sub_root = xml.etree.ElementTree.fromstring(subtitle)
-            subtitles[lang_code] = [
-                {
-                    'ext': 'srt',
-                    'data': self._convert_subtitles_to_srt(sub_root),
-                },
-                {
-                    'ext': 'ass',
-                    'data': self._convert_subtitles_to_ass(sub_root),
-                },
-            ]
+            subtitles[lang_code] = self._extract_subtitles(subtitle)
         return subtitles

     def _real_extract(self, url):
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index e8d682716..82dc27bc6 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -152,7 +152,7 @@ class FacebookIE(InfoExtractor):
             raise ExtractorError('Cannot find video formats')

         video_title = self._html_search_regex(
-            r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title',
+            r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 'title',
             default=None)
         if not video_title:
             video_title = self._html_search_regex(
diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py
new file mode 100644
index 000000000..173e46cd8
--- /dev/null
+++ b/youtube_dl/extractor/nowtv.py
@@ -0,0 +1,192 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    parse_iso8601,
+    parse_duration,
+    remove_start,
+)
+
+
+class NowTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?P<station>rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/player'
+
+    _TESTS = [{
+        # rtl
+        'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/player',
+        'info_dict': {
+            'id': '203519',
+            'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit',
+            'ext': 'mp4',
+            'title': 'Die neuen Bauern und eine Hochzeit',
+            'description': 'md5:e234e1ed6d63cf06be5c070442612e7e',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432580700,
+            'upload_date': '20150525',
+            'duration': 2786,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # rtl2
+        'url': 'http://www.nowtv.de/rtl2/berlin-tag-nacht/berlin-tag-nacht-folge-934/player',
+        'info_dict': {
+            'id': '203481',
+            'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934',
+            'ext': 'mp4',
+            'title': 'Berlin - Tag & Nacht (Folge 934)',
+            'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432666800,
+            'upload_date': '20150526',
+            'duration': 2641,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # rtlnitro
+        'url': 'http://www.nowtv.de/rtlnitro/alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00/player',
+        'info_dict': {
+            'id': '165780',
+            'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00',
+            'ext': 'mp4',
+            'title': 'Hals- und Beinbruch',
+            'description': 'md5:b50d248efffe244e6f56737f0911ca57',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432415400,
+            'upload_date': '20150523',
+            'duration': 2742,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # superrtl
+        'url': 'http://www.nowtv.de/superrtl/medicopter-117/angst/player',
+        'info_dict': {
+            'id': '99205',
+            'display_id': 'medicopter-117/angst',
+            'ext': 'mp4',
+            'title': 'Angst!',
+            'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1222632900,
+            'upload_date': '20080928',
+            'duration': 3025,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # ntv
+        'url': 'http://www.nowtv.de/ntv/ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch/player',
+        'info_dict': {
+            'id': '203521',
+            'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch',
+            'ext': 'mp4',
+            'title': 'Thema u.a.: Der erste Blick: Die Apple Watch',
+            'description': 'md5:4312b6c9d839ffe7d8caf03865a531af',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432751700,
+            'upload_date': '20150527',
+            'duration': 1083,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # vox
+        'url': 'http://www.nowtv.de/vox/der-hundeprofi/buero-fall-chihuahua-joel/player',
+        'info_dict': {
+            'id': '128953',
+            'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel',
+            'ext': 'mp4',
+            'title': "Büro-Fall / Chihuahua 'Joel'",
+            'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432408200,
+            'upload_date': '20150523',
+            'duration': 3092,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('id')
+        station = mobj.group('station')
+
+        info = self._download_json(
+            'https://api.nowtv.de/v3/movies/%s?fields=*,format,files' % display_id,
+            display_id)
+
+        video_id = compat_str(info['id'])
+
+        files = info['files']
+        if not files:
+            if info.get('geoblocked', False):
+                raise ExtractorError(
+                    'Video %s is not available from your location due to geo restriction' % video_id,
+                    expected=True)
+            if not info.get('free', True):
+                raise ExtractorError(
+                    'Video %s is not available for free' % video_id, expected=True)
+
+        f = info.get('format', {})
+        station = f.get('station') or station
+
+        STATIONS = {
+            'rtl': 'rtlnow',
+            'rtl2': 'rtl2now',
+            'vox': 'voxnow',
+            'nitro': 'rtlnitronow',
+            'ntv': 'n-tvnow',
+            'superrtl': 'superrtlnow'
+        }
+
+        formats = []
+        for item in files['items']:
+            item_path = remove_start(item['path'], '/')
+            tbr = int_or_none(item['bitrate'])
+            m3u8_url = 'http://hls.fra.%s.de/hls-vod-enc/%s.m3u8' % (STATIONS[station], item_path)
+            m3u8_url = m3u8_url.replace('now/', 'now/videos/')
+            formats.append({
+                'url': m3u8_url,
+                'format_id': '%s-%sk' % (item['id'], tbr),
+                'ext': 'mp4',
+                'tbr': tbr,
+            })
+        self._sort_formats(formats)
+
+        title = info['title']
+        description = info.get('articleLong') or info.get('articleShort')
+        timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ')
+        duration = parse_duration(info.get('duration'))
+        thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo')
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py
deleted file mode 100644
index 785a8045e..000000000
--- a/youtube_dl/extractor/rtlnow.py
+++ /dev/null
@@ -1,174 +0,0 @@
-# encoding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    clean_html,
-    unified_strdate,
-    int_or_none,
-)
-
-
-class RTLnowIE(InfoExtractor):
-    """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW"""
-    _VALID_URL = r'''(?x)
-                        (?:https?://)?
-                        (?P<url>
-                            (?P<domain>
-                                rtl-now\.rtl\.de|
-                                rtl2now\.rtl2\.de|
-                                (?:www\.)?voxnow\.de|
-                                (?:www\.)?rtlnitronow\.de|
-                                (?:www\.)?superrtlnow\.de|
-                                (?:www\.)?n-tvnow\.de)
-                            /+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?
-                            (?:container_id|film_id)=(?P<video_id>[0-9]+)&
-                            player=1(?:&season=[0-9]+)?(?:&.*)?
-                        )'''
-
-    _TESTS = [
-        {
-            'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
-            'info_dict': {
-                'id': '90419',
-                'ext': 'flv',
-                'title': 'Ahornallee - Folge 1 - Der Einzug',
-                'description': 'md5:ce843b6b5901d9a7f7d04d1bbcdb12de',
-                'upload_date': '20070416',
-                'duration': 1685,
-            },
-            'params': {
-                'skip_download': True,
-            },
-            'skip': 'Only works from Germany',
-        },
-        {
-            'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5',
-            'info_dict': {
-                'id': '69756',
-                'ext': 'flv',
-                'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.',
-                'description': 'md5:3fb247005ed21a935ffc82b7dfa70cf0',
-                'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg',
-                'upload_date': '20120519',
-                'duration': 1245,
-            },
-            'params': {
-                'skip_download': True,
-            },
-            'skip': 'Only works from Germany',
-        },
-        {
-            'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17',
-            'info_dict': {
-                'id': '13883',
-                'ext': 'flv',
-                'title': 'Voxtours - Südafrika-Reporter II',
-                'description': 'md5:de7f8d56be6fd4fed10f10f57786db00',
-                'upload_date': '20090627',
-                'duration': 1800,
-            },
-            'params': {
-                'skip_download': True,
-            },
-        },
-        {
-            'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1',
-            'info_dict': {
-                'id': '99205',
-                'ext': 'flv',
-                'title': 'Medicopter 117 - Angst!',
-                'description': 're:^Im Therapiezentrum \'Sonnalm\' kommen durch eine Unachtsamkeit die für die B.handlung mit Phobikern gehaltenen Voglespinnen frei\. Eine Ausreißerin',
-                'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg',
-                'upload_date': '20080928',
-                'duration': 2691,
-            },
-            'params': {
-                'skip_download': True,
-            },
-        },
-        {
-            'url': 'http://rtl-now.rtl.de/der-bachelor/folge-4.php?film_id=188729&player=1&season=5',
-            'info_dict': {
-                'id': '188729',
-                'ext': 'flv',
-                'upload_date': '20150204',
-                'description': 'md5:5e1ce23095e61a79c166d134b683cecc',
-                'title': 'Der Bachelor - Folge 4',
-            }
-        }, {
-            'url': 'http://www.n-tvnow.de/deluxe-alles-was-spass-macht/thema-ua-luxushotel-fuer-vierbeiner.php?container_id=153819&player=1&season=0',
-            'only_matching': True,
-        },
-    ]
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_page_url = 'http://%s/' % mobj.group('domain')
-        video_id = mobj.group('video_id')
-
-        webpage = self._download_webpage('http://' + mobj.group('url'), video_id)
-
-        mobj = re.search(r'(?s)<div style="margin-left: 20px; font-size: 13px;">(.*?)<div id="playerteaser">', webpage)
-        if mobj:
-            raise ExtractorError(clean_html(mobj.group(1)), expected=True)
-
-        title = self._og_search_title(webpage)
-        description = self._og_search_description(webpage)
-        thumbnail = self._og_search_thumbnail(webpage, default=None)
-
-        upload_date = unified_strdate(self._html_search_meta('uploadDate', webpage, 'upload date'))
-
-        mobj = re.search(r'<meta itemprop="duration" content="PT(?P<seconds>\d+)S" />', webpage)
-        duration = int(mobj.group('seconds')) if mobj else None
-
-        playerdata_url = self._html_search_regex(
-            r"'playerdata': '(?P<playerdata_url>[^']+)'", webpage, 'playerdata_url')
-
-        playerdata = self._download_xml(playerdata_url, video_id, 'Downloading player data XML')
-
-        videoinfo = playerdata.find('./playlist/videoinfo')
-
-        formats = []
-        for filename in videoinfo.findall('filename'):
-            mobj = re.search(r'(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>.+)', filename.text)
-            if mobj:
-                fmt = {
-                    'url': mobj.group('url'),
-                    'play_path': 'mp4:' + mobj.group('play_path'),
-                    'page_url': video_page_url,
-                    'player_url': video_page_url + 'includes/vodplayer.swf',
-                }
-            else:
-                mobj = re.search(r'.*/(?P<hoster>[^/]+)/videos/(?P<play_path>.+)\.f4m', filename.text)
-                if mobj:
-                    fmt = {
-                        'url': 'rtmpe://fms.rtl.de/' + mobj.group('hoster'),
-                        'play_path': 'mp4:' + mobj.group('play_path'),
-                        'page_url': url,
-                        'player_url': video_page_url + 'includes/vodplayer.swf',
-                    }
-                else:
-                    fmt = {
-                        'url': filename.text,
-                    }
-            fmt.update({
-                'width': int_or_none(filename.get('width')),
-                'height': int_or_none(filename.get('height')),
-                'vbr': int_or_none(filename.get('bitrate')),
-                'ext': 'flv',
-            })
-            formats.append(fmt)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'upload_date': upload_date,
-            'duration': duration,
-            'formats': formats,
-        }
diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py
index d3b8a1be4..9c53704ea 100644
--- a/youtube_dl/extractor/senateisvp.py
+++ b/youtube_dl/extractor/senateisvp.py
@@ -48,7 +48,7 @@ class SenateISVPIE(InfoExtractor):
         ["arch", "", "http://ussenate-f.akamaihd.net/"]
     ]
     _IE_NAME = 'senate.gov'
-    _VALID_URL = r'http://www\.senate\.gov/isvp/\?(?P<qs>.+)'
+    _VALID_URL = r'http://www\.senate\.gov/isvp/?\?(?P<qs>.+)'
     _TESTS = [{
         'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',
         'info_dict': {
@@ -72,12 +72,16 @@ class SenateISVPIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Integrated Senate Video Player'
         }
+    }, {
+        # From http://www.c-span.org/video/?96791-1
+        'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715',
+        'only_matching': True,
     }]

     @staticmethod
     def _search_iframe_url(webpage):
         mobj = re.search(
-            r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/\?[^'\"]+)['\"]",
+            r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]",
            webpage)
         if mobj:
             return mobj.group('url')
diff --git a/youtube_dl/extractor/soompi.py b/youtube_dl/extractor/soompi.py
new file mode 100644
index 000000000..5da66ca9e
--- /dev/null
+++ b/youtube_dl/extractor/soompi.py
@@ -0,0 +1,146 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .crunchyroll import CrunchyrollIE
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    remove_start,
+    xpath_text,
+)
+
+
+class SoompiBaseIE(InfoExtractor):
+    def _get_episodes(self, webpage, episode_filter=None):
+        episodes = self._parse_json(
+            self._search_regex(
+                r'VIDEOS\s*=\s*(\[.+?\]);', webpage, 'episodes JSON'),
+            None)
+        return list(filter(episode_filter, episodes))
+
+
+class SoompiIE(SoompiBaseIE, CrunchyrollIE):
+    IE_NAME = 'soompi'
+    _VALID_URL = r'https?://tv\.soompi\.com/(?:en/)?watch/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://tv.soompi.com/en/watch/29235',
+        'info_dict': {
+            'id': '29235',
+            'ext': 'mp4',
+            'title': 'Episode 1096',
+            'description': '2015-05-20'
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _get_episode(self, webpage, video_id):
+        return self._get_episodes(webpage, lambda x: x['id'] == video_id)[0]
+
+    def _get_subtitles(self, config, video_id):
+        sub_langs = {}
+        for subtitle in config.findall('./{default}preload/subtitles/subtitle'):
+            sub_langs[subtitle.attrib['id']] = subtitle.attrib['title']
+
+        subtitles = {}
+        for s in config.findall('./{default}preload/subtitle'):
+            lang_code = sub_langs.get(s.attrib['id'])
+            if not lang_code:
+                continue
+            sub_id = s.get('id')
+            data = xpath_text(s, './data', 'data')
+            iv = xpath_text(s, './iv', 'iv')
+            if not id or not iv or not data:
+                continue
+            subtitle = self._decrypt_subtitles(data, iv, sub_id).decode('utf-8')
+            subtitles[lang_code] = self._extract_subtitles(subtitle)
+        return subtitles
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        try:
+            webpage = self._download_webpage(
+                url, video_id, 'Downloading episode page')
+        except ExtractorError as ee:
+            if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
+                webpage = ee.cause.read()
+                block_message = self._html_search_regex(
+                    r'(?s)<div class="block-message">(.+?)</div>', webpage,
+                    'block message', default=None)
+                if block_message:
+                    raise ExtractorError(block_message, expected=True)
+            raise
+
+        formats = []
+        config = None
+        for format_id in re.findall(r'\?quality=([0-9a-zA-Z]+)', webpage):
+            config = self._download_xml(
+                'http://tv.soompi.com/en/show/_/%s-config.xml?mode=hls&quality=%s' % (video_id, format_id),
+                video_id, 'Downloading %s XML' % format_id)
+            m3u8_url = xpath_text(
+                config, './{default}preload/stream_info/file',
+                '%s m3u8 URL' % format_id)
+            if not m3u8_url:
+                continue
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', m3u8_id=format_id))
+        self._sort_formats(formats)
+
+        episode = self._get_episode(webpage, video_id)
+
+        title = episode['name']
+        description = episode.get('description')
+        duration = int_or_none(episode.get('duration'))
+
+        thumbnails = [{
+            'id': thumbnail_id,
+            'url': thumbnail_url,
+        } for thumbnail_id, thumbnail_url in episode.get('img_url', {}).items()]
+
+        subtitles = self.extract_subtitles(config, video_id)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnails': thumbnails,
+            'duration': duration,
+            'formats': formats,
+            'subtitles': subtitles
+        }
+
+
+class SoompiShowIE(SoompiBaseIE):
+    IE_NAME = 'soompi:show'
+    _VALID_URL = r'https?://tv\.soompi\.com/en/shows/(?P<id>[0-9a-zA-Z\-_]+)'
+    _TESTS = [{
+        'url': 'http://tv.soompi.com/en/shows/liar-game',
+        'info_dict': {
+            'id': 'liar-game',
+            'title': 'Liar Game',
+            'description': 'md5:52c02bce0c1a622a95823591d0589b66',
+        },
+        'playlist_count': 14,
+    }]
+
+    def _real_extract(self, url):
+        show_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            url, show_id, 'Downloading show page')
+
+        title = remove_start(self._og_search_title(webpage), 'SoompiTV | ')
+        description = self._og_search_description(webpage)
+
+        entries = [
+            self.url_result('http://tv.soompi.com/en/watch/%s' % episode['id'], 'Soompi')
+            for episode in self._get_episodes(webpage)]
+
+        return self.playlist_result(entries, show_id, title, description)
diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py
index 98cf92d89..359722ad6 100644
--- a/youtube_dl/extractor/spiegeltv.py
+++ b/youtube_dl/extractor/spiegeltv.py
@@ -51,9 +51,9 @@ class SpiegeltvIE(InfoExtractor):
         is_wide = media_json['is_wide']

         server_json = self._download_json(
-            'http://www.spiegel.tv/streaming_servers/', video_id,
-            note='Downloading server information')
-        server = server_json[0]['endpoint']
+            'http://spiegeltv-prod-static.s3.amazonaws.com/projectConfigs/projectConfig.json',
+            video_id, note='Downloading server information')
+        server = server_json['streamingserver'][0]['endpoint']

         thumbnails = []
         for image in media_json['images']:
@@ -76,5 +76,6 @@ class SpiegeltvIE(InfoExtractor):
             'ext': 'm4v',
             'description': description,
             'duration': duration,
-            'thumbnails': thumbnails
+            'thumbnails': thumbnails,
+            'rtmp_live': True,
         }
diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py
index 656410528..3a68eaa80 100644
--- a/youtube_dl/extractor/tf1.py
+++ b/youtube_dl/extractor/tf1.py
@@ -6,7 +6,7 @@ from .common import InfoExtractor

 class TF1IE(InfoExtractor):
     """TF1 uses the wat.tv player."""
-    _VALID_URL = r'http://(?:videos\.tf1|www\.tfou|www\.tf1)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html'
+    _VALID_URL = r'http://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html'
     _TESTS = [{
         'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
         'info_dict': {
@@ -35,6 +35,9 @@ class TF1IE(InfoExtractor):
     }, {
         'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html',
         'only_matching': True,
+    }, {
+        'url': 'http://lci.tf1.fr/sept-a-huit/videos/sept-a-huit-du-24-mai-2015-8611550.html',
+        'only_matching': True,
     }]

     def _real_extract(self, url):
diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py
index d73ad3762..6ca8840b0 100644
--- a/youtube_dl/extractor/tube8.py
+++ b/youtube_dl/extractor/tube8.py
@@ -47,7 +47,7 @@ class Tube8IE(InfoExtractor):
         webpage = self._download_webpage(req, display_id)

         flashvars = json.loads(self._html_search_regex(
-            r'var flashvars\s*=\s*({.+?})', webpage, 'flashvars'))
+            r'flashvars\s*=\s*({.+?})', webpage, 'flashvars'))

         video_url = flashvars['video_url']
         if flashvars.get('encrypted') is True:
@@ -58,19 +58,19 @@ class Tube8IE(InfoExtractor):
         thumbnail = flashvars.get('image_url')

         title = self._html_search_regex(
-            r'videotitle\s*=\s*"([^"]+)', webpage, 'title')
+            r'videoTitle\s*=\s*"([^"]+)', webpage, 'title')
         description = self._html_search_regex(
-            r'>Description:</strong>(.+?)<', webpage, 'description', fatal=False)
+            r'>Description:</strong>\s*(.+?)\s*<', webpage, 'description', fatal=False)
         uploader = self._html_search_regex(
-            r'<strong class="video-username">(?:<a href="[^"]+">)?([^<]+)(?:</a>)?</strong>',
+            r'<span class="username">\s*(.+?)\s*<',
             webpage, 'uploader', fatal=False)

         like_count = int_or_none(self._html_search_regex(
-            r"rupVar\s*=\s*'(\d+)'", webpage, 'like count', fatal=False))
+            r'rupVar\s*=\s*"(\d+)"', webpage, 'like count', fatal=False))
         dislike_count = int_or_none(self._html_search_regex(
-            r"rdownVar\s*=\s*'(\d+)'", webpage, 'dislike count', fatal=False))
+            r'rdownVar\s*=\s*"(\d+)"', webpage, 'dislike count', fatal=False))
         view_count = self._html_search_regex(
-            r'<strong>Views: </strong>([\d,\.]+)</li>', webpage, 'view count', fatal=False)
+            r'<strong>Views: </strong>([\d,\.]+)\s*</li>', webpage, 'view count', fatal=False)
         if view_count:
             view_count = str_to_int(view_count)
         comment_count = self._html_search_regex(
diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py
new file mode 100644
index 000000000..2c4b21807
--- /dev/null
+++ b/youtube_dl/extractor/tubitv.py
@@ -0,0 +1,84 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import codecs
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse,
+    compat_urllib_request
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+)
+
+
+class TubiTvIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tubitv\.com/video\?id=(?P<id>[0-9]+)'
+    _LOGIN_URL = 'http://tubitv.com/login'
+    _NETRC_MACHINE = 'tubitv'
+    _TEST = {
+        'url': 'http://tubitv.com/video?id=54411&title=The_Kitchen_Musical_-_EP01',
+        'info_dict': {
+            'id': '54411',
+            'ext': 'mp4',
+            'title': 'The Kitchen Musical - EP01',
+            'thumbnail': 're:^https?://.*\.png$',
+            'description': 'md5:37532716166069b353e8866e71fefae7',
+            'duration': 2407,
+        },
+        'params': {
+            'skip_download': 'HLS download',
+        },
+    }
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+        self.report_login()
+        form_data = {
+            'username': username,
+            'password': password,
+        }
+        payload = compat_urllib_parse.urlencode(form_data).encode('utf-8')
+        request = compat_urllib_request.Request(self._LOGIN_URL, payload)
+        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        login_page = self._download_webpage(
+            request, None, False, 'Wrong login info')
+        if not re.search(r'id="tubi-logout"', login_page):
+            raise ExtractorError(
+                'Login failed (invalid username/password)', expected=True)
+
+    def _real_initialize(self):
+        self._login()
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        if re.search(r"<(?:DIV|div) class='login-required-screen'>", webpage):
+            raise ExtractorError(
+                'This video requires login, use --username and --password '
+                'options to provide account credentials.', expected=True)
+
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+        duration = int_or_none(self._html_search_meta(
+            'video:duration', webpage, 'duration'))
+
+        apu = self._search_regex(r"apu='([^']+)'", webpage, 'apu')
+        m3u8_url = codecs.decode(apu, 'rot_13')[::-1]
+        formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'description': description,
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py
index e6ee1e471..f38a72fde 100644
--- a/youtube_dl/extractor/vgtv.py
+++ b/youtube_dl/extractor/vgtv.py
@@ -4,7 +4,10 @@ from __future__ import unicode_literals
 import re

 from .common import InfoExtractor
-from ..utils import float_or_none
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+)


 class VGTVIE(InfoExtractor):
@@ -59,16 +62,16 @@ class VGTVIE(InfoExtractor):
         },
         {
             # streamType: live
-            'url': 'http://www.vgtv.no/#!/live/100015/direkte-her-kan-du-se-laksen-live-fra-suldalslaagen',
+            'url': 'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla',
             'info_dict': {
-                'id': '100015',
+                'id': '113063',
                 'ext': 'flv',
-                'title': 'DIREKTE: Her kan du se laksen live fra Suldalslågen!',
-                'description': 'md5:9a60cc23fa349f761628924e56eeec2d',
+                'title': 're:^DIREKTE: V75 fra Solvalla [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+                'description': 'md5:b3743425765355855f88e096acc93231',
                 'thumbnail': 're:^https?://.*\.jpg',
                 'duration': 0,
-                'timestamp': 1407423348,
-                'upload_date': '20140807',
+                'timestamp': 1432975582,
+                'upload_date': '20150530',
                 'view_count': int,
             },
             'params': {
@@ -97,7 +100,12 @@ class VGTVIE(InfoExtractor):
             % (host, video_id, HOST_WEBSITES[host]),
             video_id, 'Downloading media JSON')

+        if data.get('status') == 'inactive':
+            raise ExtractorError(
+                'Video %s is no longer available' % video_id, expected=True)
+
         streams = data['streamUrls']
+        stream_type = data.get('streamType')

         formats = []

@@ -107,7 +115,8 @@ class VGTVIE(InfoExtractor):
                 hls_url, video_id, 'mp4', m3u8_id='hls'))

         hds_url = streams.get('hds')
-        if hds_url:
+        # wasLive hds are always 404
+        if hds_url and stream_type != 'wasLive':
             formats.extend(self._extract_f4m_formats(
                 hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
                 video_id, f4m_id='hds'))
@@ -135,13 +144,14 @@ class VGTVIE(InfoExtractor):

         return {
             'id': video_id,
-            'title': data['title'],
+            'title': self._live_title(data['title']),
             'description': data['description'],
             'thumbnail': data['images']['main'] + '?t[]=900x506q80',
             'timestamp': data['published'],
             'duration': float_or_none(data['duration'], 1000),
             'view_count': data['displays'],
             'formats': formats,
+            'is_live': True if stream_type == 'live' else False,
         }
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 0301682b8..fcdbfe0bc 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -1399,6 +1399,26 @@ class YoutubeChannelIE(InfoExtractor):
         channel_id = self._match_id(url)

         url = self._TEMPLATE_URL % channel_id
+
+        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
+        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
+        # otherwise fallback on channel by page extraction
+        channel_page = self._download_webpage(
+            url + '?view=57', channel_id,
+            'Downloading channel page', fatal=False)
+        channel_playlist_id = self._search_regex(
+            [r'<meta itemprop="channelId" content="([^"]+)">',
+             r'data-channel-external-id="([^"]+)"'],
+            channel_page, 'channel id', default=None)
+        if channel_playlist_id and channel_playlist_id.startswith('UC'):
+            playlist_id = 'UU' + channel_playlist_id[2:]
+            channel_playlist = unescapeHTML(self._search_regex(
+                r'href="/?(watch\?v=[0-9A-Za-z_-]{11}&list=%s)"' % playlist_id,
+                channel_page, 'channel playlist URL', default=None))
+            if channel_playlist:
+                return self.url_result(
+                    compat_urlparse.urljoin(url, '/%s' % channel_playlist), 'YoutubePlaylist')
+
         channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
         autogenerated = re.search(r'''(?x)
                 class="[^"]*?(?:
diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py
index 8f825f785..774494efd 100644
--- a/youtube_dl/postprocessor/embedthumbnail.py
+++ b/youtube_dl/postprocessor/embedthumbnail.py
@@ -49,7 +49,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
             os.remove(encodeFilename(filename))
             os.rename(encodeFilename(temp_filename), encodeFilename(filename))

-        elif info['ext'] == 'm4a':
+        elif info['ext'] in ['m4a', 'mp4']:
             if not check_executable('AtomicParsley', ['-v']):
                 raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.')

@@ -82,6 +82,6 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
             os.remove(encodeFilename(filename))
             os.rename(encodeFilename(temp_filename), encodeFilename(filename))
         else:
-            raise EmbedThumbnailPPError('Only mp3 and m4a are supported for thumbnail embedding for now.')
+            raise EmbedThumbnailPPError('Only mp3 and m4a/mp4 are supported for thumbnail embedding for now.')

         return [], info
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index b33385153..653710131 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals

-__version__ = '2015.05.20'
+__version__ = '2015.05.29'
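As an aside, the YoutubeDL.py hunk above introduces de-duplication of colliding format_ids before format selection. A minimal standalone sketch of that logic follows; the sample formats list is made up, and the grouping is condensed with setdefault, but the renaming mirrors the patch:

# Sketch of the format_id de-duplication added in youtube_dl/YoutubeDL.py above.
# The `formats` list is hypothetical example data.
formats = [
    {'format_id': 'hls', 'url': 'http://example.com/a.m3u8'},
    {'format_id': 'hls', 'url': 'http://example.com/b.m3u8'},
    {'format_id': 'http', 'url': 'http://example.com/c.mp4'},
]

# Group formats by their declared format_id.
formats_dict = {}
for fmt in formats:
    formats_dict.setdefault(fmt['format_id'], []).append(fmt)

# Make sure all formats have a unique format_id by suffixing duplicates.
for format_id, ambiguous_formats in formats_dict.items():
    if len(ambiguous_formats) > 1:
        for i, fmt in enumerate(ambiguous_formats):
            fmt['format_id'] = '%s-%d' % (format_id, i)

print([f['format_id'] for f in formats])  # ['hls-0', 'hls-1', 'http']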