diff options
42 files changed, 1197 insertions, 636 deletions
| @@ -140,3 +140,6 @@ Behrouz Abbasi  ngld  nyuszika7h  Shaun Walbridge +Lee Jenkins +Anssi Hannula +Lukáš Lalinský @@ -9,6 +9,7 @@ youtube-dl - download videos from youtube.com or other video platforms  - [VIDEO SELECTION](#video-selection)  - [FAQ](#faq)  - [DEVELOPER INSTRUCTIONS](#developer-instructions) +- [EMBEDDING YOUTUBE-DL](#embedding-youtube-dl)  - [BUGS](#bugs)  - [COPYRIGHT](#copyright) @@ -261,7 +262,7 @@ For example:  machine youtube login myaccount@gmail.com password my_youtube_password  machine twitch login my_twitch_account_name password my_twitch_password  ``` -To activate authentication with `.netrc` file you should pass `--netrc` to youtube-dl or to place it in [configuration file](#configuration). +To activate authentication with `.netrc` file you should pass `--netrc` to youtube-dl or place it in [configuration file](#configuration).  On Windows you may also need to setup `%HOME%` environment variable manually. @@ -277,8 +278,8 @@ The `-o` option allows users to indicate a template for the output file names. T   - `ext`: The sequence will be replaced by the appropriate extension (like flv or mp4).   - `epoch`: The sequence will be replaced by the Unix epoch when creating the file.   - `autonumber`: The sequence will be replaced by a five-digit number that will be increased with each download, starting at zero. - - `playlist`: The name or the id of the playlist that contains the video. - - `playlist_index`: The index of the video in the playlist, a five-digit number. + - `playlist`: The sequence will be replaced by the name or the id of the playlist that contains the video. + - `playlist_index`: The sequence will be replaced by the index of the video in the playlist padded with leading zeros according to the total length of the playlist.   - `format_id`: The sequence will be replaced by the format code specified by `--format`.  The current default template is `%(title)s-%(id)s.%(ext)s`. diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 04b9959ac..ab153af6b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -122,7 +122,6 @@   - **defense.gouv.fr**   - **DHM**: Filmarchiv - Deutsches Historisches Museum   - **Discovery** - - **divxstage**: DivxStage   - **Dotsub**   - **DouyuTV**: 斗鱼   - **dramafever** @@ -195,7 +194,7 @@   - **GodTube**   - **GoldenMoustache**   - **Golem** - - **GorillaVid**: GorillaVid.in, daclips.in, movpod.in, fastvideo.in and realvid.net + - **GorillaVid**: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net and filehoot.com   - **Goshgay**   - **Groupon**   - **Hark** @@ -286,7 +285,7 @@   - **Minhateca**   - **MinistryGrid**   - **miomio.tv** - - **mitele.es** + - **MiTele**: mitele.es   - **mixcloud**   - **MLB**   - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net @@ -309,7 +308,6 @@   - **mtvservices:embedded**   - **MuenchenTV**: münchen.tv   - **MusicPlayOn** - - **MusicVault**   - **muzu.tv**   - **Mwave**   - **MySpace** @@ -318,7 +316,6 @@   - **Myvi**   - **myvideo**   - **MyVidster** - - **N-JOY**   - **n-tv.de**   - **NationalGeographic**   - **Naver** @@ -327,7 +324,9 @@   - **NBCNews**   - **NBCSports**   - **NBCSportsVPlayer** - - **ndr**: NDR.de - Mediathek + - **ndr**: NDR.de - Norddeutscher Rundfunk + - **ndr:embed** + - **ndr:embed:base**   - **NDTV**   - **NerdCubedFeed**   - **Nerdist** @@ -350,12 +349,16 @@   - **nhl.com:videocenter**: NHL videocenter category   - **niconico**: ニコニコ動画   - **NiconicoPlaylist** + - **njoy**: N-JOY + - **njoy:embed**   - **Noco**   - **Normalboots**   - **NosVideo**   - **Nova**: TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz   - **novamov**: NovaMov - - **Nowness** + - **nowness** + - **nowness:playlist** + - **nowness:series**   - **NowTV**   - **nowvideo**: NowVideo   - **npo**: npo.nl and ntr.nl @@ -376,7 +379,6 @@   - **OnionStudios**   - **Ooyala**   - **OoyalaExternal** - - **OpenFilm**   - **orf:fm4**: radio FM4   - **orf:iptv**: iptv.ORF.at   - **orf:oe1**: Radio Österreich 1 @@ -531,7 +533,7 @@   - **techtv.mit.edu**   - **ted**   - **TeleBruxelles** - - **telecinco.es** + - **Telecinco**: telecinco.es, cuatro.com and mediaset.es   - **Telegraaf**   - **TeleMB**   - **TeleTask** @@ -633,6 +635,7 @@   - **vine:user**   - **vk**: VK   - **vk:uservideos**: VK - User's Videos + - **vlive**   - **Vodlocker**   - **VoiceRepublic**   - **Vporn** diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index e32bef279..1ff42d94b 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -81,6 +81,11 @@ except ImportError:      import BaseHTTPServer as compat_http_server  try: +    compat_str = unicode  # Python 2 +except NameError: +    compat_str = str + +try:      from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes      from urllib.parse import unquote as compat_urllib_parse_unquote      from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus @@ -100,7 +105,7 @@ except ImportError:  # Python 2              # Is it a string-like object?              string.split              return b'' -        if isinstance(string, unicode): +        if isinstance(string, compat_str):              string = string.encode('utf-8')          bits = string.split(b'%')          if len(bits) == 1: @@ -151,11 +156,6 @@ except ImportError:  # Python 2          return compat_urllib_parse_unquote(string, encoding, errors)  try: -    compat_str = unicode  # Python 2 -except NameError: -    compat_str = str - -try:      compat_basestring = basestring  # Python 2  except NameError:      compat_basestring = str @@ -234,7 +234,7 @@ else:      # Working around shlex issue with unicode strings on some python 2      # versions (see http://bugs.python.org/issue1548891)      def compat_shlex_split(s, comments=False, posix=True): -        if isinstance(s, unicode): +        if isinstance(s, compat_str):              s = s.encode('utf-8')          return shlex.split(s, comments, posix) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 71aafdc73..a62d2047b 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -28,10 +28,19 @@ class HlsFD(FileDownloader):              return False          ffpp.check_version() -        args = [ -            encodeArgument(opt) -            for opt in (ffpp.executable, '-y', '-i', url, '-f', 'mp4', '-c', 'copy', '-bsf:a', 'aac_adtstoasc')] -        args.append(encodeFilename(tmpfilename, True)) +        args = [ffpp.executable, '-y'] + +        if info_dict['http_headers']: +            # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: +            # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. +            args += [ +                '-headers', +                ''.join('%s: %s\r\n' % (key, val) for key, val in info_dict['http_headers'].items())] + +        args += ['-i', url, '-f', 'mp4', '-c', 'copy', '-bsf:a', 'aac_adtstoasc'] + +        args = [encodeArgument(opt) for opt in args] +        args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True))          self._debug_cmd(args) @@ -92,6 +101,7 @@ class NativeHlsFD(FragmentFD):                  return False              down, frag_sanitized = sanitize_open(frag_filename, 'rb')              ctx['dest_stream'].write(down.read()) +            down.close()              frags_filenames.append(frag_sanitized)          self._finish_frag_download(ctx) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f8d4c8462..a73a1317e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -138,7 +138,6 @@ from .dump import DumpIE  from .dumpert import DumpertIE  from .defense import DefenseGouvFrIE  from .discovery import DiscoveryIE -from .divxstage import DivxStageIE  from .dropbox import DropboxIE  from .eagleplatform import EaglePlatformIE  from .ebaumsworld import EbaumsWorldIE @@ -226,7 +225,6 @@ from .historicfilms import HistoricFilmsIE  from .history import HistoryIE  from .hitbox import HitboxIE, HitboxLiveIE  from .hornbunny import HornBunnyIE -from .hostingbulk import HostingBulkIE  from .hotnewhiphop import HotNewHipHopIE  from .howcast import HowcastIE  from .howstuffworks import HowStuffWorksIE @@ -364,6 +362,9 @@ from .nbc import (  from .ndr import (      NDRIE,      NJoyIE, +    NDREmbedBaseIE, +    NDREmbedIE, +    NJoyEmbedIE,  )  from .ndtv import NDTVIE  from .netzkino import NetzkinoIE @@ -399,7 +400,11 @@ from .normalboots import NormalbootsIE  from .nosvideo import NosVideoIE  from .nova import NovaIE  from .novamov import NovaMovIE -from .nowness import NownessIE +from .nowness import ( +    NownessIE, +    NownessPlaylistIE, +    NownessSeriesIE, +)  from .nowtv import NowTVIE  from .nowvideo import NowVideoIE  from .npo import ( @@ -429,7 +434,6 @@ from .ooyala import (      OoyalaIE,      OoyalaExternalIE,  ) -from .openfilm import OpenFilmIE  from .orf import (      ORFTVthekIE,      ORFOE1IE, diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 76de24477..2a00da3ee 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -4,6 +4,10 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from ..compat import ( +    compat_parse_qs, +    compat_urllib_parse_urlparse, +)  from ..utils import (      find_xpath_attr,      unified_strdate, @@ -77,7 +81,13 @@ class ArteTVPlus7IE(InfoExtractor):      def _extract_from_webpage(self, webpage, video_id, lang):          json_url = self._html_search_regex(              [r'arte_vp_url=["\'](.*?)["\']', r'data-url=["\']([^"]+)["\']'], -            webpage, 'json vp url') +            webpage, 'json vp url', default=None) +        if not json_url: +            iframe_url = self._html_search_regex( +                r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1', +                webpage, 'iframe url', group='url') +            json_url = compat_parse_qs( +                compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0]          return self._extract_from_json_url(json_url, video_id, lang)      def _extract_from_json_url(self, json_url, video_id, lang): diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index abc5a44a1..42526357a 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -21,6 +21,7 @@ class BBCCoUkIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'      _MEDIASELECTOR_URLS = [ +        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',          'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',      ] @@ -189,6 +190,12 @@ class BBCCoUkIE(InfoExtractor):              # Skip DASH until supported              elif transfer_format == 'dash':                  pass +            elif transfer_format == 'hls': +                m3u8_formats = self._extract_m3u8_formats( +                    href, programme_id, ext='mp4', entry_protocol='m3u8_native', +                    m3u8_id=supplier, fatal=False) +                if m3u8_formats: +                    formats.extend(m3u8_formats)              # Direct link              else:                  formats.append({ diff --git a/youtube_dl/extractor/clubic.py b/youtube_dl/extractor/clubic.py index 14f215c5c..1dfa7c12e 100644 --- a/youtube_dl/extractor/clubic.py +++ b/youtube_dl/extractor/clubic.py @@ -12,9 +12,9 @@ from ..utils import (  class ClubicIE(InfoExtractor): -    _VALID_URL = r'http://(?:www\.)?clubic\.com/video/[^/]+/video.*-(?P<id>[0-9]+)\.html' +    _VALID_URL = r'http://(?:www\.)?clubic\.com/video/(?:[^/]+/)*video.*-(?P<id>[0-9]+)\.html' -    _TEST = { +    _TESTS = [{          'url': 'http://www.clubic.com/video/clubic-week/video-clubic-week-2-0-le-fbi-se-lance-dans-la-photo-d-identite-448474.html',          'md5': '1592b694ba586036efac1776b0b43cd3',          'info_dict': { @@ -24,7 +24,10 @@ class ClubicIE(InfoExtractor):              'description': 're:Gueule de bois chez Nokia. Le constructeur a indiqué cette.*',              'thumbnail': 're:^http://img\.clubic\.com/.*\.jpg$',          } -    } +    }, { +        'url': 'http://www.clubic.com/video/video-clubic-week-2-0-apple-iphone-6s-et-plus-mais-surtout-le-pencil-469792.html', +        'only_matching': True, +    }]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5eeeda08d..1e7db8a9b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -516,6 +516,12 @@ class InfoExtractor(object):              '%s. Use --username and --password or --netrc to provide account credentials.' % msg,              expected=True) +    @staticmethod +    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'): +        raise ExtractorError( +            '%s. You might want to use --proxy to workaround.' % msg, +            expected=True) +      # Methods for following #608      @staticmethod      def url_result(url, ie=None, video_id=None, video_title=None): @@ -731,8 +737,9 @@ class InfoExtractor(object):      @staticmethod      def _hidden_inputs(html): +        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)          hidden_inputs = {} -        for input in re.findall(r'<input([^>]+)>', html): +        for input in re.findall(r'(?i)<input([^>]+)>', html):              if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):                  continue              name = re.search(r'name=(["\'])(?P<value>.+?)\1', input) @@ -746,7 +753,7 @@ class InfoExtractor(object):      def _form_hidden_inputs(self, form_id, html):          form = self._search_regex( -            r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id, +            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,              html, '%s form' % form_id, group='form')          return self._hidden_inputs(form) diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index 3db4db4e4..d6949ca28 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -2,7 +2,6 @@  from __future__ import unicode_literals  import re -import json  from .common import InfoExtractor  from ..compat import ( @@ -24,16 +23,28 @@ class CondeNastIE(InfoExtractor):      # The keys are the supported sites and the values are the name to be shown      # to the user and in the extractor description.      _SITES = { -        'wired': 'WIRED', +        'allure': 'Allure', +        'architecturaldigest': 'Architectural Digest', +        'arstechnica': 'Ars Technica', +        'bonappetit': 'Bon Appétit', +        'brides': 'Brides', +        'cnevids': 'Condé Nast', +        'cntraveler': 'Condé Nast Traveler', +        'details': 'Details', +        'epicurious': 'Epicurious', +        'glamour': 'Glamour', +        'golfdigest': 'Golf Digest',          'gq': 'GQ', +        'newyorker': 'The New Yorker', +        'self': 'SELF', +        'teenvogue': 'Teen Vogue', +        'vanityfair': 'Vanity Fair',          'vogue': 'Vogue', -        'glamour': 'Glamour', +        'wired': 'WIRED',          'wmagazine': 'W Magazine', -        'vanityfair': 'Vanity Fair', -        'cnevids': 'Condé Nast',      } -    _VALID_URL = r'http://(video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys()) +    _VALID_URL = r'http://(?:video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys())      IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values()))      EMBED_URL = r'(?:https?:)?//player\.(?P<site>%s)\.com/(?P<type>embed)/.+?' % '|'.join(_SITES.keys()) @@ -86,8 +97,8 @@ class CondeNastIE(InfoExtractor):          info_url = base_info_url + data          info_page = self._download_webpage(info_url, video_id,                                             'Downloading video info') -        video_info = self._search_regex(r'var video = ({.+?});', info_page, 'video info') -        video_info = json.loads(video_info) +        video_info = self._search_regex(r'var\s+video\s*=\s*({.+?});', info_page, 'video info') +        video_info = self._parse_json(video_info, video_id)          formats = [{              'format_id': '%s-%s' % (fdata['type'].split('/')[-1], fdata['quality']), diff --git a/youtube_dl/extractor/divxstage.py b/youtube_dl/extractor/divxstage.py deleted file mode 100644 index b88379e06..000000000 --- a/youtube_dl/extractor/divxstage.py +++ /dev/null @@ -1,27 +0,0 @@ -from __future__ import unicode_literals - -from .novamov import NovaMovIE - - -class DivxStageIE(NovaMovIE): -    IE_NAME = 'divxstage' -    IE_DESC = 'DivxStage' - -    _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'divxstage\.(?:eu|net|ch|co|at|ag|to)'} - -    _HOST = 'www.divxstage.eu' - -    _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' -    _TITLE_REGEX = r'<div class="video_det">\s*<strong>([^<]+)</strong>' -    _DESCRIPTION_REGEX = r'<div class="video_det">\s*<strong>[^<]+</strong>\s*<p>([^<]+)</p>' - -    _TEST = { -        'url': 'http://www.divxstage.eu/video/57f238e2e5e01', -        'md5': '63969f6eb26533a1968c4d325be63e72', -        'info_dict': { -            'id': '57f238e2e5e01', -            'ext': 'flv', -            'title': 'youtubedl test video', -            'description': 'This is a test video for youtubedl.', -        } -    } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ec748ed9f..8881a8a23 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals  import os  import re +import sys  from .common import InfoExtractor  from .youtube import YoutubeIE @@ -231,6 +232,22 @@ class GenericIE(InfoExtractor):              }          },          { +            # redirect in Refresh HTTP header +            'url': 'https://www.facebook.com/l.php?u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&h=TAQHsoToz&enc=AZN16h-b6o4Zq9pZkCCdOLNKMN96BbGMNtcFwHSaazus4JHT_MFYkAA-WARTX2kvsCIdlAIyHZjl6d33ILIJU7Jzwk_K3mcenAXoAzBNoZDI_Q7EXGDJnIhrGkLXo_LJ_pAa2Jzbx17UHMd3jAs--6j2zaeto5w9RTn8T_1kKg3fdC5WPX9Dbb18vzH7YFX0eSJmoa6SP114rvlkw6pkS1-T&s=1', +            'info_dict': { +                'id': 'pO8h3EaFRdo', +                'ext': 'mp4', +                'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set', +                'description': 'md5:6294cc1af09c4049e0652b51a2df10d5', +                'upload_date': '20150917', +                'uploader_id': 'brtvofficial', +                'uploader': 'Boiler Room', +            }, +            'params': { +                'skip_download': False, +            }, +        }, +        {              'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',              'md5': '85b90ccc9d73b4acd9138d3af4c27f89',              'info_dict': { @@ -1808,6 +1825,9 @@ class GenericIE(InfoExtractor):                  # Look also in Refresh HTTP header                  refresh_header = head_response.headers.get('Refresh')                  if refresh_header: +                    # In python 2 response HTTP headers are bytestrings +                    if sys.version_info < (3, 0) and isinstance(refresh_header, str): +                        refresh_header = refresh_header.decode('iso-8859-1')                      found = re.search(REDIRECT_REGEX, refresh_header)              if found:                  new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1))) diff --git a/youtube_dl/extractor/hostingbulk.py b/youtube_dl/extractor/hostingbulk.py deleted file mode 100644 index a3154cfde..000000000 --- a/youtube_dl/extractor/hostingbulk.py +++ /dev/null @@ -1,80 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( -    compat_urllib_request, -) -from ..utils import ( -    ExtractorError, -    int_or_none, -    urlencode_postdata, -) - - -class HostingBulkIE(InfoExtractor): -    _VALID_URL = r'''(?x) -        https?://(?:www\.)?hostingbulk\.com/ -        (?:embed-)?(?P<id>[A-Za-z0-9]{12})(?:-\d+x\d+)?\.html''' -    _FILE_DELETED_REGEX = r'<b>File Not Found</b>' -    _TEST = { -        'url': 'http://hostingbulk.com/n0ulw1hv20fm.html', -        'md5': '6c8653c8ecf7ebfa83b76e24b7b2fe3f', -        'info_dict': { -            'id': 'n0ulw1hv20fm', -            'ext': 'mp4', -            'title': 'md5:5afeba33f48ec87219c269e054afd622', -            'filesize': 6816081, -            'thumbnail': 're:^http://.*\.jpg$', -        } -    } - -    def _real_extract(self, url): -        video_id = self._match_id(url) -        url = 'http://hostingbulk.com/{0:}.html'.format(video_id) - -        # Custom request with cookie to set language to English, so our file -        # deleted regex would work. -        request = compat_urllib_request.Request( -            url, headers={'Cookie': 'lang=english'}) -        webpage = self._download_webpage(request, video_id) - -        if re.search(self._FILE_DELETED_REGEX, webpage) is not None: -            raise ExtractorError('Video %s does not exist' % video_id, -                                 expected=True) - -        title = self._html_search_regex(r'<h3>(.*?)</h3>', webpage, 'title') -        filesize = int_or_none( -            self._search_regex( -                r'<small>\((\d+)\sbytes?\)</small>', -                webpage, -                'filesize', -                fatal=False -            ) -        ) -        thumbnail = self._search_regex( -            r'<img src="([^"]+)".+?class="pic"', -            webpage, 'thumbnail', fatal=False) - -        fields = self._hidden_inputs(webpage) - -        request = compat_urllib_request.Request(url, urlencode_postdata(fields)) -        request.add_header('Content-type', 'application/x-www-form-urlencoded') -        response = self._request_webpage(request, video_id, -                                         'Submiting download request') -        video_url = response.geturl() - -        formats = [{ -            'format_id': 'sd', -            'filesize': filesize, -            'url': video_url, -        }] - -        return { -            'id': video_id, -            'title': title, -            'thumbnail': thumbnail, -            'formats': formats, -        } diff --git a/youtube_dl/extractor/iconosquare.py b/youtube_dl/extractor/iconosquare.py index 70e4c0d41..a39f422e9 100644 --- a/youtube_dl/extractor/iconosquare.py +++ b/youtube_dl/extractor/iconosquare.py @@ -1,7 +1,11 @@  from __future__ import unicode_literals  from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( +    int_or_none, +    get_element_by_id, +    remove_end, +)  class IconosquareIE(InfoExtractor): @@ -12,7 +16,7 @@ class IconosquareIE(InfoExtractor):          'info_dict': {              'id': '522207370455279102_24101272',              'ext': 'mp4', -            'title': 'Instagram media by @aguynamedpatrick (Patrick Janelle)', +            'title': 'Instagram photo by @aguynamedpatrick (Patrick Janelle)',              'description': 'md5:644406a9ec27457ed7aa7a9ebcd4ce3d',              'timestamp': 1376471991,              'upload_date': '20130814', @@ -29,8 +33,7 @@ class IconosquareIE(InfoExtractor):          webpage = self._download_webpage(url, video_id)          media = self._parse_json( -            self._search_regex( -                r'window\.media\s*=\s*({.+?});\n', webpage, 'media'), +            get_element_by_id('mediaJson', webpage),              video_id)          formats = [{ @@ -41,9 +44,7 @@ class IconosquareIE(InfoExtractor):          } for format_id, f in media['videos'].items()]          self._sort_formats(formats) -        title = self._html_search_regex( -            r'<title>(.+?)(?: *\(Videos?\))? \| (?:Iconosquare|Statigram)</title>', -            webpage, 'title') +        title = remove_end(self._og_search_title(webpage), ' - via Iconosquare')          timestamp = int_or_none(media.get('created_time') or media.get('caption', {}).get('created_time'))          description = media.get('caption', {}).get('text') @@ -61,6 +62,14 @@ class IconosquareIE(InfoExtractor):              'height': int_or_none(t.get('height'))          } for thumbnail_id, t in media.get('images', {}).items()] +        comments = [{ +            'id': comment.get('id'), +            'text': comment['text'], +            'timestamp': int_or_none(comment.get('created_time')), +            'author': comment.get('from', {}).get('full_name'), +            'author_id': comment.get('from', {}).get('username'), +        } for comment in media.get('comments', {}).get('data', []) if 'text' in comment] +          return {              'id': video_id,              'title': title, @@ -72,4 +81,5 @@ class IconosquareIE(InfoExtractor):              'comment_count': comment_count,              'like_count': like_count,              'formats': formats, +            'comments': comments,          } diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 393e67e35..ce1ab3820 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -95,6 +95,10 @@ class IqiyiIE(InfoExtractor):          ('10', 'h1'),      ] +    @staticmethod +    def md5_text(text): +        return hashlib.md5(text.encode('utf-8')).hexdigest() +      def construct_video_urls(self, data, video_id, _uuid):          def do_xor(x, y):              a = y % 3 @@ -121,7 +125,7 @@ class IqiyiIE(InfoExtractor):                  note='Download path key of segment %d for format %s' % (segment_index + 1, format_id)              )['t']              t = str(int(math.floor(int(tm) / (600.0)))) -            return hashlib.md5((t + mg + x).encode('utf8')).hexdigest() +            return self.md5_text(t + mg + x)          video_urls_dict = {}          for format_item in data['vp']['tkl'][0]['vs']: @@ -179,20 +183,19 @@ class IqiyiIE(InfoExtractor):      def get_raw_data(self, tvid, video_id, enc_key, _uuid):          tm = str(int(time.time())) +        tail = tm + tvid          param = {              'key': 'fvip', -            'src': hashlib.md5(b'youtube-dl').hexdigest(), +            'src': self.md5_text('youtube-dl'),              'tvId': tvid,              'vid': video_id,              'vinfo': 1,              'tm': tm, -            'enc': hashlib.md5( -                (enc_key + tm + tvid).encode('utf8')).hexdigest(), +            'enc': self.md5_text((enc_key + tail)[1:64:2] + tail),              'qyid': _uuid,              'tn': random.random(),              'um': 0, -            'authkey': hashlib.md5( -                (tm + tvid).encode('utf8')).hexdigest() +            'authkey': self.md5_text(self.md5_text('') + tail),          }          api_url = 'http://cache.video.qiyi.com/vms' + '?' + \ @@ -201,7 +204,8 @@ class IqiyiIE(InfoExtractor):          return raw_data      def get_enc_key(self, swf_url, video_id): -        enc_key = '3601ba290e4f4662848c710e2122007e'  # last update at 2015-08-10 for Zombie +        # TODO: automatic key extraction +        enc_key = 'eac64f22daf001da6ba9aa8da4d501508bbe90a4d4091fea3b0582a85b38c2cc'  # last update at 2015-09-23-23 for Zombie::bite          return enc_key      def _real_extract(self, url): diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 852d72266..54993e2c9 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -1,74 +1,85 @@  from __future__ import unicode_literals -import json -  from .common import InfoExtractor -from ..compat import ( -    compat_urllib_parse, -    compat_urllib_parse_unquote, -    compat_urlparse, -) +from ..compat import compat_urllib_parse  from ..utils import ( +    encode_dict,      get_element_by_attribute, -    parse_duration, -    strip_jsonp, +    int_or_none,  )  class MiTeleIE(InfoExtractor): -    IE_NAME = 'mitele.es' +    IE_DESC = 'mitele.es'      _VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/'      _TESTS = [{          'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', +        'md5': 'ace7635b2a0b286aaa37d3ff192d2a8a',          'info_dict': { -            'id': '0fce117d', -            'ext': 'mp4', -            'title': 'Programa 144 - Tor, la web invisible', -            'description': 'md5:3b6fce7eaa41b2d97358726378d9369f', +            'id': '0NF1jJnxS1Wu3pHrmvFyw2',              'display_id': 'programa-144', +            'ext': 'flv', +            'title': 'Tor, la web invisible', +            'description': 'md5:3b6fce7eaa41b2d97358726378d9369f', +            'thumbnail': 're:(?i)^https?://.*\.jpg$',              'duration': 2913,          }, -        'params': { -            # m3u8 download -            'skip_download': True, -        },      }]      def _real_extract(self, url): -        episode = self._match_id(url) -        webpage = self._download_webpage(url, episode) -        embed_data_json = self._search_regex( -            r'(?s)MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data', -        ).replace('\'', '"') -        embed_data = json.loads(embed_data_json) +        display_id = self._match_id(url) + +        webpage = self._download_webpage(url, display_id) -        domain = embed_data['mediaUrl'] -        if not domain.startswith('http'): -            # only happens in telecinco.es videos -            domain = 'http://' + domain -        info_url = compat_urlparse.urljoin( -            domain, -            compat_urllib_parse_unquote(embed_data['flashvars']['host']) -        ) -        info_el = self._download_xml(info_url, episode).find('./video/info') +        config_url = self._search_regex( +            r'data-config\s*=\s*"([^"]+)"', webpage, 'data config url') -        video_link = info_el.find('videoUrl/link').text -        token_query = compat_urllib_parse.urlencode({'id': video_link}) -        token_info = self._download_json( -            embed_data['flashvars']['ov_tk'] + '?' + token_query, -            episode, -            transform_source=strip_jsonp -        ) -        formats = self._extract_m3u8_formats( -            token_info['tokenizedUrl'], episode, ext='mp4') +        config = self._download_json( +            config_url, display_id, 'Downloading config JSON') + +        mmc = self._download_json( +            config['services']['mmc'], display_id, 'Downloading mmc JSON') + +        formats = [] +        for location in mmc['locations']: +            gat = self._proto_relative_url(location.get('gat'), 'http:') +            bas = location.get('bas') +            loc = location.get('loc') +            ogn = location.get('ogn') +            if None in (gat, bas, loc, ogn): +                continue +            token_data = { +                'bas': bas, +                'icd': loc, +                'ogn': ogn, +                'sta': '0', +            } +            media = self._download_json( +                '%s/?%s' % (gat, compat_urllib_parse.urlencode(encode_dict(token_data)).encode('utf-8')), +                display_id, 'Downloading %s JSON' % location['loc']) +            file_ = media.get('file') +            if not file_: +                continue +            formats.extend(self._extract_f4m_formats( +                file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', +                display_id, f4m_id=loc)) + +        title = self._search_regex( +            r'class="Destacado-text"[^>]*>\s*<strong>([^<]+)</strong>', webpage, 'title') + +        video_id = self._search_regex( +            r'data-media-id\s*=\s*"([^"]+)"', webpage, +            'data media id', default=None) or display_id +        thumbnail = config.get('poster', {}).get('imageUrl') +        duration = int_or_none(mmc.get('duration'))          return { -            'id': embed_data['videoId'], -            'display_id': episode, -            'title': info_el.find('title').text, -            'formats': formats, +            'id': video_id, +            'display_id': display_id, +            'title': title,              'description': get_element_by_attribute('class', 'text', webpage), -            'thumbnail': info_el.find('thumb').text, -            'duration': parse_duration(info_el.find('duration').text), +            'thumbnail': thumbnail, +            'duration': duration, +            'formats': formats,          } diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 79a13958b..e3cc6fde8 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -1,130 +1,380 @@ -# encoding: utf-8 +# coding: utf-8  from __future__ import unicode_literals  import re  from .common import InfoExtractor  from ..utils import ( -    ExtractorError, +    determine_ext,      int_or_none, +    parse_iso8601,      qualities, -    parse_duration,  )  class NDRBaseIE(InfoExtractor):      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        display_id = self._match_id(url) +        webpage = self._download_webpage(url, display_id) +        return self._extract_embed(webpage, display_id) + -        page = self._download_webpage(url, video_id, 'Downloading page') +class NDRIE(NDRBaseIE): +    IE_NAME = 'ndr' +    IE_DESC = 'NDR.de - Norddeutscher Rundfunk' +    _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)+(?P<id>[^/?#]+),[\da-z]+\.html' +    _TESTS = [{ +        # httpVideo, same content id +        'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', +        'md5': '6515bc255dc5c5f8c85bbc38e035a659', +        'info_dict': { +            'id': 'hafengeburtstag988', +            'display_id': 'Party-Poette-und-Parade', +            'ext': 'mp4', +            'title': 'Party, Pötte und Parade', +            'description': 'md5:ad14f9d2f91d3040b6930c697e5f6b4c', +            'uploader': 'ndrtv', +            'timestamp': 1431108900, +            'upload_date': '20150510', +            'duration': 3498, +        }, +        'params': { +            'skip_download': True, +        }, +    }, { +        # httpVideo, different content id +        'url': 'http://www.ndr.de/sport/fussball/40-Osnabrueck-spielt-sich-in-einen-Rausch,osna270.html', +        'md5': '1043ff203eab307f0c51702ec49e9a71', +        'info_dict': { +            'id': 'osna272', +            'display_id': '40-Osnabrueck-spielt-sich-in-einen-Rausch', +            'ext': 'mp4', +            'title': 'Osnabrück - Wehen Wiesbaden: Die Highlights', +            'description': 'md5:32e9b800b3d2d4008103752682d5dc01', +            'uploader': 'ndrtv', +            'timestamp': 1442059200, +            'upload_date': '20150912', +            'duration': 510, +        }, +        'params': { +            'skip_download': True, +        }, +    }, { +        # httpAudio, same content id +        'url': 'http://www.ndr.de/info/La-Valette-entgeht-der-Hinrichtung,audio51535.html', +        'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', +        'info_dict': { +            'id': 'audio51535', +            'display_id': 'La-Valette-entgeht-der-Hinrichtung', +            'ext': 'mp3', +            'title': 'La Valette entgeht der Hinrichtung', +            'description': 'md5:22f9541913a40fe50091d5cdd7c9f536', +            'uploader': 'ndrinfo', +            'timestamp': 1290626100, +            'upload_date': '20140729', +            'duration': 884, +        }, +        'params': { +            'skip_download': True, +        }, +    }] -        title = self._og_search_title(page).strip() -        description = self._og_search_description(page) -        if description: -            description = description.strip() +    def _extract_embed(self, webpage, display_id): +        embed_url = self._html_search_meta( +            'embedURL', webpage, 'embed URL', fatal=True) +        description = self._search_regex( +            r'<p[^>]+itemprop="description">([^<]+)</p>', +            webpage, 'description', fatal=False) +        timestamp = parse_iso8601( +            self._search_regex( +                r'<span itemprop="datePublished" content="([^"]+)">', +                webpage, 'upload date', fatal=False)) +        return { +            '_type': 'url_transparent', +            'url': embed_url, +            'display_id': display_id, +            'description': description, +            'timestamp': timestamp, +        } -        duration = int_or_none(self._html_search_regex(r'duration: (\d+),\n', page, 'duration', default=None)) -        if not duration: -            duration = parse_duration(self._html_search_regex( -                r'(<span class="min">\d+</span>:<span class="sec">\d+</span>)', -                page, 'duration', default=None)) -        formats = [] +class NJoyIE(NDRBaseIE): +    IE_NAME = 'njoy' +    IE_DESC = 'N-JOY' +    _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)+(?P<id>[^/?#]+),[\da-z]+\.html' +    _TESTS = [{ +        # httpVideo, same content id +        'url': 'http://www.n-joy.de/entertainment/comedy/comedy_contest/Benaissa-beim-NDR-Comedy-Contest,comedycontest2480.html', +        'md5': 'cb63be60cd6f9dd75218803146d8dc67', +        'info_dict': { +            'id': 'comedycontest2480', +            'display_id': 'Benaissa-beim-NDR-Comedy-Contest', +            'ext': 'mp4', +            'title': 'Benaissa beim NDR Comedy Contest', +            'description': 'md5:f057a6c4e1c728b10d33b5ffd36ddc39', +            'uploader': 'ndrtv', +            'upload_date': '20141129', +            'duration': 654, +        }, +        'params': { +            'skip_download': True, +        }, +    }, { +        # httpVideo, different content id +        'url': 'http://www.n-joy.de/musik/Das-frueheste-DJ-Set-des-Nordens-live-mit-Felix-Jaehn-,felixjaehn168.html', +        'md5': '417660fffa90e6df2fda19f1b40a64d8', +        'info_dict': { +            'id': 'dockville882', +            'display_id': 'Das-frueheste-DJ-Set-des-Nordens-live-mit-Felix-Jaehn-', +            'ext': 'mp4', +            'title': '"Ich hab noch nie" mit Felix Jaehn', +            'description': 'md5:85dd312d53be1b99e1f998a16452a2f3', +            'uploader': 'njoy', +            'upload_date': '20150822', +            'duration': 211, +        }, +        'params': { +            'skip_download': True, +        }, +    }] + +    def _extract_embed(self, webpage, display_id): +        video_id = self._search_regex( +            r'<iframe[^>]+id="pp_([\da-z]+)"', webpage, 'embed id') +        description = self._search_regex( +            r'<div[^>]+class="subline"[^>]*>[^<]+</div>\s*<p>([^<]+)</p>', +            webpage, 'description', fatal=False) +        return { +            '_type': 'url_transparent', +            'ie_key': 'NDREmbedBase', +            'url': 'ndr:%s' % video_id, +            'display_id': display_id, +            'description': description, +        } -        mp3_url = re.search(r'''\{src:'(?P<audio>[^']+)', type:"audio/mp3"},''', page) -        if mp3_url: -            formats.append({ -                'url': mp3_url.group('audio'), -                'format_id': 'mp3', -            }) -        thumbnail = None +class NDREmbedBaseIE(InfoExtractor): +    IE_NAME = 'ndr:embed:base' +    _VALID_URL = r'(?:ndr:(?P<id_s>[\da-z]+)|https?://www\.ndr\.de/(?P<id>[\da-z]+)-ppjson\.json)' +    _TESTS = [{ +        'url': 'ndr:soundcheck3366', +        'only_matching': True, +    }, { +        'url': 'http://www.ndr.de/soundcheck3366-ppjson.json', +        'only_matching': True, +    }] -        video_url = re.search(r'''3: \{src:'(?P<video>.+?)\.(lo|hi|hq)\.mp4', type:"video/mp4"},''', page) -        if video_url: -            thumbnails = re.findall(r'''\d+: \{src: "([^"]+)"(?: \|\| '[^']+')?, quality: '([^']+)'}''', page) -            if thumbnails: -                quality_key = qualities(['xs', 's', 'm', 'l', 'xl']) -                largest = max(thumbnails, key=lambda thumb: quality_key(thumb[1])) -                thumbnail = 'http://www.ndr.de' + largest[0] +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') or mobj.group('id_s') -            for format_id in 'lo', 'hi', 'hq': -                formats.append({ -                    'url': '%s.%s.mp4' % (video_url.group('video'), format_id), -                    'format_id': format_id, -                }) +        ppjson = self._download_json( +            'http://www.ndr.de/%s-ppjson.json' % video_id, video_id) -        if not formats: -            raise ExtractorError('No media links available for %s' % video_id) +        playlist = ppjson['playlist'] + +        formats = [] +        quality_key = qualities(('xs', 's', 'm', 'l', 'xl')) + +        for format_id, f in playlist.items(): +            src = f.get('src') +            if not src: +                continue +            ext = determine_ext(src, None) +            if ext == 'f4m': +                formats.extend(self._extract_f4m_formats( +                    src + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, f4m_id='hds')) +            elif ext == 'm3u8': +                formats.extend(self._extract_m3u8_formats( +                    src, video_id, m3u8_id='hls', entry_protocol='m3u8_native')) +            else: +                quality = f.get('quality') +                ff = { +                    'url': src, +                    'format_id': quality or format_id, +                    'quality': quality_key(quality), +                } +                type_ = f.get('type') +                if type_ and type_.split('/')[0] == 'audio': +                    ff['vcodec'] = 'none' +                    ff['ext'] = ext or 'mp3' +                formats.append(ff) +        self._sort_formats(formats) + +        config = playlist['config'] + +        live = playlist.get('config', {}).get('streamType') in ['httpVideoLive', 'httpAudioLive'] +        title = config['title'] +        if live: +            title = self._live_title(title) +        uploader = ppjson.get('config', {}).get('branding') +        upload_date = ppjson.get('config', {}).get('publicationDate') +        duration = int_or_none(config.get('duration')) + +        thumbnails = [{ +            'id': thumbnail.get('quality') or thumbnail_id, +            'url': thumbnail['src'], +            'preference': quality_key(thumbnail.get('quality')), +        } for thumbnail_id, thumbnail in config.get('poster', {}).items() if thumbnail.get('src')]          return {              'id': video_id,              'title': title, -            'description': description, -            'thumbnail': thumbnail, +            'is_live': live, +            'uploader': uploader if uploader != '-' else None, +            'upload_date': upload_date[0:8] if upload_date else None,              'duration': duration, +            'thumbnails': thumbnails,              'formats': formats,          } -class NDRIE(NDRBaseIE): -    IE_NAME = 'ndr' -    IE_DESC = 'NDR.de - Mediathek' -    _VALID_URL = r'https?://www\.ndr\.de/.+?(?P<id>\d+)\.html' - -    _TESTS = [ -        { -            'url': 'http://www.ndr.de/fernsehen/sendungen/nordmagazin/Kartoffeltage-in-der-Lewitz,nordmagazin25866.html', -            'md5': '5bc5f5b92c82c0f8b26cddca34f8bb2c', -            'note': 'Video file', -            'info_dict': { -                'id': '25866', -                'ext': 'mp4', -                'title': 'Kartoffeltage in der Lewitz', -                'description': 'md5:48c4c04dde604c8a9971b3d4e3b9eaa8', -                'duration': 166, -            }, -            'skip': '404 Not found', -        }, -        { -            'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', -            'md5': 'dadc003c55ae12a5d2f6bd436cd73f59', -            'info_dict': { -                'id': '988', -                'ext': 'mp4', -                'title': 'Party, Pötte und Parade', -                'description': 'Hunderttausende feiern zwischen Speicherstadt und St. Pauli den 826. Hafengeburtstag. Die NDR Sondersendung zeigt die schönsten und spektakulärsten Bilder vom Auftakt.', -                'duration': 3498, -            }, -        }, -        { -            'url': 'http://www.ndr.de/info/audio51535.html', -            'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', -            'note': 'Audio file', -            'info_dict': { -                'id': '51535', -                'ext': 'mp3', -                'title': 'La Valette entgeht der Hinrichtung', -                'description': 'md5:22f9541913a40fe50091d5cdd7c9f536', -                'duration': 884, -            } -        } -    ] - +class NDREmbedIE(NDREmbedBaseIE): +    IE_NAME = 'ndr:embed' +    _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)+(?P<id>[\da-z]+)-(?:player|externalPlayer)\.html' +    _TESTS = [{ +        'url': 'http://www.ndr.de/fernsehen/sendungen/ndr_aktuell/ndraktuell28488-player.html', +        'md5': '8b9306142fe65bbdefb5ce24edb6b0a9', +        'info_dict': { +            'id': 'ndraktuell28488', +            'ext': 'mp4', +            'title': 'Norddeutschland begrüßt Flüchtlinge', +            'is_live': False, +            'uploader': 'ndrtv', +            'upload_date': '20150907', +            'duration': 132, +        }, +    }, { +        'url': 'http://www.ndr.de/ndr2/events/soundcheck/soundcheck3366-player.html', +        'md5': '002085c44bae38802d94ae5802a36e78', +        'info_dict': { +            'id': 'soundcheck3366', +            'ext': 'mp4', +            'title': 'Ella Henderson braucht Vergleiche nicht zu scheuen', +            'is_live': False, +            'uploader': 'ndr2', +            'upload_date': '20150912', +            'duration': 3554, +        }, +        'params': { +            'skip_download': True, +        }, +    }, { +        'url': 'http://www.ndr.de/info/audio51535-player.html', +        'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', +        'info_dict': { +            'id': 'audio51535', +            'ext': 'mp3', +            'title': 'La Valette entgeht der Hinrichtung', +            'is_live': False, +            'uploader': 'ndrinfo', +            'upload_date': '20140729', +            'duration': 884, +        }, +        'params': { +            'skip_download': True, +        }, +    }, { +        'url': 'http://www.ndr.de/fernsehen/sendungen/visite/visite11010-externalPlayer.html', +        'md5': 'ae57f80511c1e1f2fd0d0d3d31aeae7c', +        'info_dict': { +            'id': 'visite11010', +            'ext': 'mp4', +            'title': 'Visite - die ganze Sendung', +            'is_live': False, +            'uploader': 'ndrtv', +            'upload_date': '20150902', +            'duration': 3525, +        }, +        'params': { +            'skip_download': True, +        }, +    }, { +        # httpVideoLive +        'url': 'http://www.ndr.de/fernsehen/livestream/livestream217-externalPlayer.html', +        'info_dict': { +            'id': 'livestream217', +            'ext': 'flv', +            'title': 're:^NDR Fernsehen Niedersachsen \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', +            'is_live': True, +            'upload_date': '20150910', +        }, +        'params': { +            'skip_download': True, +        }, +    }, { +        'url': 'http://www.ndr.de/ndrkultur/audio255020-player.html', +        'only_matching': True, +    }, { +        'url': 'http://www.ndr.de/fernsehen/sendungen/nordtour/nordtour7124-player.html', +        'only_matching': True, +    }, { +        'url': 'http://www.ndr.de/kultur/film/videos/videoimport10424-player.html', +        'only_matching': True, +    }, { +        'url': 'http://www.ndr.de/fernsehen/sendungen/hamburg_journal/hamj43006-player.html', +        'only_matching': True, +    }, { +        'url': 'http://www.ndr.de/fernsehen/sendungen/weltbilder/weltbilder4518-player.html', +        'only_matching': True, +    }, { +        'url': 'http://www.ndr.de/fernsehen/doku952-player.html', +        'only_matching': True, +    }] -class NJoyIE(NDRBaseIE): -    IE_NAME = 'N-JOY' -    _VALID_URL = r'https?://www\.n-joy\.de/.+?(?P<id>\d+)\.html' -    _TEST = { -        'url': 'http://www.n-joy.de/entertainment/comedy/comedy_contest/Benaissa-beim-NDR-Comedy-Contest,comedycontest2480.html', -        'md5': 'cb63be60cd6f9dd75218803146d8dc67', +class NJoyEmbedIE(NDREmbedBaseIE): +    IE_NAME = 'njoy:embed' +    _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)+(?P<id>[\da-z]+)-(?:player|externalPlayer)_[^/]+\.html' +    _TESTS = [{ +        # httpVideo +        'url': 'http://www.n-joy.de/events/reeperbahnfestival/doku948-player_image-bc168e87-5263-4d6d-bd27-bb643005a6de_theme-n-joy.html', +        'md5': '8483cbfe2320bd4d28a349d62d88bd74',          'info_dict': { -            'id': '2480', +            'id': 'doku948',              'ext': 'mp4', -            'title': 'Benaissa beim NDR Comedy Contest', -            'description': 'Von seinem sehr "behaarten" Leben lässt sich Benaissa trotz aller Schwierigkeiten nicht unterkriegen.', -            'duration': 654, -        } -    } +            'title': 'Zehn Jahre Reeperbahn Festival - die Doku', +            'is_live': False, +            'upload_date': '20150807', +            'duration': 1011, +        }, +    }, { +        # httpAudio +        'url': 'http://www.n-joy.de/news_wissen/stefanrichter100-player_image-d5e938b1-f21a-4b9a-86b8-aaba8bca3a13_theme-n-joy.html', +        'md5': 'd989f80f28ac954430f7b8a48197188a', +        'info_dict': { +            'id': 'stefanrichter100', +            'ext': 'mp3', +            'title': 'Interview mit einem Augenzeugen', +            'is_live': False, +            'uploader': 'njoy', +            'upload_date': '20150909', +            'duration': 140, +        }, +        'params': { +            'skip_download': True, +        }, +    }, { +        # httpAudioLive, no explicit ext +        'url': 'http://www.n-joy.de/news_wissen/webradioweltweit100-player_image-3fec0484-2244-4565-8fb8-ed25fd28b173_theme-n-joy.html', +        'info_dict': { +            'id': 'webradioweltweit100', +            'ext': 'mp3', +            'title': 're:^N-JOY Weltweit \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', +            'is_live': True, +            'uploader': 'njoy', +            'upload_date': '20150810', +        }, +        'params': { +            'skip_download': True, +        }, +    }, { +        'url': 'http://www.n-joy.de/musik/dockville882-player_image-3905259e-0803-4764-ac72-8b7de077d80a_theme-n-joy.html', +        'only_matching': True, +    }, { +        'url': 'http://www.n-joy.de/radio/sendungen/morningshow/urlaubsfotos190-player_image-066a5df1-5c95-49ec-a323-941d848718db_theme-n-joy.html', +        'only_matching': True, +    }, { +        'url': 'http://www.n-joy.de/entertainment/comedy/krudetv290-player_image-ab261bfe-51bf-4bf3-87ba-c5122ee35b3d_theme-n-joy.html', +        'only_matching': True, +    }] diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py index dc54634a5..55dc6107d 100644 --- a/youtube_dl/extractor/nfl.py +++ b/youtube_dl/extractor/nfl.py @@ -16,53 +16,104 @@ from ..utils import (  class NFLIE(InfoExtractor):      IE_NAME = 'nfl.com' -    _VALID_URL = r'''(?x)https?:// -        (?P<host>(?:www\.)?(?:nfl\.com|.*?\.clubs\.nfl\.com))/ -        (?:.+?/)* -        (?P<id>(?:[a-z0-9]{16}|\w{8}\-(?:\w{4}\-){3}\w{12}))''' -    _TESTS = [ -        { -            'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights', -            'md5': '394ef771ddcd1354f665b471d78ec4c6', -            'info_dict': { -                'id': '0ap3000000398478', -                'ext': 'mp4', -                'title': 'Week 3: Redskins vs. Eagles highlights', -                'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478', -                'upload_date': '20140921', -                'timestamp': 1411337580, -                'thumbnail': 're:^https?://.*\.jpg$', -            } -        }, -        { -            'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266', -            'md5': 'cf85bdb4bc49f6e9d3816d130c78279c', -            'info_dict': { -                'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266', -                'ext': 'mp4', -                'title': 'LIVE: Post Game vs. Browns', -                'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8', -                'upload_date': '20131229', -                'timestamp': 1388354455, -                'thumbnail': 're:^https?://.*\.jpg$', -            } +    _VALID_URL = r'''(?x) +                    https?:// +                        (?P<host> +                            (?:www\.)? +                            (?: +                                (?: +                                    nfl| +                                    buffalobills| +                                    miamidolphins| +                                    patriots| +                                    newyorkjets| +                                    baltimoreravens| +                                    bengals| +                                    clevelandbrowns| +                                    steelers| +                                    houstontexans| +                                    colts| +                                    jaguars| +                                    titansonline| +                                    denverbroncos| +                                    kcchiefs| +                                    raiders| +                                    chargers| +                                    dallascowboys| +                                    giants| +                                    philadelphiaeagles| +                                    redskins| +                                    chicagobears| +                                    detroitlions| +                                    packers| +                                    vikings| +                                    atlantafalcons| +                                    panthers| +                                    neworleanssaints| +                                    buccaneers| +                                    azcardinals| +                                    stlouisrams| +                                    49ers| +                                    seahawks +                                )\.com| +                                .+?\.clubs\.nfl\.com +                            ) +                        )/ +                        (?:.+?/)* +                        (?P<id>[^/#?&]+) +                    ''' +    _TESTS = [{ +        'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights', +        'md5': '394ef771ddcd1354f665b471d78ec4c6', +        'info_dict': { +            'id': '0ap3000000398478', +            'ext': 'mp4', +            'title': 'Week 3: Redskins vs. Eagles highlights', +            'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478', +            'upload_date': '20140921', +            'timestamp': 1411337580, +            'thumbnail': 're:^https?://.*\.jpg$', +        } +    }, { +        'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266', +        'md5': 'cf85bdb4bc49f6e9d3816d130c78279c', +        'info_dict': { +            'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266', +            'ext': 'mp4', +            'title': 'LIVE: Post Game vs. Browns', +            'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8', +            'upload_date': '20131229', +            'timestamp': 1388354455, +            'thumbnail': 're:^https?://.*\.jpg$', +        } +    }, { +        'url': 'http://www.nfl.com/news/story/0ap3000000467586/article/patriots-seahawks-involved-in-lategame-skirmish', +        'info_dict': { +            'id': '0ap3000000467607', +            'ext': 'mp4', +            'title': 'Frustrations flare on the field', +            'description': 'Emotions ran high at the end of the Super Bowl on both sides of the ball after a dramatic finish.', +            'timestamp': 1422850320, +            'upload_date': '20150202',          }, -        { -            'url': 'http://www.nfl.com/news/story/0ap3000000467586/article/patriots-seahawks-involved-in-lategame-skirmish', -            'info_dict': { -                'id': '0ap3000000467607', -                'ext': 'mp4', -                'title': 'Frustrations flare on the field', -                'description': 'Emotions ran high at the end of the Super Bowl on both sides of the ball after a dramatic finish.', -                'timestamp': 1422850320, -                'upload_date': '20150202', -            }, +    }, { +        'url': 'http://www.patriots.com/video/2015/09/18/10-days-gillette', +        'md5': '4c319e2f625ffd0b481b4382c6fc124c', +        'info_dict': { +            'id': 'n-238346', +            'ext': 'mp4', +            'title': '10 Days at Gillette', +            'description': 'md5:8cd9cd48fac16de596eadc0b24add951', +            'timestamp': 1442618809, +            'upload_date': '20150918',          }, -        { -            'url': 'http://www.nfl.com/videos/nfl-network-top-ten/09000d5d810a6bd4/Top-10-Gutsiest-Performances-Jack-Youngblood', -            'only_matching': True, -        } -    ] +    }, { +        'url': 'http://www.nfl.com/videos/nfl-network-top-ten/09000d5d810a6bd4/Top-10-Gutsiest-Performances-Jack-Youngblood', +        'only_matching': True, +    }, { +        'url': 'http://www.buffalobills.com/video/videos/Rex_Ryan_Show_World_Wide_Rex/b1dcfab2-3190-4bb1-bfc0-d6e603d6601a', +        'only_matching': True, +    }]      @staticmethod      def prepend_host(host, url): @@ -95,13 +146,14 @@ class NFLIE(InfoExtractor):          webpage = self._download_webpage(url, video_id)          config_url = NFLIE.prepend_host(host, self._search_regex( -            r'(?:config|configURL)\s*:\s*"([^"]+)"', webpage, 'config URL', -            default='static/content/static/config/video/config.json')) +            r'(?:(?:config|configURL)\s*:\s*|<nflcs:avplayer[^>]+data-config\s*=\s*)(["\'])(?P<config>.+?)\1', +            webpage, 'config URL', default='static/content/static/config/video/config.json', +            group='config'))          # For articles, the id in the url is not the video id          video_id = self._search_regex( -            r'contentId\s*:\s*"([^"]+)"', webpage, 'video id', default=video_id) -        config = self._download_json(config_url, video_id, -                                     note='Downloading player config') +            r'(?:<nflcs:avplayer[^>]+data-contentId\s*=\s*|contentId\s*:\s*)(["\'])(?P<id>.+?)\1', +            webpage, 'video id', default=video_id, group='id') +        config = self._download_json(config_url, video_id, 'Downloading player config')          url_template = NFLIE.prepend_host(              host, '{contentURLTemplate:}'.format(**config))          video_data = self._download_json( diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py index 279b18386..e98a5ef89 100644 --- a/youtube_dl/extractor/nhl.py +++ b/youtube_dl/extractor/nhl.py @@ -72,7 +72,7 @@ class NHLBaseInfoExtractor(InfoExtractor):  class NHLIE(NHLBaseInfoExtractor):      IE_NAME = 'nhl.com' -    _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/(?:console)?(?:\?(?:.*?[?&])?)(?:id|hlg)=(?P<id>[-0-9a-zA-Z,]+)' +    _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/(?:console|embed)?(?:\?(?:.*?[?&])?)(?:id|hlg|playlist)=(?P<id>[-0-9a-zA-Z,]+)'      _TESTS = [{          'url': 'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614', @@ -136,6 +136,9 @@ class NHLIE(NHLBaseInfoExtractor):          'params': {              'skip_download': True,  # Requires rtmpdump          } +    }, { +        'url': 'http://video.nhl.com/videocenter/embed?playlist=836127', +        'only_matching': True,      }]      def _real_extract(self, url): @@ -146,9 +149,9 @@ class NHLIE(NHLBaseInfoExtractor):  class NHLNewsIE(NHLBaseInfoExtractor):      IE_NAME = 'nhl.com:news'      IE_DESC = 'NHL news' -    _VALID_URL = r'https?://(?:www\.)?nhl\.com/ice/news\.html?(?:\?(?:.*?[?&])?)id=(?P<id>[-0-9a-zA-Z]+)' +    _VALID_URL = r'https?://(?:.+?\.)?nhl\.com/(?:ice|club)/news\.html?(?:\?(?:.*?[?&])?)id=(?P<id>[-0-9a-zA-Z]+)' -    _TEST = { +    _TESTS = [{          'url': 'http://www.nhl.com/ice/news.htm?id=750727',          'md5': '4b3d1262e177687a3009937bd9ec0be8',          'info_dict': { @@ -159,13 +162,26 @@ class NHLNewsIE(NHLBaseInfoExtractor):              'duration': 37,              'upload_date': '20150128',          }, -    } +    }, { +        # iframe embed +        'url': 'http://sabres.nhl.com/club/news.htm?id=780189', +        'md5': '9f663d1c006c90ac9fb82777d4294e12', +        'info_dict': { +            'id': '836127', +            'ext': 'mp4', +            'title': 'Morning Skate: OTT vs. BUF (9/23/15)', +            'description': "Brian Duff chats with Tyler Ennis prior to Buffalo's first preseason home game.", +            'duration': 93, +            'upload_date': '20150923', +        }, +    }]      def _real_extract(self, url):          news_id = self._match_id(url)          webpage = self._download_webpage(url, news_id)          video_id = self._search_regex( -            [r'pVid(\d+)', r"nlid\s*:\s*'(\d+)'"], +            [r'pVid(\d+)', r"nlid\s*:\s*'(\d+)'", +             r'<iframe[^>]+src=["\']https?://video.*?\.nhl\.com/videocenter/embed\?.*\bplaylist=(\d+)'],              webpage, 'video id')          return self._real_extract_video(video_id) diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py index 7f842b5c2..a06d38afd 100644 --- a/youtube_dl/extractor/ninegag.py +++ b/youtube_dl/extractor/ninegag.py @@ -1,7 +1,6 @@  from __future__ import unicode_literals  import re -import json  from .common import InfoExtractor  from ..utils import str_to_int @@ -9,61 +8,93 @@ from ..utils import str_to_int  class NineGagIE(InfoExtractor):      IE_NAME = '9gag' -    _VALID_URL = r'''(?x)^https?://(?:www\.)?9gag\.tv/ -        (?: -            v/(?P<numid>[0-9]+)| -            p/(?P<id>[a-zA-Z0-9]+)/(?P<display_id>[^?#/]+) -        ) -    ''' +    _VALID_URL = r'https?://(?:www\.)?9gag(?:\.com/tv|\.tv)/(?:p|embed)/(?P<id>[a-zA-Z0-9]+)(?:/(?P<display_id>[^?#/]+))?'      _TESTS = [{ -        "url": "http://9gag.tv/v/1912", -        "info_dict": { -            "id": "1912", -            "ext": "mp4", -            "description": "This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)", -            "title": "\"People Are Awesome 2013\" Is Absolutely Awesome", +        'url': 'http://9gag.com/tv/p/Kk2X5/people-are-awesome-2013-is-absolutely-awesome', +        'info_dict': { +            'id': 'Kk2X5', +            'ext': 'mp4', +            'description': 'This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)', +            'title': '\"People Are Awesome 2013\" Is Absolutely Awesome',              'uploader_id': 'UCdEH6EjDKwtTe-sO2f0_1XA',              'uploader': 'CompilationChannel',              'upload_date': '20131110', -            "view_count": int, -            "thumbnail": "re:^https?://", +            'view_count': int,          }, -        'add_ie': ['Youtube'] +        'add_ie': ['Youtube'],      }, { -        'url': 'http://9gag.tv/p/KklwM/alternate-banned-opening-scene-of-gravity?ref=fsidebar', +        'url': 'http://9gag.com/tv/p/aKolP3',          'info_dict': { -            'id': 'KklwM', +            'id': 'aKolP3',              'ext': 'mp4', -            'display_id': 'alternate-banned-opening-scene-of-gravity', -            "description": "While Gravity was a pretty awesome movie already, YouTuber Krishna Shenoi came up with a way to improve upon it, introducing a much better solution to Sandra Bullock's seemingly endless tumble in space. The ending is priceless.", -            'title': "Banned Opening Scene Of \"Gravity\" That Changes The Whole Movie", -            'uploader': 'Krishna Shenoi', -            'upload_date': '20140401', -            'uploader_id': 'krishnashenoi93', +            'title': 'This Guy Travelled 11 countries In 44 days Just To Make This Amazing Video', +            'description': "I just saw more in 1 minute than I've seen in 1 year. This guy's video is epic!!", +            'uploader_id': 'rickmereki', +            'uploader': 'Rick Mereki', +            'upload_date': '20110803', +            'view_count': int,          }, +        'add_ie': ['Vimeo'], +    }, { +        'url': 'http://9gag.com/tv/p/KklwM', +        'only_matching': True, +    }, { +        'url': 'http://9gag.tv/p/Kk2X5', +        'only_matching': True, +    }, { +        'url': 'http://9gag.com/tv/embed/a5Dmvl', +        'only_matching': True,      }] +    _EXTERNAL_VIDEO_PROVIDER = { +        '1': { +            'url': '%s', +            'ie_key': 'Youtube', +        }, +        '2': { +            'url': 'http://player.vimeo.com/video/%s', +            'ie_key': 'Vimeo', +        }, +        '3': { +            'url': 'http://instagram.com/p/%s', +            'ie_key': 'Instagram', +        }, +        '4': { +            'url': 'http://vine.co/v/%s', +            'ie_key': 'Vine', +        }, +    } +      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('numid') or mobj.group('id') +        video_id = mobj.group('id')          display_id = mobj.group('display_id') or video_id          webpage = self._download_webpage(url, display_id) -        post_view = json.loads(self._html_search_regex( -            r'var postView = new app\.PostView\({\s*post:\s*({.+?}),\s*posts:\s*prefetchedCurrentPost', webpage, 'post view')) +        post_view = self._parse_json( +            self._search_regex( +                r'var\s+postView\s*=\s*new\s+app\.PostView\({\s*post:\s*({.+?})\s*,\s*posts:\s*prefetchedCurrentPost', +                webpage, 'post view'), +            display_id) -        youtube_id = post_view['videoExternalId'] +        ie_key = None +        source_url = post_view.get('sourceUrl') +        if not source_url: +            external_video_id = post_view['videoExternalId'] +            external_video_provider = post_view['videoExternalProvider'] +            source_url = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['url'] % external_video_id +            ie_key = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['ie_key']          title = post_view['title'] -        description = post_view['description'] -        view_count = str_to_int(post_view['externalView']) +        description = post_view.get('description') +        view_count = str_to_int(post_view.get('externalView'))          thumbnail = post_view.get('thumbnail_700w') or post_view.get('ogImageUrl') or post_view.get('thumbnail_300w')          return {              '_type': 'url_transparent', -            'url': youtube_id, -            'ie_key': 'Youtube', +            'url': source_url, +            'ie_key': ie_key,              'id': video_id,              'display_id': display_id,              'title': title, diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py index 6b2f3f55a..b97f62fdb 100644 --- a/youtube_dl/extractor/nowness.py +++ b/youtube_dl/extractor/nowness.py @@ -1,64 +1,134 @@  # encoding: utf-8  from __future__ import unicode_literals -import re -  from .brightcove import BrightcoveIE  from .common import InfoExtractor  from ..utils import ExtractorError +from ..compat import ( +    compat_str, +    compat_urllib_request, +) + + +class NownessBaseIE(InfoExtractor): +    def _extract_url_result(self, post): +        if post['type'] == 'video': +            for media in post['media']: +                if media['type'] == 'video': +                    video_id = media['content'] +                    source = media['source'] +                    if source == 'brightcove': +                        player_code = self._download_webpage( +                            'http://www.nowness.com/iframe?id=%s' % video_id, video_id, +                            note='Downloading player JavaScript', +                            errnote='Unable to download player JavaScript') +                        bc_url = BrightcoveIE._extract_brightcove_url(player_code) +                        if bc_url is None: +                            raise ExtractorError('Could not find player definition') +                        return self.url_result(bc_url, 'Brightcove') +                    elif source == 'vimeo': +                        return self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo') +                    elif source == 'youtube': +                        return self.url_result(video_id, 'Youtube') +                    elif source == 'cinematique': +                        # youtube-dl currently doesn't support cinematique +                        # return self.url_result('http://cinematique.com/embed/%s' % video_id, 'Cinematique') +                        pass +    def _api_request(self, url, request_path): +        display_id = self._match_id(url) +        request = compat_urllib_request.Request( +            'http://api.nowness.com/api/' + request_path % display_id, +            headers={ +                'X-Nowness-Language': 'zh-cn' if 'cn.nowness.com' in url else 'en-us', +            }) +        return display_id, self._download_json(request, display_id) -class NownessIE(InfoExtractor): -    _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/[^?#]*?/(?P<id>[0-9]+)/(?P<slug>[^/]+?)(?:$|[?#])' -    _TESTS = [ -        { -            'url': 'http://www.nowness.com/day/2013/6/27/3131/candor--the-art-of-gesticulation', -            'md5': '068bc0202558c2e391924cb8cc470676', -            'info_dict': { -                'id': '2520295746001', -                'ext': 'mp4', -                'title': 'Candor: The Art of Gesticulation', -                'description': 'Candor: The Art of Gesticulation', -                'thumbnail': 're:^https?://.*\.jpg', -                'uploader': 'Nowness', -            } +class NownessIE(NownessBaseIE): +    IE_NAME = 'nowness' +    _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/(?:story|(?:series|category)/[^/]+)/(?P<id>[^/]+?)(?:$|[?#])' +    _TESTS = [{ +        'url': 'https://www.nowness.com/story/candor-the-art-of-gesticulation', +        'md5': '068bc0202558c2e391924cb8cc470676', +        'info_dict': { +            'id': '2520295746001', +            'ext': 'mp4', +            'title': 'Candor: The Art of Gesticulation', +            'description': 'Candor: The Art of Gesticulation', +            'thumbnail': 're:^https?://.*\.jpg', +            'uploader': 'Nowness',          }, -        { -            'url': 'http://cn.nowness.com/day/2014/8/7/4069/kasper-bj-rke-ft-jaakko-eino-kalevi--tnr', -            'md5': 'e79cf125e387216f86b2e0a5b5c63aa3', -            'info_dict': { -                'id': '3716354522001', -                'ext': 'mp4', -                'title': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR', -                'description': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR', -                'thumbnail': 're:^https?://.*\.jpg', -                'uploader': 'Nowness', -            } +    }, { +        'url': 'https://cn.nowness.com/story/kasper-bjorke-ft-jaakko-eino-kalevi-tnr', +        'md5': 'e79cf125e387216f86b2e0a5b5c63aa3', +        'info_dict': { +            'id': '3716354522001', +            'ext': 'mp4', +            'title': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR', +            'description': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR', +            'thumbnail': 're:^https?://.*\.jpg', +            'uploader': 'Nowness',          }, -    ] +    }, { +        # vimeo +        'url': 'https://www.nowness.com/series/nowness-picks/jean-luc-godard-supercut', +        'md5': '9a5a6a8edf806407e411296ab6bc2a49', +        'info_dict': { +            'id': '130020913', +            'ext': 'mp4', +            'title': 'Bleu, Blanc, Rouge - A Godard Supercut', +            'description': 'md5:f0ea5f1857dffca02dbd37875d742cec', +            'thumbnail': 're:^https?://.*\.jpg', +            'upload_date': '20150607', +            'uploader': 'Cinema Sem Lei', +            'uploader_id': 'cinemasemlei', +        }, +    }]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('slug') +        _, post = self._api_request(url, 'post/getBySlug/%s') +        return self._extract_url_result(post) -        webpage = self._download_webpage(url, video_id) -        player_url = self._search_regex( -            r'"([^"]+/content/issue-[0-9.]+.js)"', webpage, 'player URL') -        real_id = self._search_regex( -            r'\sdata-videoId="([0-9]+)"', webpage, 'internal video ID') -        player_code = self._download_webpage( -            player_url, video_id, -            note='Downloading player JavaScript', -            errnote='Player download failed') -        player_code = player_code.replace("'+d+'", real_id) +class NownessPlaylistIE(NownessBaseIE): +    IE_NAME = 'nowness:playlist' +    _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/playlist/(?P<id>\d+)' +    _TEST = { +        'url': 'https://www.nowness.com/playlist/3286/i-guess-thats-why-they-call-it-the-blues', +        'info_dict': { +            'id': '3286', +        }, +        'playlist_mincount': 8, +    } -        bc_url = BrightcoveIE._extract_brightcove_url(player_code) -        if bc_url is None: -            raise ExtractorError('Could not find player definition') -        return { -            '_type': 'url', -            'url': bc_url, -            'ie_key': 'Brightcove', -        } +    def _real_extract(self, url): +        playlist_id, playlist = self._api_request(url, 'post?PlaylistId=%s') +        entries = [self._extract_url_result(item) for item in playlist['items']] +        return self.playlist_result(entries, playlist_id) + + +class NownessSeriesIE(NownessBaseIE): +    IE_NAME = 'nowness:series' +    _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/series/(?P<id>[^/]+?)(?:$|[?#])' +    _TEST = { +        'url': 'https://www.nowness.com/series/60-seconds', +        'info_dict': { +            'id': '60', +            'title': '60 Seconds', +            'description': 'One-minute wisdom in a new NOWNESS series', +        }, +        'playlist_mincount': 4, +    } + +    def _real_extract(self, url): +        display_id, series = self._api_request(url, 'series/getBySlug/%s') +        entries = [self._extract_url_result(post) for post in series['posts']] +        series_title = None +        series_description = None +        translations = series.get('translations', []) +        if translations: +            series_title = translations[0].get('title') or translations[0]['seoTitle'] +            series_description = translations[0].get('seoDescription') +        return self.playlist_result( +            entries, compat_str(series['id']), series_title, series_description) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 66520c2c5..ccc88cfb1 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals  from .common import InfoExtractor  from ..compat import compat_urllib_parse_unquote  from ..utils import ( +    ExtractorError,      unified_strdate,      int_or_none,      qualities, @@ -28,6 +29,7 @@ class OdnoklassnikiIE(InfoExtractor):              'like_count': int,              'age_limit': 0,          }, +        'skip': 'Video has been blocked',      }, {          # metadataUrl          'url': 'http://ok.ru/video/63567059965189-0', @@ -72,6 +74,12 @@ class OdnoklassnikiIE(InfoExtractor):          webpage = self._download_webpage(              'http://ok.ru/video/%s' % video_id, video_id) +        error = self._search_regex( +            r'[^>]+class="vp_video_stub_txt"[^>]*>([^<]+)<', +            webpage, 'error', default=None) +        if error: +            raise ExtractorError(error, expected=True) +          player = self._parse_json(              unescapeHTML(self._search_regex(                  r'data-options=(?P<quote>["\'])(?P<player>{.+?%s.+?})(?P=quote)' % video_id, diff --git a/youtube_dl/extractor/openfilm.py b/youtube_dl/extractor/openfilm.py deleted file mode 100644 index d2ceedd01..000000000 --- a/youtube_dl/extractor/openfilm.py +++ /dev/null @@ -1,70 +0,0 @@ -from __future__ import unicode_literals - -import json - -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote_plus -from ..utils import ( -    parse_iso8601, -    parse_age_limit, -    int_or_none, -) - - -class OpenFilmIE(InfoExtractor): -    _VALID_URL = r'http://(?:www\.)openfilm\.com/videos/(?P<id>.+)' -    _TEST = { -        'url': 'http://www.openfilm.com/videos/human-resources-remastered', -        'md5': '42bcd88c2f3ec13b65edf0f8ad1cac37', -        'info_dict': { -            'id': '32736', -            'display_id': 'human-resources-remastered', -            'ext': 'mp4', -            'title': 'Human Resources (Remastered)', -            'description': 'Social Engineering in the 20th Century.', -            'thumbnail': 're:^https?://.*\.jpg$', -            'duration': 7164, -            'timestamp': 1334756988, -            'upload_date': '20120418', -            'uploader_id': '41117', -            'view_count': int, -            'age_limit': 0, -        }, -    } - -    def _real_extract(self, url): -        display_id = self._match_id(url) - -        webpage = self._download_webpage(url, display_id) - -        player = compat_urllib_parse_unquote_plus( -            self._og_search_video_url(webpage)) - -        video = json.loads(self._search_regex( -            r'\bp=({.+?})(?:&|$)', player, 'video JSON')) - -        video_url = '%s1.mp4' % video['location'] -        video_id = video.get('video_id') -        display_id = video.get('alias') or display_id -        title = video.get('title') -        description = video.get('description') -        thumbnail = video.get('main_thumb') -        duration = int_or_none(video.get('duration')) -        timestamp = parse_iso8601(video.get('dt_published'), ' ') -        uploader_id = video.get('user_id') -        view_count = int_or_none(video.get('views_count')) -        age_limit = parse_age_limit(video.get('age_limit')) - -        return { -            'id': video_id, -            'display_id': display_id, -            'url': video_url, -            'title': title, -            'description': description, -            'thumbnail': thumbnail, -            'duration': duration, -            'timestamp': timestamp, -            'uploader_id': uploader_id, -            'view_count': view_count, -            'age_limit': age_limit, -        } diff --git a/youtube_dl/extractor/playwire.py b/youtube_dl/extractor/playwire.py index bdc71017b..6d138ef25 100644 --- a/youtube_dl/extractor/playwire.py +++ b/youtube_dl/extractor/playwire.py @@ -19,7 +19,7 @@ class PlaywireIE(InfoExtractor):              'id': '3353705',              'ext': 'mp4',              'title': 'S04_RM_UCL_Rus', -            'thumbnail': 're:^http://.*\.png$', +            'thumbnail': 're:^https?://.*\.png$',              'duration': 145.94,          },      }, { diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 7b0cdc41a..a656ad85a 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -20,7 +20,7 @@ from ..aes import (  class PornHubIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)' +    _VALID_URL = r'https?://(?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)'      _TESTS = [{          'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',          'md5': '882f488fa1f0026f023f33576004a2ed', @@ -34,6 +34,9 @@ class PornHubIE(InfoExtractor):      }, {          'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',          'only_matching': True, +    }, { +        'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862', +        'only_matching': True,      }]      @classmethod diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 1631faf29..7ff1d06c4 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -5,6 +5,7 @@ import re  from .common import InfoExtractor  from ..compat import (      compat_urllib_parse, +    compat_urlparse,  )  from ..utils import (      parse_duration, @@ -72,6 +73,18 @@ class RaiIE(InfoExtractor):                  'description': 'Primo appuntamento con "Il candidato" con Filippo Timi, alias Piero Zucca presidente!',                  'uploader': 'RaiTre',              } +        }, +        { +            'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', +            'md5': '037104d2c14132887e5e4cf114569214', +            'info_dict': { +                'id': '0c7a664b-d0f4-4b2c-8835-3f82e46f433e', +                'ext': 'flv', +                'title': 'Il pacco', +                'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', +                'uploader': 'RaiTre', +                'upload_date': '20141221', +            },          }      ] @@ -90,11 +103,14 @@ class RaiIE(InfoExtractor):          relinker_url = self._extract_relinker_url(webpage)          if not relinker_url: -            iframe_path = self._search_regex( -                r'<iframe[^>]+src="/?(dl/[^"]+\?iframe\b[^"]*)"', +            iframe_url = self._search_regex( +                [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"', +                 r'drawMediaRaiTV\(["\'](.+?)["\']'],                  webpage, 'iframe') +            if not iframe_url.startswith('http'): +                iframe_url = compat_urlparse.urljoin(url, iframe_url)              webpage = self._download_webpage( -                '%s/%s' % (host, iframe_path), video_id) +                iframe_url, video_id)              relinker_url = self._extract_relinker_url(webpage)          relinker = self._download_json( diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py index 6e9903d5e..f76fb12c0 100644 --- a/youtube_dl/extractor/shahid.py +++ b/youtube_dl/extractor/shahid.py @@ -16,7 +16,7 @@ class ShahidIE(InfoExtractor):          'url': 'https://shahid.mbc.net/ar/episode/90574/%D8%A7%D9%84%D9%85%D9%84%D9%83-%D8%B9%D8%A8%D8%AF%D8%A7%D9%84%D9%84%D9%87-%D8%A7%D9%84%D8%A5%D9%86%D8%B3%D8%A7%D9%86-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-3.html',          'info_dict': {              'id': '90574', -            'ext': 'm3u8', +            'ext': 'mp4',              'title': 'الملك عبدالله الإنسان الموسم 1 كليب 3',              'description': 'الفيلم الوثائقي - الملك عبد الله الإنسان',              'duration': 2972, @@ -81,7 +81,7 @@ class ShahidIE(InfoExtractor):                  compat_urllib_parse.urlencode({                      'apiKey': 'sh@hid0nlin3',                      'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=', -                }).encode('utf-8')), +                })),              video_id, 'Downloading video JSON')          video = video[api_vars['playerType']] diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index ed5dcc0d3..2b60d354a 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -113,7 +113,7 @@ class SoundcloudIE(InfoExtractor):          },      ] -    _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28' +    _CLIENT_ID = '02gUJC0hH2ct1EGOcYXQIzRFU91c72Ea'      _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'      def report_resolve(self, video_id): diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index ae94f055c..2c8e9b941 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -1,24 +1,51 @@  # coding: utf-8  from __future__ import unicode_literals -from .mitele import MiTeleIE +import json +from .common import InfoExtractor +from ..compat import ( +    compat_urllib_parse, +    compat_urllib_parse_unquote, +    compat_urlparse, +) +from ..utils import ( +    get_element_by_attribute, +    parse_duration, +    strip_jsonp, +) -class TelecincoIE(MiTeleIE): -    IE_NAME = 'telecinco.es' -    _VALID_URL = r'https?://www\.telecinco\.es/(?:[^/]+/)+(?P<id>.+?)\.html' + +class TelecincoIE(InfoExtractor): +    IE_DESC = 'telecinco.es, cuatro.com and mediaset.es' +    _VALID_URL = r'https?://www\.(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html'      _TESTS = [{          'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html', +        'md5': '5cbef3ad5ef17bf0d21570332d140729',          'info_dict': {              'id': 'MDSVID20141015_0058',              'ext': 'mp4',              'title': 'Con Martín Berasategui, hacer un bacalao al ...',              'duration': 662,          }, -        'params': { -            # m3u8 download -            'skip_download': True, +    }, { +        'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html', +        'md5': '0a5b9f3cc8b074f50a0578f823a12694', +        'info_dict': { +            'id': 'MDSVID20150916_0128', +            'ext': 'mp4', +            'title': '¿Quién es este ex futbolista con el que hablan ...', +            'duration': 79, +        }, +    }, { +        'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html', +        'md5': 'ad1bfaaba922dd4a295724b05b68f86a', +        'info_dict': { +            'id': 'MDSVID20150513_0220', +            'ext': 'mp4', +            'title': '#DOYLACARA. Con la trata no hay trato', +            'duration': 50,          },      }, {          'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html', @@ -27,3 +54,41 @@ class TelecincoIE(MiTeleIE):          'url': 'http://www.telecinco.es/espanasinirmaslejos/Espana-gran-destino-turistico_2_1240605043.html',          'only_matching': True,      }] + +    def _real_extract(self, url): +        episode = self._match_id(url) +        webpage = self._download_webpage(url, episode) +        embed_data_json = self._search_regex( +            r'(?s)MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data', +        ).replace('\'', '"') +        embed_data = json.loads(embed_data_json) + +        domain = embed_data['mediaUrl'] +        if not domain.startswith('http'): +            # only happens in telecinco.es videos +            domain = 'http://' + domain +        info_url = compat_urlparse.urljoin( +            domain, +            compat_urllib_parse_unquote(embed_data['flashvars']['host']) +        ) +        info_el = self._download_xml(info_url, episode).find('./video/info') + +        video_link = info_el.find('videoUrl/link').text +        token_query = compat_urllib_parse.urlencode({'id': video_link}) +        token_info = self._download_json( +            embed_data['flashvars']['ov_tk'] + '?' + token_query, +            episode, +            transform_source=strip_jsonp +        ) +        formats = self._extract_m3u8_formats( +            token_info['tokenizedUrl'], episode, ext='mp4', entry_protocol='m3u8_native') + +        return { +            'id': embed_data['videoId'], +            'display_id': episode, +            'title': info_el.find('title').text, +            'formats': formats, +            'description': get_element_by_attribute('class', 'text', webpage), +            'thumbnail': info_el.find('thumb').text, +            'duration': parse_duration(info_el.find('duration').text), +        } diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index 84fe71aef..5f7ac4b35 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -2,14 +2,12 @@  from __future__ import unicode_literals -import re -import json -  from .common import InfoExtractor +from ..compat import compat_str  class TudouIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/.*?/(?P<id>[^/?#]+?)(?:\.html)?/?(?:$|[?#])' +    _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/([^/]+/)*(?P<id>[^/?#]+?)(?:\.html)?/?(?:$|[?#])'      _TESTS = [{          'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',          'md5': '140a49ed444bd22f93330985d8475fcb', @@ -27,41 +25,41 @@ class TudouIE(InfoExtractor):              'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012',              'thumbnail': 're:^https?://.*\.jpg$',          } +    }, { +        'url': 'http://www.tudou.com/albumplay/cJAHGih4yYg.html', +        'only_matching': True,      }]      _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf' -    def _url_for_id(self, id, quality=None): -        info_url = "http://v2.tudou.com/f?id=" + str(id) +    def _url_for_id(self, video_id, quality=None): +        info_url = 'http://v2.tudou.com/f?id=' + compat_str(video_id)          if quality:              info_url += '&hd' + quality -        webpage = self._download_webpage(info_url, id, "Opening the info webpage") -        final_url = self._html_search_regex('>(.+?)</f>', webpage, 'video url') +        xml_data = self._download_xml(info_url, video_id, "Opening the info XML page") +        final_url = xml_data.text          return final_url      def _real_extract(self, url):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) -        m = re.search(r'vcode:\s*[\'"](.+?)[\'"]', webpage) -        if m and m.group(1): -            return { -                '_type': 'url', -                'url': 'youku:' + m.group(1), -                'ie_key': 'Youku' -            } +        youku_vcode = self._search_regex( +            r'vcode\s*:\s*[\'"]([^\'"]*)[\'"]', webpage, 'youku vcode', default=None) +        if youku_vcode: +            return self.url_result('youku:' + youku_vcode, ie='Youku')          title = self._search_regex( -            r",kw:\s*['\"](.+?)[\"']", webpage, 'title') +            r',kw\s*:\s*[\'"]([^\'"]+)[\'"]', webpage, 'title')          thumbnail_url = self._search_regex( -            r",pic:\s*[\"'](.+?)[\"']", webpage, 'thumbnail URL', fatal=False) +            r',pic\s*:\s*[\'"]([^\'"]+)[\'"]', webpage, 'thumbnail URL', fatal=False)          player_url = self._search_regex( -            r"playerUrl\s*:\s*['\"](.+?\.swf)[\"']", +            r'playerUrl\s*:\s*[\'"]([^\'"]+\.swf)[\'"]',              webpage, 'player URL', default=self._PLAYER_URL) -        segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments') -        segments = json.loads(segs_json) +        segments = self._parse_json(self._search_regex( +            r'segs: \'([^\']+)\'', webpage, 'segments'), video_id)          # It looks like the keys are the arguments that have to be passed as          # the hd field in the request url, we pick the higher          # Also, filter non-number qualities (see issue #3643). diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 157bb74fe..078d283b2 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -1,10 +1,12 @@  from __future__ import unicode_literals  from .common import InfoExtractor +from ..compat import compat_HTTPError  from ..utils import ( +    ExtractorError,      int_or_none,      float_or_none, -    str_to_int, +    parse_iso8601,  ) @@ -12,18 +14,41 @@ class VidmeIE(InfoExtractor):      _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]+)'      _TESTS = [{          'url': 'https://vid.me/QNB', -        'md5': 'f42d05e7149aeaec5c037b17e5d3dc82', +        'md5': 'c62f1156138dc3323902188c5b5a8bd6',          'info_dict': {              'id': 'QNB',              'ext': 'mp4',              'title': 'Fishing for piranha - the easy way',              'description': 'source: https://www.facebook.com/photo.php?v=312276045600871', -            'duration': 119.92, +            'thumbnail': 're:^https?://.*\.jpg',              'timestamp': 1406313244,              'upload_date': '20140725', +            'age_limit': 0, +            'duration': 119.92, +            'view_count': int, +            'like_count': int, +            'comment_count': int, +        }, +    }, { +        'url': 'https://vid.me/Gc6M', +        'md5': 'f42d05e7149aeaec5c037b17e5d3dc82', +        'info_dict': { +            'id': 'Gc6M', +            'ext': 'mp4', +            'title': 'O Mere Dil ke chain - Arnav and Khushi VM',              'thumbnail': 're:^https?://.*\.jpg', +            'timestamp': 1441211642, +            'upload_date': '20150902', +            'uploader': 'SunshineM', +            'uploader_id': '3552827', +            'age_limit': 0, +            'duration': 223.72,              'view_count': int,              'like_count': int, +            'comment_count': int, +        }, +        'params': { +            'skip_download': True,          },      }, {          # tests uploader field @@ -33,63 +58,95 @@ class VidmeIE(InfoExtractor):              'ext': 'mp4',              'title': 'The Carver',              'description': 'md5:e9c24870018ae8113be936645b93ba3c', -            'duration': 97.859999999999999, +            'thumbnail': 're:^https?://.*\.jpg',              'timestamp': 1433203629,              'upload_date': '20150602',              'uploader': 'Thomas', -            'thumbnail': 're:^https?://.*\.jpg', +            'uploader_id': '109747', +            'age_limit': 0, +            'duration': 97.859999999999999,              'view_count': int,              'like_count': int, +            'comment_count': int,          },          'params': {              'skip_download': True,          },      }, { -        # From http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching +        # nsfw test from http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching          'url': 'https://vid.me/e/Wmur', -        'only_matching': True, +        'info_dict': { +            'id': 'Wmur', +            'ext': 'mp4', +            'title': 'naked smoking & stretching', +            'thumbnail': 're:^https?://.*\.jpg', +            'timestamp': 1430931613, +            'upload_date': '20150506', +            'uploader': 'naked-yogi', +            'uploader_id': '1638622', +            'age_limit': 18, +            'duration': 653.26999999999998, +            'view_count': int, +            'like_count': int, +            'comment_count': int, +        }, +        'params': { +            'skip_download': True, +        },      }]      def _real_extract(self, url): -        url = url.replace('vid.me/e/', 'vid.me/')          video_id = self._match_id(url) -        webpage = self._download_webpage(url, video_id) -        video_url = self._html_search_regex( -            r'<source src="([^"]+)"', webpage, 'video URL') +        try: +            response = self._download_json( +                'https://api.vid.me/videoByUrl/%s' % video_id, video_id) +        except ExtractorError as e: +            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: +                response = self._parse_json(e.cause.read(), video_id) +            else: +                raise + +        error = response.get('error') +        if error: +            raise ExtractorError( +                '%s returned error: %s' % (self.IE_NAME, error), expected=True) -        title = self._og_search_title(webpage) -        description = self._og_search_description(webpage, default='') -        thumbnail = self._og_search_thumbnail(webpage) -        timestamp = int_or_none(self._og_search_property( -            'updated_time', webpage, fatal=False)) -        width = int_or_none(self._og_search_property( -            'video:width', webpage, fatal=False)) -        height = int_or_none(self._og_search_property( -            'video:height', webpage, fatal=False)) -        duration = float_or_none(self._html_search_regex( -            r'data-duration="([^"]+)"', webpage, 'duration', fatal=False)) -        view_count = str_to_int(self._html_search_regex( -            r'<(?:li|span) class="video_views">\s*([\d,\.]+)\s*plays?', -            webpage, 'view count', fatal=False)) -        like_count = str_to_int(self._html_search_regex( -            r'class="score js-video-vote-score"[^>]+data-score="([\d,\.\s]+)">', -            webpage, 'like count', fatal=False)) -        uploader = self._html_search_regex( -            'class="video_author_username"[^>]*>([^<]+)', -            webpage, 'uploader', default=None) +        video = response['video'] + +        formats = [{ +            'format_id': f.get('type'), +            'url': f['uri'], +            'width': int_or_none(f.get('width')), +            'height': int_or_none(f.get('height')), +            'preference': 0 if f.get('type', '').endswith('clip') else 1, +        } for f in video.get('formats', []) if f.get('uri')] +        self._sort_formats(formats) + +        title = video['title'] +        description = video.get('description') +        thumbnail = video.get('thumbnail_url') +        timestamp = parse_iso8601(video.get('date_created'), ' ') +        uploader = video.get('user', {}).get('username') +        uploader_id = video.get('user', {}).get('user_id') +        age_limit = 18 if video.get('nsfw') is True else 0 +        duration = float_or_none(video.get('duration')) +        view_count = int_or_none(video.get('view_count')) +        like_count = int_or_none(video.get('likes_count')) +        comment_count = int_or_none(video.get('comment_count'))          return {              'id': video_id, -            'url': video_url,              'title': title,              'description': description,              'thumbnail': thumbnail, +            'uploader': uploader, +            'uploader_id': uploader_id, +            'age_limit': age_limit,              'timestamp': timestamp, -            'width': width, -            'height': height,              'duration': duration,              'view_count': view_count,              'like_count': like_count, -            'uploader': uploader, +            'comment_count': comment_count, +            'formats': formats,          } diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py index 15377097e..c76c20614 100644 --- a/youtube_dl/extractor/vier.py +++ b/youtube_dl/extractor/vier.py @@ -2,6 +2,7 @@  from __future__ import unicode_literals  import re +import itertools  from .common import InfoExtractor @@ -91,31 +92,27 @@ class VierVideosIE(InfoExtractor):          mobj = re.match(self._VALID_URL, url)          program = mobj.group('program') -        webpage = self._download_webpage(url, program) -          page_id = mobj.group('page')          if page_id:              page_id = int(page_id)              start_page = page_id -            last_page = start_page + 1              playlist_id = '%s-page%d' % (program, page_id)          else:              start_page = 0 -            last_page = int(self._search_regex( -                r'videos\?page=(\d+)">laatste</a>', -                webpage, 'last page', default=0)) + 1              playlist_id = program          entries = [] -        for current_page_id in range(start_page, last_page): +        for current_page_id in itertools.count(start_page):              current_page = self._download_webpage(                  'http://www.vier.be/%s/videos?page=%d' % (program, current_page_id),                  program, -                'Downloading page %d' % (current_page_id + 1)) if current_page_id != page_id else webpage +                'Downloading page %d' % (current_page_id + 1))              page_entries = [                  self.url_result('http://www.vier.be' + video_url, 'Vier')                  for video_url in re.findall(                      r'<h3><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)]              entries.extend(page_entries) +            if page_id or '>Meer<' not in current_page: +                break          return self.playlist_result(entries, playlist_id) diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index cda02ba24..632e57fb4 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -3,12 +3,14 @@ from __future__ import unicode_literals  from .common import InfoExtractor  from ..compat import ( +    compat_HTTPError,      compat_urllib_request,      compat_urllib_parse,      compat_urllib_parse_unquote,  )  from ..utils import (      determine_ext, +    ExtractorError,      int_or_none,      parse_iso8601,      HEADRequest, @@ -16,14 +18,14 @@ from ..utils import (  class ViewsterIE(InfoExtractor): -    _VALID_URL = r'http://(?:www\.)?viewster\.com/(?:serie|movie)/(?P<id>\d+-\d+-\d+)' +    _VALID_URL = r'https?://(?:www\.)?viewster\.com/(?:serie|movie)/(?P<id>\d+-\d+-\d+)'      _TESTS = [{          # movie, Type=Movie          'url': 'http://www.viewster.com/movie/1140-11855-000/the-listening-project/', -        'md5': '14d3cfffe66d57b41ae2d9c873416f01', +        'md5': 'e642d1b27fcf3a4ffa79f194f5adde36',          'info_dict': {              'id': '1140-11855-000', -            'ext': 'flv', +            'ext': 'mp4',              'title': 'The listening Project',              'description': 'md5:bac720244afd1a8ea279864e67baa071',              'timestamp': 1214870400, @@ -33,10 +35,10 @@ class ViewsterIE(InfoExtractor):      }, {          # series episode, Type=Episode          'url': 'http://www.viewster.com/serie/1284-19427-001/the-world-and-a-wall/', -        'md5': 'd5434c80fcfdb61651cc2199a88d6ba3', +        'md5': '9243079a8531809efe1b089db102c069',          'info_dict': {              'id': '1284-19427-001', -            'ext': 'flv', +            'ext': 'mp4',              'title': 'The World and a Wall',              'description': 'md5:24814cf74d3453fdf5bfef9716d073e3',              'timestamp': 1428192000, @@ -61,6 +63,14 @@ class ViewsterIE(InfoExtractor):              'description': 'md5:e7097a8fc97151e25f085c9eb7a1cdb1',          },          'playlist_mincount': 16, +    }, { +        # geo restricted series +        'url': 'https://www.viewster.com/serie/1280-18794-002/', +        'only_matching': True, +    }, { +        # geo restricted video +        'url': 'https://www.viewster.com/serie/1280-18794-002/what-is-extraterritoriality-lawo/', +        'only_matching': True,      }]      _ACCEPT_HEADER = 'application/json, text/javascript, */*; q=0.01' @@ -74,8 +84,8 @@ class ViewsterIE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url)          # Get 'api_token' cookie -        self._request_webpage(HEADRequest(url), video_id) -        cookies = self._get_cookies(url) +        self._request_webpage(HEADRequest('http://www.viewster.com/'), video_id) +        cookies = self._get_cookies('http://www.viewster.com/')          self._AUTH_TOKEN = compat_urllib_parse_unquote(cookies['api_token'].value)          info = self._download_json( @@ -85,10 +95,16 @@ class ViewsterIE(InfoExtractor):          entry_id = info.get('Id') or info['id']          # unfinished serie has no Type -        if info.get('Type') in ['Serie', None]: -            episodes = self._download_json( -                'https://public-api.viewster.com/series/%s/episodes' % entry_id, -                video_id, 'Downloading series JSON') +        if info.get('Type') in ('Serie', None): +            try: +                episodes = self._download_json( +                    'https://public-api.viewster.com/series/%s/episodes' % entry_id, +                    video_id, 'Downloading series JSON') +            except ExtractorError as e: +                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: +                    self.raise_geo_restricted() +                else: +                    raise              entries = [                  self.url_result(                      'http://www.viewster.com/movie/%s' % episode['OriginId'], 'Viewster') @@ -98,7 +114,7 @@ class ViewsterIE(InfoExtractor):              return self.playlist_result(entries, video_id, title, description)          formats = [] -        for media_type in ('application/f4m+xml', 'application/x-mpegURL'): +        for media_type in ('application/f4m+xml', 'application/x-mpegURL', 'video/mp4'):              media = self._download_json(                  'https://public-api.viewster.com/movies/%s/video?mediaType=%s'                  % (entry_id, compat_urllib_parse.quote(media_type)), @@ -120,9 +136,22 @@ class ViewsterIE(InfoExtractor):                      fatal=False  # m3u8 sometimes fail                  ))              else: -                formats.append({ +                format_id = media.get('Bitrate') +                f = {                      'url': video_url, -                }) +                    'format_id': 'mp4-%s' % format_id, +                    'height': int_or_none(media.get('Height')), +                    'width': int_or_none(media.get('Width')), +                    'preference': 1, +                } +                if format_id and not f['height']: +                    f['height'] = int_or_none(self._search_regex( +                        r'^(\d+)[pP]$', format_id, 'height', default=None)) +                formats.append(f) + +        if not formats and not info.get('LanguageSets') and not info.get('VODSettings'): +            self.raise_geo_restricted() +          self._sort_formats(formats)          synopsis = info.get('Synopsis', {}) diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py index 72eb010f8..ec8b99998 100644 --- a/youtube_dl/extractor/washingtonpost.py +++ b/youtube_dl/extractor/washingtonpost.py @@ -19,25 +19,25 @@ class WashingtonPostIE(InfoExtractor):              'title': 'Sinkhole of bureaucracy',          },          'playlist': [{ -            'md5': '79132cc09ec5309fa590ae46e4cc31bc', +            'md5': 'b9be794ceb56c7267d410a13f99d801a',              'info_dict': {                  'id': 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f',                  'ext': 'mp4',                  'title': 'Breaking Points: The Paper Mine', -                'duration': 1287, +                'duration': 1290,                  'description': 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.',                  'uploader': 'The Washington Post',                  'timestamp': 1395527908,                  'upload_date': '20140322',              },          }, { -            'md5': 'e1d5734c06865cc504ad99dc2de0d443', +            'md5': '1fff6a689d8770966df78c8cb6c8c17c',              'info_dict': {                  'id': '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f',                  'ext': 'mp4',                  'title': 'The town bureaucracy sustains',                  'description': 'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it\'s like to do paperwork 230 feet underground.', -                'duration': 2217, +                'duration': 2220,                  'timestamp': 1395528005,                  'upload_date': '20140322',                  'uploader': 'The Washington Post', diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 97315750f..8938c0e45 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -63,7 +63,9 @@ class XHamsterIE(InfoExtractor):          mrss_url = '%s://xhamster.com/movies/%s/%s.html' % (proto, video_id, seo)          webpage = self._download_webpage(mrss_url, video_id) -        title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage, 'title') +        title = self._html_search_regex( +            [r'<title>(?P<title>.+?)(?:, (?:[^,]+? )?Porn: xHamster| - xHamster\.com)</title>', +             r'<h1>([^<]+)</h1>'], webpage, 'title')          # Only a few videos have an description          mobj = re.search(r'<span>Description: </span>([^<]+)', webpage) diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py index 5aac8adb3..8bbac54e2 100644 --- a/youtube_dl/extractor/xuite.py +++ b/youtube_dl/extractor/xuite.py @@ -19,7 +19,7 @@ class XuiteIE(InfoExtractor):      _TESTS = [{          # Audio          'url': 'http://vlog.xuite.net/play/RGkzc1ZULTM4NjA5MTQuZmx2', -        'md5': '63a42c705772aa53fd4c1a0027f86adf', +        'md5': 'e79284c87b371424885448d11f6398c8',          'info_dict': {              'id': '3860914',              'ext': 'mp3', diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index b8579b573..fca5ddc69 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -144,6 +144,17 @@ class YahooIE(InfoExtractor):          }, {              'url': 'https://tw.news.yahoo.com/-100120367.html',              'only_matching': True, +        }, { +            # Query result is embedded in webpage, but explicit request to video API fails with geo restriction +            'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html', +            'md5': '4fbafb9c9b6f07aa8f870629f6671b35', +            'info_dict': { +                'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504', +                'ext': 'mp4', +                'title': 'Communitary - Community Episode 1: Ladders', +                'description': 'md5:8fc39608213295748e1e289807838c97', +                'duration': 1646, +            },          }      ] @@ -171,6 +182,19 @@ class YahooIE(InfoExtractor):          if nbc_sports_url:              return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') +        # Query result is often embedded in webpage as JSON. Sometimes explicit requests +        # to video API results in a failure with geo restriction reason therefore using +        # embedded query result when present sounds reasonable. +        config_json = self._search_regex( +            r'window\.Af\.bootstrap\[[^\]]+\]\s*=\s*({.*?"applet_type"\s*:\s*"td-applet-videoplayer".*?});(?:</script>|$)', +            webpage, 'videoplayer applet', default=None) +        if config_json: +            config = self._parse_json(config_json, display_id, fatal=False) +            if config: +                sapi = config.get('models', {}).get('applet_model', {}).get('data', {}).get('sapi') +                if sapi: +                    return self._extract_info(display_id, sapi, webpage) +          items_json = self._search_regex(              r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE,              default=None) @@ -190,22 +214,10 @@ class YahooIE(InfoExtractor):              video_id = info['id']          return self._get_info(video_id, display_id, webpage) -    def _get_info(self, video_id, display_id, webpage): -        region = self._search_regex( -            r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"', -            webpage, 'region', fatal=False, default='US') -        data = compat_urllib_parse.urlencode({ -            'protocol': 'http', -            'region': region, -        }) -        query_url = ( -            'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' -            '{id}?{data}'.format(id=video_id, data=data)) -        query_result = self._download_json( -            query_url, display_id, 'Downloading video info') - -        info = query_result['query']['results']['mediaObj'][0] +    def _extract_info(self, display_id, query, webpage): +        info = query['query']['results']['mediaObj'][0]          meta = info.get('meta') +        video_id = info.get('id')          if not meta:              msg = info['status'].get('msg') @@ -231,6 +243,9 @@ class YahooIE(InfoExtractor):                      'ext': 'flv',                  })              else: +                if s.get('format') == 'm3u8_playlist': +                    format_info['protocol'] = 'm3u8_native' +                    format_info['ext'] = 'mp4'                  format_url = compat_urlparse.urljoin(host, path)                  format_info['url'] = format_url              formats.append(format_info) @@ -264,6 +279,21 @@ class YahooIE(InfoExtractor):              'subtitles': subtitles,          } +    def _get_info(self, video_id, display_id, webpage): +        region = self._search_regex( +            r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"', +            webpage, 'region', fatal=False, default='US') +        data = compat_urllib_parse.urlencode({ +            'protocol': 'http', +            'region': region, +        }) +        query_url = ( +            'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' +            '{id}?{data}'.format(id=video_id, data=data)) +        query_result = self._download_json( +            query_url, display_id, 'Downloading video info') +        return self._extract_info(display_id, query_result, webpage) +  class YahooSearchIE(SearchInfoExtractor):      IE_DESC = 'Yahoo screen search' diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 97ce36550..b252e36e1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1654,12 +1654,15 @@ class YoutubeChannelIE(InfoExtractor):          channel_page = self._download_webpage(              url + '?view=57', channel_id,              'Downloading channel page', fatal=False) -        channel_playlist_id = self._html_search_meta( -            'channelId', channel_page, 'channel id', default=None) -        if not channel_playlist_id: -            channel_playlist_id = self._search_regex( -                r'data-channel-external-id="([^"]+)"', -                channel_page, 'channel id', default=None) +        if channel_page is False: +            channel_playlist_id = False +        else: +            channel_playlist_id = self._html_search_meta( +                'channelId', channel_page, 'channel id', default=None) +            if not channel_playlist_id: +                channel_playlist_id = self._search_regex( +                    r'data-channel-external-id="([^"]+)"', +                    channel_page, 'channel id', default=None)          if channel_playlist_id and channel_playlist_id.startswith('UC'):              playlist_id = 'UU' + channel_playlist_id[2:]              return self.url_result( @@ -1970,6 +1973,7 @@ class YoutubeTruncatedURLIE(InfoExtractor):              annotation_id=annotation_[^&]+|              x-yt-cl=[0-9]+|              hl=[^&]*| +            t=[0-9]+          )?          |              attribution_link\?a=[^&]+ @@ -1992,6 +1996,9 @@ class YoutubeTruncatedURLIE(InfoExtractor):      }, {          'url': 'https://www.youtube.com/watch?hl=en-GB',          'only_matching': True, +    }, { +        'url': 'https://www.youtube.com/watch?t=2372', +        'only_matching': True,      }]      def _real_extract(self, url): diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 1f723908b..4f320e124 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -135,7 +135,10 @@ class FFmpegPostProcessor(PostProcessor):          files_cmd = []          for path in input_paths: -            files_cmd.extend([encodeArgument('-i'), encodeFilename(path, True)]) +            files_cmd.extend([ +                encodeArgument('-i'), +                encodeFilename(self._ffmpeg_filename_argument(path), True) +            ])          cmd = ([encodeFilename(self.executable, True), encodeArgument('-y')] +                 files_cmd +                 [encodeArgument(o) for o in opts] + @@ -155,10 +158,10 @@ class FFmpegPostProcessor(PostProcessor):          self.run_ffmpeg_multiple_files([path], out_path, opts)      def _ffmpeg_filename_argument(self, fn): -        # ffmpeg broke --, see https://ffmpeg.org/trac/ffmpeg/ticket/2127 for details -        if fn.startswith('-'): -            return './' + fn -        return fn +        # Always use 'file:' because the filename may contain ':' (ffmpeg +        # interprets that as a protocol) or can start with '-' (-- is broken in +        # ffmpeg, see https://ffmpeg.org/trac/ffmpeg/ticket/2127 for details) +        return 'file:' + fn  class FFmpegExtractAudioPP(FFmpegPostProcessor): diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 206dd56bc..1dc3153fd 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -619,7 +619,7 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):      # expected HTTP responses to meet HTTP/1.0 or later (see also      # https://github.com/rg3/youtube-dl/issues/6727)      if sys.version_info < (3, 0): -        kwargs['strict'] = True +        kwargs[b'strict'] = True      hc = http_class(*args, **kwargs)      source_address = ydl_handler._params.get('source_address')      if source_address is not None: diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 6bc689b75..7ef4f2755 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@  from __future__ import unicode_literals -__version__ = '2015.09.03' +__version__ = '2015.09.22' | 
