diff options
| -rw-r--r-- | AUTHORS | 1 | ||||
| -rw-r--r-- | test/test_compat.py | 2 | ||||
| -rwxr-xr-x | youtube_dl/YoutubeDL.py | 7 | ||||
| -rw-r--r-- | youtube_dl/__init__.py | 3 | ||||
| -rw-r--r-- | youtube_dl/cache.py | 4 | ||||
| -rw-r--r-- | youtube_dl/compat.py | 24 | ||||
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/arte.py | 114 | ||||
| -rw-r--r-- | youtube_dl/extractor/bliptv.py | 14 | ||||
| -rw-r--r-- | youtube_dl/extractor/brightcove.py | 4 | ||||
| -rw-r--r-- | youtube_dl/extractor/comedycentral.py | 5 | ||||
| -rw-r--r-- | youtube_dl/extractor/common.py | 44 | ||||
| -rw-r--r-- | youtube_dl/extractor/folketinget.py | 75 | ||||
| -rw-r--r-- | youtube_dl/extractor/generic.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/goldenmoustache.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/mtv.py | 3 | ||||
| -rw-r--r-- | youtube_dl/extractor/rtlnl.py | 10 | ||||
| -rw-r--r-- | youtube_dl/extractor/stanfordoc.py | 88 | ||||
| -rw-r--r-- | youtube_dl/extractor/sztvhu.py | 24 | ||||
| -rw-r--r-- | youtube_dl/extractor/telebruxelles.py | 60 | ||||
| -rw-r--r-- | youtube_dl/extractor/vh1.py | 5 | ||||
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 20 | ||||
| -rw-r--r-- | youtube_dl/jsinterp.py | 2 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 28 | ||||
| -rw-r--r-- | youtube_dl/version.py | 2 | 
25 files changed, 384 insertions, 161 deletions
| @@ -82,3 +82,4 @@ Xavier Beynon  Gabriel Schubiner  xantares  Jan Matějka +Mauroy Sébastien diff --git a/test/test_compat.py b/test/test_compat.py index 4a7fc3606..1eb454e06 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -26,11 +26,13 @@ class TestCompat(unittest.TestCase):          self.assertEqual(compat_getenv('YOUTUBE-DL-TEST'), test_str)      def test_compat_expanduser(self): +        old_home = os.environ.get('HOME')          test_str = 'C:\Documents and Settings\тест\Application Data'          os.environ['HOME'] = (              test_str if sys.version_info >= (3, 0)              else test_str.encode(get_filesystem_encoding()))          self.assertEqual(compat_expanduser('~'), test_str) +        os.environ['HOME'] = old_home      def test_all_present(self):          import youtube_dl.compat diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 5c875b497..fde026fbf 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -624,7 +624,7 @@ class YoutubeDL(object):              return self.process_ie_result(                  new_result, download=download, extra_info=extra_info) -        elif result_type == 'playlist': +        elif result_type == 'playlist' or result_type == 'multi_video':              # We process each entry in the playlist              playlist = ie_result.get('title', None) or ie_result.get('id', None)              self.to_screen('[download] Downloading playlist: %s' % playlist) @@ -679,6 +679,9 @@ class YoutubeDL(object):              ie_result['entries'] = playlist_results              return ie_result          elif result_type == 'compat_list': +            self.report_warning( +                'Extractor %s returned a compat_list result. ' +                'It needs to be updated.' 
% ie_result.get('extractor'))              def _fixup(r):                  self.add_extra_info(r,                      { @@ -1001,7 +1004,7 @@ class YoutubeDL(object):              else:                  self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)                  try: -                    write_json_file(info_dict, encodeFilename(infofn)) +                    write_json_file(info_dict, infofn)                  except (OSError, IOError):                      self.report_error('Cannot write metadata to JSON file ' + infofn)                      return diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 8e186a0db..c1323b4f3 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -19,6 +19,7 @@ from .compat import (      compat_expanduser,      compat_getpass,      compat_print, +    workaround_optparse_bug9161,  )  from .utils import (      DateRange, @@ -57,6 +58,8 @@ def _real_main(argv=None):          # https://github.com/rg3/youtube-dl/issues/820          codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None) +    workaround_optparse_bug9161() +      setproctitle('youtube-dl')      parser, opts, args = parseOpts(argv) diff --git a/youtube_dl/cache.py b/youtube_dl/cache.py index 2d9b426cb..5fe839eb1 100644 --- a/youtube_dl/cache.py +++ b/youtube_dl/cache.py @@ -8,7 +8,7 @@ import re  import shutil  import traceback -from .compat import compat_expanduser +from .compat import compat_expanduser, compat_getenv  from .utils import write_json_file @@ -19,7 +19,7 @@ class Cache(object):      def _get_root_dir(self):          res = self._ydl.params.get('cachedir')          if res is None: -            cache_root = os.environ.get('XDG_CACHE_HOME', '~/.cache') +            cache_root = compat_getenv('XDG_CACHE_HOME', '~/.cache')              res = os.path.join(cache_root, 'youtube-dl')          return compat_expanduser(res) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py 
index 385924803..9d33a8ec5 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -1,6 +1,7 @@  from __future__ import unicode_literals  import getpass +import optparse  import os  import subprocess  import sys @@ -297,6 +298,28 @@ except TypeError:  else:      compat_kwargs = lambda kwargs: kwargs + +# Fix https://github.com/rg3/youtube-dl/issues/4223 +# See http://bugs.python.org/issue9161 for what is broken +def workaround_optparse_bug9161(): +    op = optparse.OptionParser() +    og = optparse.OptionGroup(op, 'foo') +    try: +        og.add_option('-t') +    except TypeError: +        real_add_option = optparse.OptionGroup.add_option + +        def _compat_add_option(self, *args, **kwargs): +            enc = lambda v: ( +                v.encode('ascii', 'replace') if isinstance(v, compat_str) +                else v) +            bargs = [enc(a) for a in args] +            bkwargs = dict( +                (k, enc(v)) for k, v in kwargs.items()) +            return real_add_option(self, *bargs, **bkwargs) +        optparse.OptionGroup.add_option = _compat_add_option + +  __all__ = [      'compat_HTTPError',      'compat_chr', @@ -323,4 +346,5 @@ __all__ = [      'compat_xml_parse_error',      'shlex_quote',      'subprocess_check_output', +    'workaround_optparse_bug9161',  ] diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index b687a56b4..fcb75af34 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -115,6 +115,7 @@ from .fktv import (      FKTVPosteckeIE,  )  from .flickr import FlickrIE +from .folketinget import FolketingetIE  from .fourtube import FourTubeIE  from .franceculture import FranceCultureIE  from .franceinter import FranceInterIE @@ -379,6 +380,7 @@ from .teachingchannel import TeachingChannelIE  from .teamcoco import TeamcocoIE  from .techtalks import TechTalksIE  from .ted import TEDIE +from .telebruxelles import TeleBruxellesIE  from .telecinco import TelecincoIE  
from .telemb import TeleMBIE  from .tenplay import TenPlayIE diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index b9a9440c0..3a57ce527 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -5,13 +5,12 @@ import re  from .common import InfoExtractor  from ..utils import ( -    ExtractorError,      find_xpath_attr,      unified_strdate, -    determine_ext,      get_element_by_id,      get_element_by_attribute,      int_or_none, +    qualities,  )  # There are different sources of video in arte.tv, the extraction process  @@ -102,79 +101,54 @@ class ArteTVPlus7IE(InfoExtractor):              'upload_date': unified_strdate(upload_date_str),              'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),          } +        qfunc = qualities(['HQ', 'MQ', 'EQ', 'SQ']) -        all_formats = [] +        formats = []          for format_id, format_dict in player_info['VSR'].items(): -            fmt = dict(format_dict) -            fmt['format_id'] = format_id -            all_formats.append(fmt) -        # Some formats use the m3u8 protocol -        all_formats = list(filter(lambda f: f.get('videoFormat') != 'M3U8', all_formats)) -        def _match_lang(f): -            if f.get('versionCode') is None: -                return True -            # Return true if that format is in the language of the url -            if lang == 'fr': -                l = 'F' -            elif lang == 'de': -                l = 'A' -            else: -                l = lang -            regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l] -            return any(re.match(r, f['versionCode']) for r in regexes) -        # Some formats may not be in the same language as the url -        # TODO: Might want not to drop videos that does not match requested language -        # but to process those formats with lower precedence -        formats = filter(_match_lang, all_formats) -        formats = list(formats)  # in 
python3 filter returns an iterator -        if not formats: -            # Some videos are only available in the 'Originalversion' -            # they aren't tagged as being in French or German -            # Sometimes there are neither videos of requested lang code -            # nor original version videos available -            # For such cases we just take all_formats as is -            formats = all_formats -            if not formats: -                raise ExtractorError('The formats list is empty') - -        if re.match(r'[A-Z]Q', formats[0]['quality']) is not None: -            def sort_key(f): -                return ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality']) -        else: -            def sort_key(f): -                versionCode = f.get('versionCode') -                if versionCode is None: -                    versionCode = '' -                return ( -                    # Sort first by quality -                    int(f.get('height', -1)), -                    int(f.get('bitrate', -1)), -                    # The original version with subtitles has lower relevance -                    re.match(r'VO-ST(F|A)', versionCode) is None, -                    # The version with sourds/mal subtitles has also lower relevance -                    re.match(r'VO?(F|A)-STM\1', versionCode) is None, -                    # Prefer http downloads over m3u8 -                    0 if f['url'].endswith('m3u8') else 1, -                ) -        formats = sorted(formats, key=sort_key) -        def _format(format_info): -            info = { -                'format_id': format_info['format_id'], -                'format_note': '%s, %s' % (format_info.get('versionCode'), format_info.get('versionLibelle')), -                'width': int_or_none(format_info.get('width')), -                'height': int_or_none(format_info.get('height')), -                'tbr': int_or_none(format_info.get('bitrate')), +            f = dict(format_dict) +            versionCode = 
f.get('versionCode') + +            langcode = { +                'fr': 'F', +                'de': 'A', +            }.get(lang, lang) +            lang_rexs = [r'VO?%s' % langcode, r'VO?.-ST%s' % langcode] +            lang_pref = ( +                None if versionCode is None else ( +                    10 if any(re.match(r, versionCode) for r in lang_rexs) +                    else -10)) +            source_pref = 0 +            if versionCode is not None: +                # The original version with subtitles has lower relevance +                if re.match(r'VO-ST(F|A)', versionCode): +                    source_pref -= 10 +                # The version with sourds/mal subtitles has also lower relevance +                elif re.match(r'VO?(F|A)-STM\1', versionCode): +                    source_pref -= 9 +            format = { +                'format_id': format_id, +                'preference': -10 if f.get('videoFormat') == 'M3U8' else None, +                'language_preference': lang_pref, +                'format_note': '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')), +                'width': int_or_none(f.get('width')), +                'height': int_or_none(f.get('height')), +                'tbr': int_or_none(f.get('bitrate')), +                'quality': qfunc(f['quality']), +                'source_preference': source_pref,              } -            if format_info['mediaType'] == 'rtmp': -                info['url'] = format_info['streamer'] -                info['play_path'] = 'mp4:' + format_info['url'] -                info['ext'] = 'flv' + +            if f.get('mediaType') == 'rtmp': +                format['url'] = f['streamer'] +                format['play_path'] = 'mp4:' + f['url'] +                format['ext'] = 'flv'              else: -                info['url'] = format_info['url'] -                info['ext'] = determine_ext(info['url']) -            return info -        info_dict['formats'] = [_format(f) for f in 
formats] +                format['url'] = f['url'] + +            formats.append(format) + +        self._sort_formats(formats) +        info_dict['formats'] = formats          return info_dict diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index 2370c24b0..f2b02643d 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -166,9 +166,17 @@ class BlipTVIE(SubtitlesInfoExtractor):  class BlipTVUserIE(InfoExtractor): -    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)(?!api\.swf)([^/]+)/*$' +    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?blip\.tv/)|bliptvuser:)(?!api\.swf)([^/]+)/*$'      _PAGE_SIZE = 12      IE_NAME = 'blip.tv:user' +    _TEST = { +        'url': 'http://blip.tv/actone', +        'info_dict': { +            'id': 'actone', +            'title': 'Act One: The Series', +        }, +        'playlist_count': 5, +    }      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) @@ -179,6 +187,7 @@ class BlipTVUserIE(InfoExtractor):          page = self._download_webpage(url, username, 'Downloading user page')          mobj = re.search(r'data-users-id="([^"]+)"', page)          page_base = page_base % mobj.group(1) +        title = self._og_search_title(page)          # Download video ids using BlipTV Ajax calls. 
Result size per          # query is limited (currently to 12 videos) so we need to query @@ -215,4 +224,5 @@ class BlipTVUserIE(InfoExtractor):          urls = ['http://blip.tv/%s' % video_id for video_id in video_ids]          url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls] -        return [self.playlist_result(url_entries, playlist_title=username)] +        return self.playlist_result( +            url_entries, playlist_title=title, playlist_id=username) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index a6920685e..2db7f9fef 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -111,6 +111,8 @@ class BrightcoveIE(InfoExtractor):                              lambda m: m.group(1) + '/>', object_str)          # Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608          object_str = object_str.replace('<--', '<!--') +        # remove namespace to simplify extraction +        object_str = re.sub(r'(<object[^>]*)(xmlns=".*?")', r'\1', object_str)          object_str = fix_xml_ampersands(object_str)          object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8')) @@ -219,7 +221,7 @@ class BrightcoveIE(InfoExtractor):          webpage = self._download_webpage(req, video_id)          error_msg = self._html_search_regex( -            r"<h1>We're sorry.</h1>\s*<p>(.*?)</p>", webpage, +            r"<h1>We're sorry.</h1>([\s\n]*<p>.*?</p>)+", webpage,              'error message', default=None)          if error_msg is not None:              raise ExtractorError( diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 49b978b4e..2e3ef3fda 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals  import re -from .common import InfoExtractor  from .mtv import MTVServicesInfoExtractor  from ..utils import (     
 compat_str, @@ -110,9 +109,7 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor):      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url, re.VERBOSE) -        if mobj is None: -            raise ExtractorError('Invalid URL: %s' % url) +        mobj = re.match(self._VALID_URL, url)          if mobj.group('shortname'):              if mobj.group('shortname') in ('tds', 'thedailyshow'): diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index b77f0e519..93a5a3d57 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -43,7 +43,11 @@ class InfoExtractor(object):      information possibly downloading the video to the file system, among      other possible outcomes. -    The dictionaries must include the following fields: +    The type field determines the type of the result. +    By far the most common value (and the default if _type is missing) is +    "video", which indicates a single video. + +    For a video, the dictionaries must include the following fields:      id:             Video identifier.      title:          Video title, unescaped. @@ -87,6 +91,11 @@ class InfoExtractor(object):                                   by this field, regardless of all other values.                                   -1 for default (order by other properties),                                   -2 or smaller for less than default. +                    * language_preference  Is this in the correct requested +                                 language? +                                 10 if it's what the URL is about, +                                 -1 for default (don't know), +                                 -10 otherwise, other values reserved for now.                      * quality    Order number of the video quality of this                                   format, irrespective of the file format.                                   
 -1 for default (order by other properties), @@ -146,6 +155,38 @@ class InfoExtractor(object):      Unless mentioned otherwise, None is equivalent to absence of information. + +    _type "playlist" indicates multiple videos. +    There must be a key "entries", which is a list or a PagedList object, each +    element of which is a valid dictionary under this specification. + +    Additionally, playlists can have "title" and "id" attributes with the same +    semantics as videos (see above). + + +    _type "multi_video" indicates that there are multiple videos that +    form a single show, for example multiple acts of an opera or TV episode. +    It must have an entries key like a playlist and contain all the keys +    required for a video at the same time. + + +    _type "url" indicates that the video must be extracted from another +    location, possibly by a different extractor. Its only required key is: +    "url" - the next URL to extract. + +    Additionally, it may have properties believed to be identical to the +    resolved entity, for example "title" if the title of the referred video is +    known ahead of time. + + +    _type "url_transparent" entities have the same specification as "url", but +    indicate that the given additional information is more precise than the one +    associated with the resolved URL. +    This is useful when a site employs a video service that hosts the video and +    its technical metadata, but that video service does not embed a useful +    title, description etc. + +      Subclasses of this one should re-define the _real_initialize() and      _real_extract() methods and define a _VALID_URL regexp.      Probably, they should also be added to the list of extractors. 
@@ -615,6 +656,7 @@ class InfoExtractor(object):              return (                  preference, +                f.get('language_preference') if f.get('language_preference') is not None else -1,                  f.get('quality') if f.get('quality') is not None else -1,                  f.get('height') if f.get('height') is not None else -1,                  f.get('width') if f.get('width') is not None else -1, diff --git a/youtube_dl/extractor/folketinget.py b/youtube_dl/extractor/folketinget.py new file mode 100644 index 000000000..68e2db943 --- /dev/null +++ b/youtube_dl/extractor/folketinget.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_parse_qs +from ..utils import ( +    int_or_none, +    parse_duration, +    parse_iso8601, +    xpath_text, +) + + +class FolketingetIE(InfoExtractor): +    IE_DESC = 'Folketinget (ft.dk; Danish parliament)' +    _VALID_URL = r'https?://(?:www\.)?ft\.dk/webtv/video/[^?#]*?\.(?P<id>[0-9]+)\.aspx' +    _TEST = { +        'url': 'http://www.ft.dk/webtv/video/20141/eru/td.1165642.aspx?as=1#player', +        'info_dict': { +            'id': '1165642', +            'ext': 'mp4', +            'title': 'Åbent samråd i Erhvervsudvalget', +            'description': 'Åbent samråd med erhvervs- og vækstministeren om regeringens politik på teleområdet', +            'view_count': int, +            'width': 768, +            'height': 432, +            'tbr': 928000, +            'timestamp': 1416493800, +            'upload_date': '20141120', +            'duration': 3960, +        }, +        'params': { +            'skip_download': 'rtmpdump required', +        } +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) + +        title = self._og_search_title(webpage) +        description = self._html_search_regex( +            r'(?s)<div 
class="video-item-agenda"[^>]*>(.*?)<', +            webpage, 'description', fatal=False) + +        player_params = compat_parse_qs(self._search_regex( +            r'<embed src="http://ft\.arkena\.tv/flash/ftplayer\.swf\?([^"]+)"', +            webpage, 'player params')) +        xml_url = player_params['xml'][0] +        doc = self._download_xml(xml_url, video_id) + +        timestamp = parse_iso8601(xpath_text(doc, './/date')) +        duration = parse_duration(xpath_text(doc, './/duration')) +        width = int_or_none(xpath_text(doc, './/width')) +        height = int_or_none(xpath_text(doc, './/height')) +        view_count = int_or_none(xpath_text(doc, './/views')) + +        formats = [{ +            'format_id': n.attrib['bitrate'], +            'url': xpath_text(n, './url', fatal=True), +            'tbr': int_or_none(n.attrib['bitrate']), +        } for n in doc.findall('.//streams/stream')] +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': title, +            'formats': formats, +            'description': description, +            'timestamp': timestamp, +            'width': width, +            'height': height, +            'duration': duration, +            'view_count': view_count, +        } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index af769ab61..c7a824c29 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -979,7 +979,7 @@ class GenericIE(InfoExtractor):                  found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))          if not found:              # HTML5 video -            found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src="([^"]+)"', webpage) +            found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)          if not found:              found = re.search(                  
r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")' diff --git a/youtube_dl/extractor/goldenmoustache.py b/youtube_dl/extractor/goldenmoustache.py index 7e13b131b..10001d4d9 100644 --- a/youtube_dl/extractor/goldenmoustache.py +++ b/youtube_dl/extractor/goldenmoustache.py @@ -1,9 +1,7 @@  from __future__ import unicode_literals -import re  from .common import InfoExtractor  from ..utils import ( -    parse_duration,      int_or_none,  ) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 474bdff7d..5f0f476b6 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -145,7 +145,8 @@ class MTVServicesInfoExtractor(InfoExtractor):          idoc = self._download_xml(              feed_url + '?' + data, video_id,              'Downloading info', transform_source=fix_xml_ampersands) -        return [self._get_video_info(item) for item in idoc.findall('.//item')] +        return self.playlist_result( +            [self._get_video_info(item) for item in idoc.findall('.//item')])      def _real_extract(self, url):          title = url_basename(url) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index 0ab1eb69c..4a188e5d4 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -38,10 +38,11 @@ class RtlXlIE(InfoExtractor):          progname = info['abstracts'][0]['name']          subtitle = material['title'] or info['episodes'][0]['name'] -        videopath = material['videopath'] -        f4m_url = 'http://manifest.us.rtl.nl' + videopath +        # Use unencrypted m3u8 streams (See https://github.com/rg3/youtube-dl/issues/4118) +        videopath = material['videopath'].replace('.f4m', '.m3u8') +        m3u8_url = 'http://manifest.us.rtl.nl' + videopath -        formats = self._extract_f4m_formats(f4m_url, uuid) +        formats = self._extract_m3u8_formats(m3u8_url, uuid, ext='mp4')          video_urlpart = videopath.split('/flash/')[1][:-4]          
PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4' @@ -54,9 +55,12 @@ class RtlXlIE(InfoExtractor):              {                  'url': PG_URL_TEMPLATE % ('a3m', video_urlpart),                  'format_id': 'pg-hd', +                'quality': 0,              }          ]) +        self._sort_formats(formats) +          return {              'id': uuid,              'title': '%s - %s' % (progname, subtitle), diff --git a/youtube_dl/extractor/stanfordoc.py b/youtube_dl/extractor/stanfordoc.py index 44c52c718..5feb4ff83 100644 --- a/youtube_dl/extractor/stanfordoc.py +++ b/youtube_dl/extractor/stanfordoc.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals +  import re  from .common import InfoExtractor @@ -9,24 +11,23 @@ from ..utils import (  class StanfordOpenClassroomIE(InfoExtractor): -    IE_NAME = u'stanfordoc' -    IE_DESC = u'Stanford Open ClassRoom' -    _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' +    IE_NAME = 'stanfordoc' +    IE_DESC = 'Stanford Open ClassRoom' +    _VALID_URL = r'https?://openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'      _TEST = { -        u'url': u'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100', -        u'file': u'PracticalUnix_intro-environment.mp4', -        u'md5': u'544a9468546059d4e80d76265b0443b8', -        u'info_dict': { -            u"title": u"Intro Environment" +        'url': 'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100', +        'md5': '544a9468546059d4e80d76265b0443b8', +        'info_dict': { +            'id': 'PracticalUnix_intro-environment', +            'ext': 'mp4', +            'title': 
'Intro Environment',          }      }      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        if mobj is None: -            raise ExtractorError(u'Invalid URL: %s' % url) -        if mobj.group('course') and mobj.group('video'): # A specific video +        if mobj.group('course') and mobj.group('video'):  # A specific video              course = mobj.group('course')              video = mobj.group('video')              info = { @@ -35,7 +36,6 @@ class StanfordOpenClassroomIE(InfoExtractor):                  'upload_date': None,              } -            self.report_extraction(info['id'])              baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'              xmlUrl = baseUrl + video + '.xml'              mdoc = self._download_xml(xmlUrl, info['id']) @@ -43,63 +43,49 @@ class StanfordOpenClassroomIE(InfoExtractor):                  info['title'] = mdoc.findall('./title')[0].text                  info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text              except IndexError: -                raise ExtractorError(u'Invalid metadata XML file') -            info['ext'] = info['url'].rpartition('.')[2] -            return [info] -        elif mobj.group('course'): # A course page +                raise ExtractorError('Invalid metadata XML file') +            return info +        elif mobj.group('course'):  # A course page              course = mobj.group('course')              info = {                  'id': course, -                'type': 'playlist', +                '_type': 'playlist',                  'uploader': None,                  'upload_date': None,              } -            coursepage = self._download_webpage(url, info['id'], -                                        note='Downloading course info page', -                                        errnote='Unable to download course info page') +            coursepage = self._download_webpage( +                url, 
info['id'], +                note='Downloading course info page', +                errnote='Unable to download course info page') -            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) +            info['title'] = self._html_search_regex( +                r'<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) -            info['description'] = self._html_search_regex('<description>([^<]+)</description>', -                coursepage, u'description', fatal=False) +            info['description'] = self._html_search_regex( +                r'(?s)<description>([^<]+)</description>', +                coursepage, 'description', fatal=False)              links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) -            info['list'] = [ -                { -                    'type': 'reference', -                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage), -                } -                    for vpage in links] -            results = [] -            for entry in info['list']: -                assert entry['type'] == 'reference' -                results += self.extract(entry['url']) -            return results -        else: # Root page +            info['entries'] = [self.url_result( +                'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l) +            ) for l in links] +            return info +        else:  # Root page              info = {                  'id': 'Stanford OpenClassroom', -                'type': 'playlist', +                '_type': 'playlist',                  'uploader': None,                  'upload_date': None,              } +            info['title'] = info['id']              rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'              rootpage = self._download_webpage(rootURL, info['id'], -                errnote=u'Unable to download course info page') - -        
    info['title'] = info['id'] +                errnote='Unable to download course info page')              links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage)) -            info['list'] = [ -                { -                    'type': 'reference', -                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage), -                } -                    for cpage in links] - -            results = [] -            for entry in info['list']: -                assert entry['type'] == 'reference' -                results += self.extract(entry['url']) -            return results +            info['entries'] = [self.url_result( +                'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l) +            ) for l in links] +            return info diff --git a/youtube_dl/extractor/sztvhu.py b/youtube_dl/extractor/sztvhu.py index c9359fafb..aa5964acb 100644 --- a/youtube_dl/extractor/sztvhu.py +++ b/youtube_dl/extractor/sztvhu.py @@ -1,27 +1,24 @@  # -*- coding: utf-8 -*- - -import re +from __future__ import unicode_literals  from .common import InfoExtractor -from ..utils import determine_ext  class SztvHuIE(InfoExtractor): -    _VALID_URL = r'(?:http://)?(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)' +    _VALID_URL = r'http://(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)'      _TEST = { -        u'url': u'http://sztv.hu/hirek/cserkeszek-nepszerusitettek-a-kornyezettudatos-eletmodot-a-savaria-teren-20130909', -        u'file': u'20130909.mp4', -        u'md5': u'a6df607b11fb07d0e9f2ad94613375cb', -        u'info_dict': { -            u"title": u"Cserkészek népszerűsítették a környezettudatos életmódot a Savaria téren", -            u"description": u'A zöld nap játékos ismeretterjesztő programjait a Magyar Cserkész Szövetség szervezte, akik az ország nyolc városában adják át tudásukat az érdeklődőknek. 
A PET...', +        'url': 'http://sztv.hu/hirek/cserkeszek-nepszerusitettek-a-kornyezettudatos-eletmodot-a-savaria-teren-20130909', +        'md5': 'a6df607b11fb07d0e9f2ad94613375cb', +        'info_dict': { +            'id': '20130909', +            'ext': 'mp4', +            'title': 'Cserkészek népszerűsítették a környezettudatos életmódot a Savaria téren', +            'description': 'A zöld nap játékos ismeretterjesztő programjait a Magyar Cserkész Szövetség szervezte, akik az ország nyolc városában adják át tudásukat az érdeklődőknek. A PET...',          }, -        u'skip': u'Service temporarily disabled as of 2013-11-20'      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id)          video_file = self._search_regex(              r'file: "...:(.*?)",', webpage, 'video file') @@ -39,7 +36,6 @@ class SztvHuIE(InfoExtractor):              'id': video_id,              'url': video_url,              'title': title, -            'ext': determine_ext(video_url),              'description': description,              'thumbnail': thumbnail,          } diff --git a/youtube_dl/extractor/telebruxelles.py b/youtube_dl/extractor/telebruxelles.py new file mode 100644 index 000000000..a3d05f97d --- /dev/null +++ b/youtube_dl/extractor/telebruxelles.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class TeleBruxellesIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?telebruxelles\.be/(news|sport|dernier-jt)/?(?P<id>[^/#?]+)' +    _TESTS = [{ +        'url': 'http://www.telebruxelles.be/news/auditions-devant-parlement-francken-galant-tres-attendus/', +        'md5': '59439e568c9ee42fb77588b2096b214f', +        'info_dict': { +            'id': '11942', +            'display_id': 
'auditions-devant-parlement-francken-galant-tres-attendus', +            'ext': 'flv', +            'title': 'Parlement : Francken et Galant répondent aux interpellations de l’opposition', +            'description': 're:Les auditions des ministres se poursuivent*' +        }, +        'params': { +            'skip_download': 'requires rtmpdump' +        }, +    }, { +        'url': 'http://www.telebruxelles.be/sport/basket-brussels-bat-mons-80-74/', +        'md5': '181d3fbdcf20b909309e5aef5c6c6047', +        'info_dict': { +            'id': '10091', +            'display_id': 'basket-brussels-bat-mons-80-74', +            'ext': 'flv', +            'title': 'Basket : le Brussels bat Mons 80-74', +            'description': 're:^Ils l\u2019on fait ! En basket, le B*', +        }, +        'params': { +            'skip_download': 'requires rtmpdump' +        }, +    }] + +    def _real_extract(self, url): +        display_id = self._match_id(url) +        webpage = self._download_webpage(url, display_id) + +        article_id = self._html_search_regex( +            r"<article id=\"post-(\d+)\"", webpage, 'article ID') +        title = self._html_search_regex( +            r'<h1 class=\"entry-title\">(.*?)</h1>', webpage, 'title') +        description = self._og_search_description(webpage) + +        rtmp_url = self._html_search_regex( +            r"file: \"(rtmp://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}/vod/mp4:\" \+ \"\w+\" \+ \".mp4)\"", +            webpage, 'RTMP url') +        rtmp_url = rtmp_url.replace("\" + \"", "") + +        return { +            'id': article_id, +            'display_id': display_id, +            'title': title, +            'description': description, +            'url': rtmp_url, +            'ext': 'flv', +            'rtmp_live': True  # if rtmpdump is not called with "--live" argument, the download is blocked and cannot be completed +        } diff --git a/youtube_dl/extractor/vh1.py b/youtube_dl/extractor/vh1.py index 
2f77e3898..6be3774b7 100644 --- a/youtube_dl/extractor/vh1.py +++ b/youtube_dl/extractor/vh1.py @@ -121,4 +121,7 @@ class VH1IE(MTVIE):          idoc = self._download_xml(              doc_url, video_id,              'Downloading info', transform_source=fix_xml_ampersands) -        return [self._get_video_info(item) for item in idoc.findall('.//item')] +        return self.playlist_result( +            [self._get_video_info(item) for item in idoc.findall('.//item')], +            playlist_id=video_id, +        ) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 08f63be96..0cb837afc 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -307,6 +307,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},          '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50}, +        # Dash webm audio with opus inside +        '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50}, +        '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50}, +        '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50}, +          # RTMP (unnamed)          '_rtmp': {'protocol': 'rtmp'},      } @@ -401,6 +406,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                  'format': '141',              },          }, +        # Controversy video +        { +            'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8', +            'info_dict': { +                'id': 'T4XJQO3qol8', +                'ext': 'mp4', +                'upload_date': '20100909', +                'uploader': 'The Amazing Atheist', +                
'uploader_id': 'TheAmazingAtheist', +                'title': 'Burning Everyone\'s Koran', +                'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', +            } +        }      ]      def __init__(self, *args, **kwargs): @@ -661,7 +679,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          video_id = self.extract_id(url)          # Get video webpage -        url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id +        url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id          pref_cookies = [              c for c in self._downloader.cookiejar              if c.domain == '.youtube.com' and c.name == 'PREF'] diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index c40cd376d..b4617fbad 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -61,7 +61,7 @@ class JSInterpreter(object):              pass          m = re.match( -            r'^(?P<var>[a-zA-Z0-9_]+)\.(?P<member>[^(]+)(?:\(+(?P<args>[^()]*)\))?$', +            r'^(?P<var>[$a-zA-Z0-9_]+)\.(?P<member>[^(]+)(?:\(+(?P<args>[^()]*)\))?$',              expr)          if m:              variable = m.group('var') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 50e515a04..5be7cf992 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -71,9 +71,10 @@ def preferredencoding():  def write_json_file(obj, fn): -    """ Encode obj as JSON and write it to fn, atomically """ +    """ Encode obj as JSON and write it to fn, atomically if possible """ -    if sys.version_info < (3, 0): +    fn = encodeFilename(fn) +    if sys.version_info < (3, 0) and sys.platform != 'win32':          encoding = get_filesystem_encoding()          # os.path.basename returns a bytes object, but NamedTemporaryFile          # 
will fail if the filename contains non ascii characters unless we @@ -107,6 +108,13 @@ def write_json_file(obj, fn):      try:          with tf:              json.dump(obj, tf) +        if sys.platform == 'win32': +            # Need to remove existing file on Windows, else os.rename raises +            # WindowsError or FileExistsError. +            try: +                os.unlink(fn) +            except OSError: +                pass          os.rename(tf.name, fn)      except:          try: @@ -412,6 +420,7 @@ def make_HTTPS_handler(opts_no_check_certificate, **kwargs):              pass  # Python < 3.4          return compat_urllib_request.HTTPSHandler(context=context, **kwargs) +  class ExtractorError(Exception):      """Error during info extraction."""      def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None): @@ -426,7 +435,13 @@ class ExtractorError(Exception):          if cause:              msg += ' (caused by %r)' % cause          if not expected: -            msg = msg + '; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.' +            if ytdl_is_updateable(): +                update_cmd = 'type  youtube-dl -U  to update' +            else: +                update_cmd = 'see  https://yt-dl.org/update  on how to update' +            msg += '; please report this issue on https://yt-dl.org/bug .' +            msg += ' Make sure you are using the latest version; %s.' % update_cmd +            msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'          
super(ExtractorError, self).__init__(msg)          self.traceback = tb @@ -1411,3 +1426,10 @@ def is_outdated_version(version, limit, assume_new=True):          return version_tuple(version) < version_tuple(limit)      except ValueError:          return not assume_new + + +def ytdl_is_updateable(): +    """ Returns if youtube-dl can be updated with -U """ +    from zipimport import zipimporter + +    return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen') diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 34bf665ad..6be5d07c4 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.11.16' +__version__ = '2014.11.23' | 
