-rwxr-xr-x  youtube_dl/YoutubeDL.py                   |   5
-rw-r--r--  youtube_dl/compat.py                      |   4
-rw-r--r--  youtube_dl/extractor/__init__.py          |   1
-rw-r--r--  youtube_dl/extractor/arte.py              | 114
-rw-r--r--  youtube_dl/extractor/bliptv.py            |  14
-rw-r--r--  youtube_dl/extractor/brightcove.py        |   4
-rw-r--r--  youtube_dl/extractor/comedycentral.py     |   5
-rw-r--r--  youtube_dl/extractor/common.py            |  44
-rw-r--r--  youtube_dl/extractor/folketinget.py       |  75
-rw-r--r--  youtube_dl/extractor/generic.py           |   2
-rw-r--r--  youtube_dl/extractor/goldenmoustache.py   |   2
-rw-r--r--  youtube_dl/extractor/mtv.py               |   3
-rw-r--r--  youtube_dl/extractor/rtlnl.py             |   8
-rw-r--r--  youtube_dl/extractor/stanfordoc.py        |  88
-rw-r--r--  youtube_dl/extractor/sztvhu.py            |  24
-rw-r--r--  youtube_dl/extractor/vh1.py               |   5
-rw-r--r--  youtube_dl/jsinterp.py                    |   2
-rw-r--r--  youtube_dl/utils.py                       |  27
-rw-r--r--  youtube_dl/version.py                     |   2
19 files changed, 270 insertions, 159 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index 94c50903c..fde026fbf 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -624,7 +624,7 @@ class YoutubeDL(object):
             return self.process_ie_result(
                 new_result, download=download, extra_info=extra_info)
-        elif result_type == 'playlist':
+        elif result_type == 'playlist' or result_type == 'multi_video':
             # We process each entry in the playlist
             playlist = ie_result.get('title', None) or ie_result.get('id', None)
             self.to_screen('[download] Downloading playlist: %s' % playlist)
@@ -679,6 +679,9 @@ class YoutubeDL(object):
             ie_result['entries'] = playlist_results
             return ie_result
         elif result_type == 'compat_list':
+            self.report_warning(
+                'Extractor %s returned a compat_list result. '
+                'It needs to be updated.' % ie_result.get('extractor'))
             def _fixup(r):
                 self.add_extra_info(r,
                     {
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
index 64a975489..9d33a8ec5 100644
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -302,8 +302,10 @@ else:
 
 # Fix https://github.com/rg3/youtube-dl/issues/4223
 # See http://bugs.python.org/issue9161 for what is broken
 def workaround_optparse_bug9161():
+    op = optparse.OptionParser()
+    og = optparse.OptionGroup(op, 'foo')
     try:
-        optparse.OptionGroup('foo').add_option('-t')
+        og.add_option('-t')
     except TypeError:
         real_add_option = optparse.OptionGroup.add_option
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 7275d247a..7497a97f5 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -115,6 +115,7 @@ from .fktv import (
     FKTVPosteckeIE,
 )
 from .flickr import FlickrIE
+from .folketinget import FolketingetIE
 from .fourtube import FourTubeIE
 from .franceculture import FranceCultureIE
 from .franceinter import FranceInterIE
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index b9a9440c0..3a57ce527 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -5,13 +5,12 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
-    ExtractorError,
     find_xpath_attr,
     unified_strdate,
-    determine_ext,
     get_element_by_id,
     get_element_by_attribute,
     int_or_none,
+    qualities,
 )
 
 # There are different sources of video in arte.tv, the extraction process
@@ -102,79 +101,54 @@ class ArteTVPlus7IE(InfoExtractor):
             'upload_date': unified_strdate(upload_date_str),
             'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
         }
+        qfunc = qualities(['HQ', 'MQ', 'EQ', 'SQ'])
 
-        all_formats = []
+        formats = []
         for format_id, format_dict in player_info['VSR'].items():
-            fmt = dict(format_dict)
-            fmt['format_id'] = format_id
-            all_formats.append(fmt)
-        # Some formats use the m3u8 protocol
-        all_formats = list(filter(lambda f: f.get('videoFormat') != 'M3U8', all_formats))
-
-        def _match_lang(f):
-            if f.get('versionCode') is None:
-                return True
-            # Return true if that format is in the language of the url
-            if lang == 'fr':
-                l = 'F'
-            elif lang == 'de':
-                l = 'A'
-            else:
-                l = lang
-            regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l]
-            return any(re.match(r, f['versionCode']) for r in regexes)
-        # Some formats may not be in the same language as the url
-        # TODO: Might want not to drop videos that does not match requested language
-        # but to process those formats with lower precedence
-        formats = filter(_match_lang, all_formats)
-        formats = list(formats)  # in python3 filter returns an iterator
-        if not formats:
-            # Some videos are only available in the 'Originalversion'
-            # they aren't tagged as being in French or German
-            # Sometimes there are neither videos of requested lang code
-            # nor original version videos available
-            # For such cases we just take all_formats as is
-            formats = all_formats
-            if not formats:
-                raise ExtractorError('The formats list is empty')
-
-        if re.match(r'[A-Z]Q', formats[0]['quality']) is not None:
-            def sort_key(f):
-                return ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality'])
-        else:
-            def sort_key(f):
-                versionCode = f.get('versionCode')
-                if versionCode is None:
-                    versionCode = ''
-                return (
-                    # Sort first by quality
-                    int(f.get('height', -1)),
-                    int(f.get('bitrate', -1)),
-                    # The original version with subtitles has lower relevance
-                    re.match(r'VO-ST(F|A)', versionCode) is None,
-                    # The version with sourds/mal subtitles has also lower relevance
-                    re.match(r'VO?(F|A)-STM\1', versionCode) is None,
-                    # Prefer http downloads over m3u8
-                    0 if f['url'].endswith('m3u8') else 1,
-                )
-        formats = sorted(formats, key=sort_key)
-
-        def _format(format_info):
-            info = {
-                'format_id': format_info['format_id'],
-                'format_note': '%s, %s' % (format_info.get('versionCode'), format_info.get('versionLibelle')),
-                'width': int_or_none(format_info.get('width')),
-                'height': int_or_none(format_info.get('height')),
-                'tbr': int_or_none(format_info.get('bitrate')),
+            f = dict(format_dict)
+            versionCode = f.get('versionCode')
+
+            langcode = {
+                'fr': 'F',
+                'de': 'A',
+            }.get(lang, lang)
+            lang_rexs = [r'VO?%s' % langcode, r'VO?.-ST%s' % langcode]
+            lang_pref = (
+                None if versionCode is None else (
+                    10 if any(re.match(r, versionCode) for r in lang_rexs)
+                    else -10))
+            source_pref = 0
+            if versionCode is not None:
+                # The original version with subtitles has lower relevance
+                if re.match(r'VO-ST(F|A)', versionCode):
+                    source_pref -= 10
+                # The version with sourds/mal subtitles has also lower relevance
+                elif re.match(r'VO?(F|A)-STM\1', versionCode):
+                    source_pref -= 9
+            format = {
+                'format_id': format_id,
+                'preference': -10 if f.get('videoFormat') == 'M3U8' else None,
+                'language_preference': lang_pref,
+                'format_note': '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')),
+                'width': int_or_none(f.get('width')),
+                'height': int_or_none(f.get('height')),
+                'tbr': int_or_none(f.get('bitrate')),
+                'quality': qfunc(f['quality']),
+                'source_preference': source_pref,
             }
-            if format_info['mediaType'] == 'rtmp':
-                info['url'] = format_info['streamer']
-                info['play_path'] = 'mp4:' + format_info['url']
-                info['ext'] = 'flv'
+
+            if f.get('mediaType') == 'rtmp':
+                format['url'] = f['streamer']
+                format['play_path'] = 'mp4:' + f['url']
+                format['ext'] = 'flv'
             else:
-                info['url'] = format_info['url']
-                info['ext'] = determine_ext(info['url'])
-            return info
-        info_dict['formats'] = [_format(f) for f in formats]
+                format['url'] = f['url']
+
+            formats.append(format)
+
+        self._sort_formats(formats)
+        info_dict['formats'] = formats
 
         return info_dict
diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py
index 2370c24b0..f2b02643d 100644
--- a/youtube_dl/extractor/bliptv.py
+++ b/youtube_dl/extractor/bliptv.py
@@ -166,9 +166,17 @@ class BlipTVIE(SubtitlesInfoExtractor):
 
 
 class BlipTVUserIE(InfoExtractor):
-    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)(?!api\.swf)([^/]+)/*$'
+    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?blip\.tv/)|bliptvuser:)(?!api\.swf)([^/]+)/*$'
     _PAGE_SIZE = 12
     IE_NAME = 'blip.tv:user'
+    _TEST = {
+        'url': 'http://blip.tv/actone',
+        'info_dict': {
+            'id': 'actone',
+            'title': 'Act One: The Series',
+        },
+        'playlist_count': 5,
+    }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -179,6 +187,7 @@ class BlipTVUserIE(InfoExtractor):
         page = self._download_webpage(url, username, 'Downloading user page')
         mobj = re.search(r'data-users-id="([^"]+)"', page)
         page_base = page_base % mobj.group(1)
+        title = self._og_search_title(page)
 
         # Download video ids using BlipTV Ajax calls. Result size per
         # query is limited (currently to 12 videos) so we need to query
@@ -215,4 +224,5 @@ class BlipTVUserIE(InfoExtractor):
 
         urls = ['http://blip.tv/%s' % video_id for video_id in video_ids]
         url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls]
-        return [self.playlist_result(url_entries, playlist_title=username)]
+        return self.playlist_result(
+            url_entries, playlist_title=title, playlist_id=username)
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index a6920685e..2db7f9fef 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -111,6 +111,8 @@ class BrightcoveIE(InfoExtractor):
                             lambda m: m.group(1) + '/>', object_str)
         # Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608
         object_str = object_str.replace('<--', '<!--')
+        # remove namespace to simplify extraction
+        object_str = re.sub(r'(<object[^>]*)(xmlns=".*?")', r'\1', object_str)
         object_str = fix_xml_ampersands(object_str)
 
         object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8'))
@@ -219,7 +221,7 @@ class BrightcoveIE(InfoExtractor):
         webpage = self._download_webpage(req, video_id)
 
         error_msg = self._html_search_regex(
-            r"<h1>We're sorry.</h1>\s*<p>(.*?)</p>", webpage,
+            r"<h1>We're sorry.</h1>([\s\n]*<p>.*?</p>)+", webpage,
             'error message', default=None)
         if error_msg is not None:
             raise ExtractorError(
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py
index 49b978b4e..2e3ef3fda 100644
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -2,7 +2,6 @@ from __future__ import unicode_literals
 
 import re
 
-from .common import InfoExtractor
 from .mtv import MTVServicesInfoExtractor
 from ..utils import (
     compat_str,
@@ -110,9 +109,7 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor):
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
-        if mobj is None:
-            raise ExtractorError('Invalid URL: %s' % url)
+        mobj = re.match(self._VALID_URL, url)
 
         if mobj.group('shortname'):
             if mobj.group('shortname') in ('tds', 'thedailyshow'):
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index b77f0e519..93a5a3d57 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -43,7 +43,11 @@ class InfoExtractor(object):
     information possibly downloading the video to the file system, among
     other possible outcomes.
 
-    The dictionaries must include the following fields:
+    The type field determines the the type of the result.
+    By far the most common value (and the default if _type is missing) is
+    "video", which indicates a single video.
+
+    For a video, the dictionaries must include the following fields:
 
     id:             Video identifier.
     title:          Video title, unescaped.
@@ -87,6 +91,11 @@ class InfoExtractor(object):
                                  by this field, regardless of all other values.
                                  -1 for default (order by other properties),
                                  -2 or smaller for less than default.
+                    * language_preference  Is this in the correct requested
+                                 language?
+                                 10 if it's what the URL is about,
+                                 -1 for default (don't know),
+                                 -10 otherwise, other values reserved for now.
                     * quality    Order number of the video quality of this
                                  format, irrespective of the file format.
                                  -1 for default (order by other properties),
@@ -146,6 +155,38 @@ class InfoExtractor(object):
 
     Unless mentioned otherwise, None is equivalent to absence of information.
 
+
+    _type "playlist" indicates multiple videos.
+    There must be a key "entries", which is a list or a PagedList object, each
+    element of which is a valid dictionary under this specfication.
+
+    Additionally, playlists can have "title" and "id" attributes with the same
+    semantics as videos (see above).
+
+
+    _type "multi_video" indicates that there are multiple videos that
+    form a single show, for examples multiple acts of an opera or TV episode.
+    It must have an entries key like a playlist and contain all the keys
+    required for a video at the same time.
+
+
+    _type "url" indicates that the video must be extracted from another
+    location, possibly by a different extractor. Its only required key is:
+    "url" - the next URL to extract.
+
+    Additionally, it may have properties believed to be identical to the
+    resolved entity, for example "title" if the title of the referred video is
+    known ahead of time.
+
+
+    _type "url_transparent" entities have the same specification as "url", but
+    indicate that the given additional information is more precise than the one
+    associated with the resolved URL.
+    This is useful when a site employs a video service that hosts the video and
+    its technical metadata, but that video service does not embed a useful
+    title, description etc.
+
+
     Subclasses of this one should re-define the _real_initialize() and
     _real_extract() methods and define a _VALID_URL regexp.
     Probably, they should also be added to the list of extractors.
@@ -615,6 +656,7 @@ class InfoExtractor(object):
 
             return (
                 preference,
+                f.get('language_preference') if f.get('language_preference') is not None else -1,
                 f.get('quality') if f.get('quality') is not None else -1,
                 f.get('height') if f.get('height') is not None else -1,
                 f.get('width') if f.get('width') is not None else -1,
diff --git a/youtube_dl/extractor/folketinget.py b/youtube_dl/extractor/folketinget.py
new file mode 100644
index 000000000..68e2db943
--- /dev/null
+++ b/youtube_dl/extractor/folketinget.py
@@ -0,0 +1,75 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_parse_qs
+from ..utils import (
+    int_or_none,
+    parse_duration,
+    parse_iso8601,
+    xpath_text,
+)
+
+
+class FolketingetIE(InfoExtractor):
+    IE_DESC = 'Folketinget (ft.dk; Danish parliament)'
+    _VALID_URL = r'https?://(?:www\.)?ft\.dk/webtv/video/[^?#]*?\.(?P<id>[0-9]+)\.aspx'
+    _TEST = {
+        'url': 'http://www.ft.dk/webtv/video/20141/eru/td.1165642.aspx?as=1#player',
+        'info_dict': {
+            'id': '1165642',
+            'ext': 'mp4',
+            'title': 'Åbent samråd i Erhvervsudvalget',
+            'description': 'Åbent samråd med erhvervs- og vækstministeren om regeringens politik på teleområdet',
+            'view_count': int,
+            'width': 768,
+            'height': 432,
+            'tbr': 928000,
+            'timestamp': 1416493800,
+            'upload_date': '20141120',
+            'duration': 3960,
+        },
+        'params': {
+            'skip_download': 'rtmpdump required',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._og_search_title(webpage)
+        description = self._html_search_regex(
+            r'(?s)<div class="video-item-agenda"[^>]*>(.*?)<',
+            webpage, 'description', fatal=False)
+
+        player_params = compat_parse_qs(self._search_regex(
+            r'<embed src="http://ft\.arkena\.tv/flash/ftplayer\.swf\?([^"]+)"',
+            webpage, 'player params'))
+        xml_url = player_params['xml'][0]
+        doc = self._download_xml(xml_url, video_id)
+
+        timestamp = parse_iso8601(xpath_text(doc, './/date'))
+        duration = parse_duration(xpath_text(doc, './/duration'))
+        width = int_or_none(xpath_text(doc, './/width'))
+        height = int_or_none(xpath_text(doc, './/height'))
+        view_count = int_or_none(xpath_text(doc, './/views'))
+
+        formats = [{
+            'format_id': n.attrib['bitrate'],
+            'url': xpath_text(n, './url', fatal=True),
+            'tbr': int_or_none(n.attrib['bitrate']),
+        } for n in doc.findall('.//streams/stream')]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'description': description,
+            'timestamp': timestamp,
+            'width': width,
+            'height': height,
+            'duration': duration,
+            'view_count': view_count,
+        }
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index af769ab61..c7a824c29 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -979,7 +979,7 @@ class GenericIE(InfoExtractor):
                 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
         if not found:
             # HTML5 video
-            found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src="([^"]+)"', webpage)
+            found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
         if not found:
             found = re.search(
                 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
diff --git a/youtube_dl/extractor/goldenmoustache.py b/youtube_dl/extractor/goldenmoustache.py
index 7e13b131b..10001d4d9 100644
--- a/youtube_dl/extractor/goldenmoustache.py
+++ b/youtube_dl/extractor/goldenmoustache.py
@@ -1,9 +1,7 @@
 from __future__ import unicode_literals
 
-import re
 
 from .common import InfoExtractor
 from ..utils import (
-    parse_duration,
     int_or_none,
 )
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index 474bdff7d..5f0f476b6 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -145,7 +145,8 @@ class MTVServicesInfoExtractor(InfoExtractor):
         idoc = self._download_xml(
             feed_url + '?' + data, video_id,
             'Downloading info', transform_source=fix_xml_ampersands)
-        return [self._get_video_info(item) for item in idoc.findall('.//item')]
+        return self.playlist_result(
+            [self._get_video_info(item) for item in idoc.findall('.//item')])
 
     def _real_extract(self, url):
         title = url_basename(url)
diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py
index 5daef2fc5..4a188e5d4 100644
--- a/youtube_dl/extractor/rtlnl.py
+++ b/youtube_dl/extractor/rtlnl.py
@@ -28,9 +28,8 @@ class RtlXlIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         uuid = mobj.group('uuid')
 
-        # Use m3u8 streams (see https://github.com/rg3/youtube-dl/issues/4118)
         info = self._download_json(
-            'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/d=pc/fmt=adaptive/' % uuid,
+            'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=flash/' % uuid,
             uuid)
 
         material = info['material'][0]
@@ -39,12 +38,13 @@ class RtlXlIE(InfoExtractor):
         progname = info['abstracts'][0]['name']
         subtitle = material['title'] or info['episodes'][0]['name']
 
-        videopath = material['videopath']
+        # Use unencrypted m3u8 streams (See https://github.com/rg3/youtube-dl/issues/4118)
+        videopath = material['videopath'].replace('.f4m', '.m3u8')
         m3u8_url = 'http://manifest.us.rtl.nl' + videopath
 
         formats = self._extract_m3u8_formats(m3u8_url, uuid, ext='mp4')
 
-        video_urlpart = videopath.split('/adaptive/')[1][:-4]
+        video_urlpart = videopath.split('/flash/')[1][:-4]
         PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4'
 
         formats.extend([
diff --git a/youtube_dl/extractor/stanfordoc.py b/youtube_dl/extractor/stanfordoc.py
index 44c52c718..5feb4ff83 100644
--- a/youtube_dl/extractor/stanfordoc.py
+++ b/youtube_dl/extractor/stanfordoc.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
@@ -9,24 +11,23 @@ from ..utils import (
 
 
 class StanfordOpenClassroomIE(InfoExtractor):
-    IE_NAME = u'stanfordoc'
-    IE_DESC = u'Stanford Open ClassRoom'
-    _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
+    IE_NAME = 'stanfordoc'
+    IE_DESC = 'Stanford Open ClassRoom'
+    _VALID_URL = r'https?://openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
     _TEST = {
-        u'url': u'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100',
-        u'file': u'PracticalUnix_intro-environment.mp4',
-        u'md5': u'544a9468546059d4e80d76265b0443b8',
-        u'info_dict': {
-            u"title": u"Intro Environment"
+        'url': 'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100',
+        'md5': '544a9468546059d4e80d76265b0443b8',
+        'info_dict': {
+            'id': 'PracticalUnix_intro-environment',
+            'ext': 'mp4',
+            'title': 'Intro Environment',
         }
     }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
 
-        if mobj.group('course') and mobj.group('video'): # A specific video
+        if mobj.group('course') and mobj.group('video'):  # A specific video
             course = mobj.group('course')
             video = mobj.group('video')
             info = {
@@ -35,7 +36,6 @@ class StanfordOpenClassroomIE(InfoExtractor):
                 'upload_date': None,
             }
 
-            self.report_extraction(info['id'])
             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
             xmlUrl = baseUrl + video + '.xml'
             mdoc = self._download_xml(xmlUrl, info['id'])
@@ -43,63 +43,49 @@
                 info['title'] = mdoc.findall('./title')[0].text
                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
             except IndexError:
-                raise ExtractorError(u'Invalid metadata XML file')
-            info['ext'] = info['url'].rpartition('.')[2]
-            return [info]
-        elif mobj.group('course'): # A course page
+                raise ExtractorError('Invalid metadata XML file')
+            return info
+        elif mobj.group('course'):  # A course page
             course = mobj.group('course')
             info = {
                 'id': course,
-                'type': 'playlist',
+                '_type': 'playlist',
                 'uploader': None,
                 'upload_date': None,
            }
 
-            coursepage = self._download_webpage(url, info['id'],
-                                        note='Downloading course info page',
-                                        errnote='Unable to download course info page')
+            coursepage = self._download_webpage(
+                url, info['id'],
+                note='Downloading course info page',
+                errnote='Unable to download course info page')
 
-            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
+            info['title'] = self._html_search_regex(
+                r'<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
 
-            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
-                coursepage, u'description', fatal=False)
+            info['description'] = self._html_search_regex(
+                r'(?s)<description>([^<]+)</description>',
+                coursepage, 'description', fatal=False)
 
             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
-            info['list'] = [
-                {
-                    'type': 'reference',
-                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
-                }
-                    for vpage in links]
-            results = []
-            for entry in info['list']:
-                assert entry['type'] == 'reference'
-                results += self.extract(entry['url'])
-            return results
-        else: # Root page
+            info['entries'] = [self.url_result(
+                'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l)
+            ) for l in links]
+            return info
+        else:  # Root page
             info = {
                 'id': 'Stanford OpenClassroom',
-                'type': 'playlist',
+                '_type': 'playlist',
                 'uploader': None,
                 'upload_date': None,
             }
+            info['title'] = info['id']
 
             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
             rootpage = self._download_webpage(rootURL, info['id'],
-                errnote=u'Unable to download course info page')
-
-            info['title'] = info['id']
+                errnote='Unable to download course info page')
 
             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
-            info['list'] = [
-                {
-                    'type': 'reference',
-                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
-                }
-                    for cpage in links]
-
-            results = []
-            for entry in info['list']:
-                assert entry['type'] == 'reference'
-                results += self.extract(entry['url'])
-            return results
+            info['entries'] = [self.url_result(
+                'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l)
+            ) for l in links]
+            return info
diff --git a/youtube_dl/extractor/sztvhu.py b/youtube_dl/extractor/sztvhu.py
index c9359fafb..aa5964acb 100644
--- a/youtube_dl/extractor/sztvhu.py
+++ b/youtube_dl/extractor/sztvhu.py
@@ -1,27 +1,24 @@
 # -*- coding: utf-8 -*-
-
-import re
+from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import determine_ext
 
 
 class SztvHuIE(InfoExtractor):
-    _VALID_URL = r'(?:http://)?(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)'
+    _VALID_URL = r'http://(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)'
     _TEST = {
-        u'url': u'http://sztv.hu/hirek/cserkeszek-nepszerusitettek-a-kornyezettudatos-eletmodot-a-savaria-teren-20130909',
-        u'file': u'20130909.mp4',
-        u'md5': u'a6df607b11fb07d0e9f2ad94613375cb',
-        u'info_dict': {
-            u"title": u"Cserkészek népszerűsítették a környezettudatos életmódot a Savaria téren",
-            u"description": u'A zöld nap játékos ismeretterjesztő programjait a Magyar Cserkész Szövetség szervezte, akik az ország nyolc városában adják át tudásukat az érdeklődőknek. A PET...',
+        'url': 'http://sztv.hu/hirek/cserkeszek-nepszerusitettek-a-kornyezettudatos-eletmodot-a-savaria-teren-20130909',
+        'md5': 'a6df607b11fb07d0e9f2ad94613375cb',
+        'info_dict': {
+            'id': '20130909',
+            'ext': 'mp4',
+            'title': 'Cserkészek népszerűsítették a környezettudatos életmódot a Savaria téren',
+            'description': 'A zöld nap játékos ismeretterjesztő programjait a Magyar Cserkész Szövetség szervezte, akik az ország nyolc városában adják át tudásukat az érdeklődőknek. A PET...',
         },
-        u'skip': u'Service temporarily disabled as of 2013-11-20'
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
         video_file = self._search_regex(
             r'file: "...:(.*?)",', webpage, 'video file')
@@ -39,7 +36,6 @@ class SztvHuIE(InfoExtractor):
             'id': video_id,
             'url': video_url,
             'title': title,
-            'ext': determine_ext(video_url),
             'description': description,
             'thumbnail': thumbnail,
         }
diff --git a/youtube_dl/extractor/vh1.py b/youtube_dl/extractor/vh1.py
index 2f77e3898..6be3774b7 100644
--- a/youtube_dl/extractor/vh1.py
+++ b/youtube_dl/extractor/vh1.py
@@ -121,4 +121,7 @@ class VH1IE(MTVIE):
         idoc = self._download_xml(
             doc_url, video_id,
             'Downloading info', transform_source=fix_xml_ampersands)
-        return [self._get_video_info(item) for item in idoc.findall('.//item')]
+        return self.playlist_result(
+            [self._get_video_info(item) for item in idoc.findall('.//item')],
+            playlist_id=video_id,
+        )
diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py
index c40cd376d..b4617fbad 100644
--- a/youtube_dl/jsinterp.py
+++ b/youtube_dl/jsinterp.py
@@ -61,7 +61,7 @@ class JSInterpreter(object):
             pass
 
         m = re.match(
-            r'^(?P<var>[a-zA-Z0-9_]+)\.(?P<member>[^(]+)(?:\(+(?P<args>[^()]*)\))?$',
+            r'^(?P<var>[$a-zA-Z0-9_]+)\.(?P<member>[^(]+)(?:\(+(?P<args>[^()]*)\))?$',
             expr)
         if m:
             variable = m.group('var')
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 94b496dd0..5be7cf992 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -71,10 +71,10 @@ def preferredencoding():
 
 
 def write_json_file(obj, fn):
-    """ Encode obj as JSON and write it to fn, atomically """
+    """ Encode obj as JSON and write it to fn, atomically if possible """
     fn = encodeFilename(fn)
-    if sys.version_info < (3, 0):
+    if sys.version_info < (3, 0) and sys.platform != 'win32':
         encoding = get_filesystem_encoding()
         # os.path.basename returns a bytes object, but NamedTemporaryFile
         # will fail if the filename contains non ascii characters unless we
@@ -108,6 +108,13 @@ def write_json_file(obj, fn):
     try:
         with tf:
             json.dump(obj, tf)
+        if sys.platform == 'win32':
+            # Need to remove existing file on Windows, else os.rename raises
+            # WindowsError or FileExistsError.
+            try:
+                os.unlink(fn)
+            except OSError:
+                pass
         os.rename(tf.name, fn)
     except:
         try:
@@ -413,6 +420,7 @@ def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
             pass  # Python < 3.4
         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 
+
 class ExtractorError(Exception):
     """Error during info extraction."""
     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
@@ -427,7 +435,13 @@ class ExtractorError(Exception):
         if cause:
             msg += ' (caused by %r)' % cause
         if not expected:
-            msg = msg + '; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
+            if ytdl_is_updateable():
+                update_cmd = 'type  youtube-dl -U  to update'
+            else:
+                update_cmd = 'see  https://yt-dl.org/update  on how to update'
+            msg += '; please report this issue on https://yt-dl.org/bug .'
+            msg += ' Make sure you are using the latest version; %s.' % update_cmd
+            msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
         super(ExtractorError, self).__init__(msg)
         self.traceback = tb
@@ -1412,3 +1426,10 @@ def is_outdated_version(version, limit, assume_new=True):
         return version_tuple(version) < version_tuple(limit)
     except ValueError:
         return not assume_new
+
+
+def ytdl_is_updateable():
+    """ Returns if youtube-dl can be updated with -U """
+    from zipimport import zipimporter
+
+    return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 34bf665ad..a283afbe3 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@
 
-__version__ = '2014.11.16'
+__version__ = '2014.11.21.1'
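
The docstring added to common.py above defines the result-type contract for extractors: _type defaults to "video", "playlist" and "multi_video" carry an "entries" list, and "url"/"url_transparent" defer extraction to another URL. As a rough illustration of those shapes only, here is a hedged sketch; every id, title and URL in it is an invented placeholder, not something taken from the commit.

# Hypothetical result dictionaries following the _type conventions documented
# in the common.py hunk above. Only the shapes come from the docstring; all
# ids, titles and URLs below are invented placeholders.

def example_video_result():
    # Default _type ("video"): a single downloadable video.
    return {
        'id': 'example-video-1',
        'title': 'A single video',
        'url': 'http://example.com/video.mp4',
    }


def example_multi_video_result():
    # "multi_video": several parts that form one show; after this commit
    # YoutubeDL.process_ie_result() handles it like a playlist.
    return {
        '_type': 'multi_video',
        'id': 'example-show-1',
        'title': 'A show in two acts',
        'entries': [
            {'id': 'example-show-1-act1', 'title': 'Act 1', 'url': 'http://example.com/act1.mp4'},
            {'id': 'example-show-1-act2', 'title': 'Act 2', 'url': 'http://example.com/act2.mp4'},
        ],
    }


def example_url_transparent_result():
    # "url_transparent": defer extraction to another URL, but keep the
    # metadata gathered here as the more precise values.
    return {
        '_type': 'url_transparent',
        'url': 'http://example.com/embedded/42',
        'title': 'Title already known from the referring page',
    }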
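The arte.py rewrite above builds plain format dicts and hands ordering to self._sort_formats(). The sketch below is not youtube-dl code: it mimics only the fragment of the sort key visible in the common.py hunk (preference, then language_preference, then quality, with -1 defaults) plus a stand-in for the qualities() helper, to show how the new preferences trade off. The format ids and the exact key layout are assumptions for illustration.

# A minimal standalone sketch of how language_preference, quality (via a
# qualities()-style helper) and source_preference interact. The real
# _formats_key has more fields; this is an approximation.

def qualities(quality_ids):
    # Stand-in with the behaviour the arte.py hunk relies on: later entries
    # in quality_ids rank higher, unknown ids rank lowest.
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q


def format_sort_key(f):
    # Missing values default to -1, mirroring the common.py hunk.
    return (
        f.get('preference') if f.get('preference') is not None else -1,
        f.get('language_preference') if f.get('language_preference') is not None else -1,
        f.get('quality') if f.get('quality') is not None else -1,
        f.get('source_preference') if f.get('source_preference') is not None else 0,
    )


qfunc = qualities(['HQ', 'MQ', 'EQ', 'SQ'])
formats = [
    {'format_id': 'HQ_VF', 'language_preference': 10, 'quality': qfunc('HQ')},
    {'format_id': 'SQ_VO-STF', 'language_preference': 10, 'quality': qfunc('SQ'),
     'source_preference': -10},
    {'format_id': 'SQ_VA', 'language_preference': -10, 'quality': qfunc('SQ')},
]
# Like _sort_formats(), sort ascending so the preferred format ends up last.
formats.sort(key=format_sort_key)
print([f['format_id'] for f in formats])
# ['SQ_VA', 'HQ_VF', 'SQ_VO-STF']: language_preference outranks quality, and
# quality outranks source_preference under this (simplified) key.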
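The utils.py hunk above makes write_json_file() unlink the target before os.rename() on Windows, which is why its docstring now says "atomically if possible". A minimal sketch of that write-to-temp-then-rename pattern follows; the function name write_json_atomic and the example path are made up, and this is an approximation of the idea rather than the actual youtube-dl implementation.

# Write to a temporary file next to the target, then rename it into place.
# On Windows os.rename() refuses to overwrite an existing file, so the target
# is removed first; on other platforms the rename is atomic.

import json
import os
import sys
import tempfile


def write_json_atomic(obj, fn):
    dirname = os.path.dirname(os.path.abspath(fn))
    tf = tempfile.NamedTemporaryFile(
        suffix='.tmp', prefix=os.path.basename(fn) + '.',
        dir=dirname, mode='w', delete=False)
    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # os.rename() cannot replace an existing file on Windows.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


write_json_atomic({'status': 'ok'}, 'example_output.json')  # illustrative path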
