-rw-r--r--  README.md                              6
-rw-r--r--  test/helper.py                         2
-rwxr-xr-x  youtube_dl/YoutubeDL.py               19
-rw-r--r--  youtube_dl/__init__.py                 9
-rw-r--r--  youtube_dl/extractor/__init__.py       6
-rw-r--r--  youtube_dl/extractor/bild.py          39
-rw-r--r--  youtube_dl/extractor/cinemassacre.py  17
-rw-r--r--  youtube_dl/extractor/common.py         7
-rw-r--r--  youtube_dl/extractor/crunchyroll.py   11
-rw-r--r--  youtube_dl/extractor/francetv.py       2
-rw-r--r--  youtube_dl/extractor/funnyordie.py     2
-rw-r--r--  youtube_dl/extractor/generic.py       17
-rw-r--r--  youtube_dl/extractor/glide.py         40
-rw-r--r--  youtube_dl/extractor/hark.py          48
-rw-r--r--  youtube_dl/extractor/mitele.py        13
-rw-r--r--  youtube_dl/extractor/motherless.py    56
-rw-r--r--  youtube_dl/extractor/pbs.py           20
-rw-r--r--  youtube_dl/extractor/telecinco.py     19
-rw-r--r--  youtube_dl/extractor/viddler.py      108
-rw-r--r--  youtube_dl/extractor/vidzi.py         33
-rw-r--r--  youtube_dl/options.py                  9
-rw-r--r--  youtube_dl/version.py                  2

22 files changed, 373 insertions(+), 112 deletions(-)
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -69,6 +69,8 @@ which means you can modify it, redistribute it or use it however you like.
                                      configuration in ~/.config/youtube-dl.conf
                                      (%APPDATA%/youtube-dl/config.txt on
                                      Windows)
+    --flat-playlist                  Do not extract the videos of a playlist,
+                                     only list them.
 
 ## Video Selection:
     --playlist-start NUMBER          playlist video to start at (default is 1)
@@ -197,6 +199,10 @@ which means you can modify it, redistribute it or use it however you like.
     -j, --dump-json                  simulate, quiet but print JSON information.
                                      See --output for a description of available
                                      keys.
+    -J, --dump-single-json           simulate, quiet but print JSON information
+                                     for each command-line argument. If the URL
+                                     refers to a playlist, dump the whole
+                                     playlist information in a single line.
     --newline                        output progress bar as new lines
     --no-progress                    do not print progress bar
     --console-title                  display progress in console titlebar
diff --git a/test/helper.py b/test/helper.py
index 62cb3ce02..2fa45631a 100644
--- a/test/helper.py
+++ b/test/helper.py
@@ -145,7 +145,7 @@ def expect_info_dict(self, expected_dict, got_dict):
         info_dict_str = ''.join(
             '    %s: %s,\n' % (_repr(k), _repr(v))
             for k, v in test_info_dict.items())
-        write_string('\n"info_dict": {' + info_dict_str + '}\n', out=sys.stderr)
+        write_string('\n"info_dict": {\n' + info_dict_str + '}\n', out=sys.stderr)
         self.assertFalse(
             missing_keys,
             'Missing keys in test definition: %s' % (
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index dec0e20e7..75461f19d 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -107,6 +107,8 @@ class YoutubeDL(object):
     forcefilename:     Force printing final filename.
     forceduration:     Force printing duration.
     forcejson:         Force printing info_dict as JSON.
+    dump_single_json:  Force printing the info_dict of the whole playlist
+                       (or video) as a single JSON line.
     simulate:          Do not download the video files.
     format:            Video format code.
     format_limit:      Highest quality format to try.
@@ -165,6 +167,8 @@ class YoutubeDL(object):
                        'auto' for elaborate guessing
     encoding:          Use this encoding instead of the system-specified.
     extract_flat:      Do not resolve URLs, return the immediate result.
+                       Pass in 'in_playlist' to only show this behavior for
+                       playlist items.
 
     The following parameters are not used by YoutubeDL itself, they are used by
     the FileDownloader:
@@ -568,8 +572,12 @@ class YoutubeDL(object):
 
         result_type = ie_result.get('_type', 'video')
 
-        if self.params.get('extract_flat', False):
-            if result_type in ('url', 'url_transparent'):
+        if result_type in ('url', 'url_transparent'):
+            extract_flat = self.params.get('extract_flat', False)
+            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
+                    extract_flat is True):
+                if self.params.get('forcejson', False):
+                    self.to_stdout(json.dumps(ie_result))
                 return ie_result
 
         if result_type == 'video':
@@ -897,6 +905,8 @@ class YoutubeDL(object):
         if self.params.get('forcejson', False):
             info_dict['_filename'] = filename
             self.to_stdout(json.dumps(info_dict))
+        if self.params.get('dump_single_json', False):
+            info_dict['_filename'] = filename
 
         # Do nothing else if in simulate mode
         if self.params.get('simulate', False):
@@ -1064,12 +1074,15 @@ class YoutubeDL(object):
         for url in url_list:
             try:
                 #It also downloads the videos
-                self.extract_info(url)
+                res = self.extract_info(url)
             except UnavailableVideoError:
                 self.report_error('unable to download video')
             except MaxDownloadsReached:
                 self.to_screen('[info] Maximum number of downloaded files reached.')
                 raise
+            else:
+                if self.params.get('dump_single_json', False):
+                    self.to_stdout(json.dumps(res))
 
         return self._download_retcode
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index 78cdf14df..4f5ce604f 100644
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -79,6 +79,9 @@ __authors__  = (
     'Carlos Ramos',
     '5moufl',
     'lenaten',
+    'Dennis Scheiba',
+    'Damon Timm',
+    'winwon',
     'Xavier Beynon'
 )
 
@@ -256,8 +259,6 @@ def _real_main(argv=None):
         date = DateRange.day(opts.date)
     else:
         date = DateRange(opts.dateafter, opts.datebefore)
-    if opts.default_search not in ('auto', 'auto_warning', 'error', 'fixup_error', None) and ':' not in opts.default_search:
-        parser.error(u'--default-search invalid; did you forget a colon (:) at the end?')
 
     # Do not download videos when there are audio-only formats
     if opts.extractaudio and not opts.keepvideo and opts.format is None:
@@ -285,7 +286,7 @@ def _real_main(argv=None):
                      u' file! Use "{0}.%(ext)s" instead of "{0}" as the output'
                      u' template'.format(outtmpl))
 
-    any_printing = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson
+    any_printing = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json
     download_archive_fn = os.path.expanduser(opts.download_archive) if opts.download_archive is not None else opts.download_archive
 
     ydl_opts = {
@@ -305,6 +306,7 @@ def _real_main(argv=None):
         'forcefilename': opts.getfilename,
         'forceformat': opts.getformat,
         'forcejson': opts.dumpjson,
+        'dump_single_json': opts.dump_single_json,
         'simulate': opts.simulate,
         'skip_download': (opts.skip_download or opts.simulate or any_printing),
         'format': opts.format,
@@ -370,6 +372,7 @@ def _real_main(argv=None):
         'youtube_include_dash_manifest': opts.youtube_include_dash_manifest,
         'encoding': opts.encoding,
         'exec_cmd': opts.exec_cmd,
+        'extract_flat': opts.extract_flat,
     }
 
     with YoutubeDL(ydl_opts) as ydl:
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 29f32cdef..691fef5ca 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -27,6 +27,7 @@ from .bandcamp import BandcampIE, BandcampAlbumIE
 from .bbccouk import BBCCoUkIE
 from .beeg import BeegIE
 from .behindkink import BehindKinkIE
+from .bild import BildIE
 from .bilibili import BiliBiliIE
 from .blinkx import BlinkxIE
 from .bliptv import BlipTVIE, BlipTVUserIE
@@ -135,6 +136,7 @@ from .gamestar import GameStarIE
 from .gametrailers import GametrailersIE
 from .gdcvault import GDCVaultIE
 from .generic import GenericIE
+from .glide import GlideIE
 from .globo import GloboIE
 from .godtube import GodTubeIE
 from .golem import GolemIE
@@ -368,6 +370,7 @@ from .teachingchannel import TeachingChannelIE
 from .teamcoco import TeamcocoIE
 from .techtalks import TechTalksIE
 from .ted import TEDIE
+from .telecinco import TelecincoIE
 from .telemb import TeleMBIE
 from .tenplay import TenPlayIE
 from .testurl import TestURLIE
@@ -422,6 +425,7 @@ from .videopremium import VideoPremiumIE
 from .videott import VideoTtIE
 from .videoweed import VideoWeedIE
 from .vidme import VidmeIE
+from .vidzi import VidziIE
 from .vimeo import (
     VimeoIE,
     VimeoAlbumIE,
@@ -490,10 +494,8 @@ from .youtube import (
     YoutubeUserIE,
     YoutubeWatchLaterIE,
 )
-
 from .zdf import ZDFIE
-
 _ALL_CLASSES = [
     klass
     for name, klass in globals().items()
diff --git a/youtube_dl/extractor/bild.py b/youtube_dl/extractor/bild.py
new file mode 100644
index 000000000..0269d1174
--- /dev/null
+++ b/youtube_dl/extractor/bild.py
@@ -0,0 +1,39 @@
+#coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class BildIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?bild\.de/(?:[^/]+/)+(?P<display_id>[^/]+)-(?P<id>\d+)(?:,auto=true)?\.bild\.html'
+    IE_DESC = 'Bild.de'
+    _TEST = {
+        'url': 'http://www.bild.de/video/clip/apple-ipad-air/das-koennen-die-neuen-ipads-38184146.bild.html',
+        'md5': 'dd495cbd99f2413502a1713a1156ac8a',
+        'info_dict': {
+            'id': '38184146',
+            'ext': 'mp4',
+            'title': 'BILD hat sie getestet',
+            'thumbnail': 'http://bilder.bild.de/fotos/stand-das-koennen-die-neuen-ipads-38184138/Bild/1.bild.jpg',
+            'duration': 196,
+            'description': 'Mit dem iPad Air 2 und dem iPad Mini 3 hat Apple zwei neue Tablet-Modelle präsentiert. BILD-Reporter Sven Stein durfte die Geräte bereits testen. ',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        xml_url = url.split(".bild.html")[0] + ",view=xml.bild.xml"
+        doc = self._download_xml(xml_url, video_id)
+
+        duration = int_or_none(doc.attrib.get('duration'), scale=1000)
+
+        return {
+            'id': video_id,
+            'title': doc.attrib['ueberschrift'],
+            'description': doc.attrib.get('text'),
+            'url': doc.attrib['src'],
+            'thumbnail': doc.attrib.get('img'),
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py
index 496271be4..d064a28f9 100644
--- a/youtube_dl/extractor/cinemassacre.py
+++ b/youtube_dl/extractor/cinemassacre.py
@@ -42,7 +42,7 @@ class CinemassacreIE(InfoExtractor):
         webpage = self._download_webpage(url, display_id)
         video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d')
 
-        mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)
+        mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)
         if not mobj:
             raise ExtractorError('Can\'t extract embed url and video id')
         playerdata_url = mobj.group('embed_url')
@@ -53,17 +53,22 @@ class CinemassacreIE(InfoExtractor):
         video_description = self._html_search_regex(
             r'<div class="entry-content">(?P<description>.+?)</div>',
             webpage, 'description', flags=re.DOTALL, fatal=False)
+        video_thumbnail = self._og_search_thumbnail(webpage)
 
         playerdata = self._download_webpage(playerdata_url, video_id, 'Downloading player webpage')
 
-        video_thumbnail = self._search_regex(
-            r'image: \'(?P<thumbnail>[^\']+)\'', playerdata, 'thumbnail', fatal=False)
-        sd_url = self._search_regex(r'file: \'([^\']+)\', label: \'SD\'', playerdata, 'sd_file')
-        videolist_url = self._search_regex(r'file: \'([^\']+\.smil)\'}', playerdata, 'videolist_url')
+        vidurl = self._search_regex(
+            r'\'vidurl\'\s*:\s*"([^\']+)"', playerdata, 'vidurl').replace('\\/', '/')
+        vidid = self._search_regex(
+            r'\'vidid\'\s*:\s*"([^\']+)"', playerdata, 'vidid')
+        videoserver = self._html_search_regex(
+            r"'videoserver'\s*:\s*'([^']+)'", playerdata, 'videoserver')
+
+        videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid)
         videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML')
 
         formats = []
-        baseurl = sd_url[:sd_url.rfind('/')+1]
+        baseurl = vidurl[:vidurl.rfind('/')+1]
         for video in videolist.findall('.//video'):
             src = video.get('src')
             if not src:
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index e8366f7f9..cf3781cd6 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -89,6 +89,10 @@ class InfoExtractor(object):
                                  format, irrespective of the file format.
                                  -1 for default (order by other properties),
                                  -2 or smaller for less than default.
+                    * source_preference  Order number for this video source
+                                  (quality takes higher priority)
+                                 -1 for default (order by other properties),
+                                 -2 or smaller for less than default.
                     * http_referer  HTTP Referer header value to set.
                     * http_method  HTTP method to use for the download.
                     * http_headers  A dictionary of additional HTTP headers
@@ -613,12 +617,13 @@ class InfoExtractor(object):
                 audio_ext_preference,
                 f.get('filesize') if f.get('filesize') is not None else -1,
                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
+                f.get('source_preference') if f.get('source_preference') is not None else -1,
                 f.get('format_id'),
             )
         formats.sort(key=_formats_key)
 
     def http_scheme(self):
-        """ Either "https:" or "https:", depending on the user's preferences """
+        """ Either "http:" or "https:", depending on the user's preferences """
         return (
             'http:'
             if self._downloader.params.get('prefer_insecure', False)
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index f99888ecc..e3057d900 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -39,6 +39,7 @@ class CrunchyrollIE(SubtitlesInfoExtractor):
             'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg',
             'uploader': 'Yomiuri Telecasting Corporation (YTV)',
             'upload_date': '20131013',
+            'url': 're:(?!.*&)',
         },
         'params': {
             # rtmp
@@ -237,12 +238,14 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
             streamdata_req.data = 'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality='+stream_quality+'&media%5Fid='+stream_id+'&video%5Fformat='+stream_format
             streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
             streamdata_req.add_header('Content-Length', str(len(streamdata_req.data)))
-            streamdata = self._download_webpage(streamdata_req, video_id, note='Downloading media info for '+video_format)
-            video_url = self._search_regex(r'<host>([^<]+)', streamdata, 'video_url')
-            video_play_path = self._search_regex(r'<file>([^<]+)', streamdata, 'video_play_path')
+            streamdata = self._download_xml(
+                streamdata_req, video_id,
+                note='Downloading media info for %s' % video_format)
+            video_url = streamdata.find('.//host').text
+            video_play_path = streamdata.find('.//file').text
             formats.append({
                 'url': video_url,
-                'play_path':   video_play_path,
+                'play_path': video_play_path,
                 'ext': 'flv',
                 'format': video_format,
                 'format_id': video_format,
diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py
index 07165e330..566e20d76 100644
--- a/youtube_dl/extractor/francetv.py
+++ b/youtube_dl/extractor/francetv.py
@@ -58,7 +58,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
                 formats.append({
                     'url': video_url,
                     'format_id': format_id,
-                    'preference': 2,
+                    'preference': -1,
                 })
 
         self._sort_formats(formats)
diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py
index d966e8403..ec6d96ada 100644
--- a/youtube_dl/extractor/funnyordie.py
+++ b/youtube_dl/extractor/funnyordie.py
@@ -37,7 +37,7 @@ class FunnyOrDieIE(InfoExtractor):
         video_id = mobj.group('id')
         webpage = self._download_webpage(url, video_id)
 
-        links = re.findall(r'<source src="([^"]+/v)\d+\.([^"]+)" type=\'video', webpage)
+        links = re.findall(r'<source src="([^"]+/v)[^"]+\.([^"]+)" type=\'video', webpage)
         if not links:
             raise ExtractorError('No media links available for %s' % video_id)
 
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 9057a6beb..9b6498894 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -380,6 +380,17 @@ class GenericIE(InfoExtractor):
                 'uploader': 'education-portal.com',
             },
         },
+        {
+            'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
+            'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
+            'info_dict': {
+                'id': 'uxjb0lwrcz',
+                'ext': 'mp4',
+                'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
+                'duration': 1715.0,
+                'uploader': 'thoughtworks.wistia.com',
+            },
+        },
     ]
 
     def report_following_redirect(self, new_url):
@@ -476,7 +487,8 @@ class GenericIE(InfoExtractor):
                     'Set --default-search "ytsearch" (or run  youtube-dl "ytsearch:%s" ) to search YouTube'
                     ) % (url, url), expected=True)
             else:
-                assert ':' in default_search
+                if ':' not in default_search:
+                    default_search += ':'
                 return self.url_result(default_search + url)
 
         url, smuggled_data = unsmuggle_url(url)
@@ -652,7 +664,7 @@ class GenericIE(InfoExtractor):
 
         # Look for embedded Wistia player
         match = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
+            r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
         if match:
             embed_url = self._proto_relative_url(
                 unescapeHTML(match.group('url')))
@@ -664,6 +676,7 @@ class GenericIE(InfoExtractor):
                 'title': video_title,
                 'id': video_id,
             }
+
         match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
         if match:
             return {
diff --git a/youtube_dl/extractor/glide.py b/youtube_dl/extractor/glide.py
new file mode 100644
index 000000000..9561ed5fb
--- /dev/null
+++ b/youtube_dl/extractor/glide.py
@@ -0,0 +1,40 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class GlideIE(InfoExtractor):
+    IE_DESC = 'Glide mobile video messages (glide.me)'
+    _VALID_URL = r'https?://share\.glide\.me/(?P<id>[A-Za-z0-9\-=_+]+)'
+    _TEST = {
+        'url': 'http://share.glide.me/UZF8zlmuQbe4mr+7dCiQ0w==',
+        'md5': '4466372687352851af2d131cfaa8a4c7',
+        'info_dict': {
+            'id': 'UZF8zlmuQbe4mr+7dCiQ0w==',
+            'ext': 'mp4',
+            'title': 'Damon Timm\'s Glide message',
+            'thumbnail': 're:^https?://.*?\.cloudfront\.net/.*\.jpg$',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        title = self._html_search_regex(
+            r'<title>(.*?)</title>', webpage, 'title')
+        video_url = self.http_scheme() + self._search_regex(
+            r'<source src="(.*?)" type="video/mp4">', webpage, 'video URL')
+        thumbnail_url = self._search_regex(
+            r'<img id="video-thumbnail" src="(.*?)"',
+            webpage, 'thumbnail url', fatal=False)
+        thumbnail = (
+            thumbnail_url if thumbnail_url is None
+            else self.http_scheme() + thumbnail_url)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/hark.py b/youtube_dl/extractor/hark.py
index 5bdd08afa..b6cc15b6f 100644
--- a/youtube_dl/extractor/hark.py
+++ b/youtube_dl/extractor/hark.py
@@ -1,37 +1,33 @@
 # -*- coding: utf-8 -*-
-
-import re
-import json
+from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import determine_ext
+
 
 class HarkIE(InfoExtractor):
-    _VALID_URL = r'https?://www\.hark\.com/clips/(.+?)-.+'
+    _VALID_URL = r'https?://www\.hark\.com/clips/(?P<id>.+?)-.+'
     _TEST = {
-        u'url': u'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013',
-        u'file': u'mmbzyhkgny.mp3',
-        u'md5': u'6783a58491b47b92c7c1af5a77d4cbee',
-        u'info_dict': {
-            u'title': u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' on May 23, 2013",
-            u'description': u'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.',
-            u'duration': 11,
+        'url': 'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013',
+        'md5': '6783a58491b47b92c7c1af5a77d4cbee',
+        'info_dict': {
+            'id': 'mmbzyhkgny',
+            'ext': 'mp3',
+            'title': 'Obama: \'Beyond The Afghan Theater, We Only Target Al Qaeda\' on May 23, 2013',
+            'description': 'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.',
+            'duration': 11,
         }
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group(1)
-        json_url = "http://www.hark.com/clips/%s.json" %(video_id)
-        info_json = self._download_webpage(json_url, video_id)
-        info = json.loads(info_json)
-        final_url = info['url']
+        video_id = self._match_id(url)
+        data = self._download_json(
+            'http://www.hark.com/clips/%s.json' % video_id, video_id)
 
-        return {'id': video_id,
-                'url' : final_url,
-                'title': info['name'],
-                'ext': determine_ext(final_url),
-                'description': info['description'],
-                'thumbnail': info['image_original'],
-                'duration': info['duration'],
-                }
+        return {
+            'id': video_id,
+            'url': data['url'],
+            'title': data['name'],
+            'description': data.get('description'),
+            'thumbnail': data.get('image_original'),
+            'duration': data.get('duration'),
+        }
diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py
index 979f3d692..6691521e5 100644
--- a/youtube_dl/extractor/mitele.py
+++ b/youtube_dl/extractor/mitele.py
@@ -6,6 +6,7 @@ import json
 from .common import InfoExtractor
 from ..utils import (
     compat_urllib_parse,
+    compat_urlparse,
     get_element_by_attribute,
     parse_duration,
     strip_jsonp,
@@ -39,13 +40,21 @@ class MiTeleIE(InfoExtractor):
         ).replace('\'', '"')
         embed_data = json.loads(embed_data_json)
 
-        info_url = embed_data['flashvars']['host']
+        domain = embed_data['mediaUrl']
+        if not domain.startswith('http'):
+            # only happens in telecinco.es videos
+            domain = 'http://' + domain
+        info_url = compat_urlparse.urljoin(
+            domain,
+            compat_urllib_parse.unquote(embed_data['flashvars']['host'])
+        )
         info_el = self._download_xml(info_url, episode).find('./video/info')
 
         video_link = info_el.find('videoUrl/link').text
         token_query = compat_urllib_parse.urlencode({'id': video_link})
         token_info = self._download_json(
-            'http://token.mitele.es/?' + token_query, episode,
+            embed_data['flashvars']['ov_tk'] + '?' + token_query,
+            episode,
             transform_source=strip_jsonp
         )
diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py
index 6229b2173..3621ff99e 100644
--- a/youtube_dl/extractor/motherless.py
+++ b/youtube_dl/extractor/motherless.py
@@ -5,20 +5,20 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
-    int_or_none,
+    str_to_int,
     unified_strdate,
 )
 
 
 class MotherlessIE(InfoExtractor):
-    _VALID_URL = r'http://(?:www\.)?motherless\.com/(?P<id>[A-Z0-9]+)'
+    _VALID_URL = r'http://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)'
     _TESTS = [
         {
             'url': 'http://motherless.com/AC3FFE1',
-            'md5': '5527fef81d2e529215dad3c2d744a7d9',
+            'md5': '310f62e325a9fafe64f68c0bccb6e75f',
             'info_dict': {
                 'id': 'AC3FFE1',
-                'ext': 'flv',
+                'ext': 'mp4',
                 'title': 'Fucked in the ass while playing PS3',
                 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'],
                 'upload_date': '20100913',
@@ -40,33 +40,51 @@ class MotherlessIE(InfoExtractor):
                 'thumbnail': 're:http://.*\.jpg',
                 'age_limit': 18,
             }
+        },
+        {
+            'url': 'http://motherless.com/g/cosplay/633979F',
+            'md5': '0b2a43f447a49c3e649c93ad1fafa4a0',
+            'info_dict': {
+                'id': '633979F',
+                'ext': 'mp4',
+                'title': 'Turtlette',
+                'categories': ['superheroine heroine  superher'],
+                'upload_date': '20140827',
+                'uploader_id': 'shade0230',
+                'thumbnail': 're:http://.*\.jpg',
+                'age_limit': 18,
+            }
         }
     ]
 
-    def _real_extract(self,url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        title = self._html_search_regex(r'id="view-upload-title">\s+([^<]+)<', webpage, 'title')
-
-        video_url = self._html_search_regex(r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video_url')
+        title = self._html_search_regex(
+            r'id="view-upload-title">\s+([^<]+)<', webpage, 'title')
+        video_url = self._html_search_regex(
+            r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video URL')
         age_limit = self._rta_search(webpage)
-
-        view_count = self._html_search_regex(r'<strong>Views</strong>\s+([^<]+)<', webpage, 'view_count')
+        view_count = str_to_int(self._html_search_regex(
+            r'<strong>Views</strong>\s+([^<]+)<',
+            webpage, 'view count', fatal=False))
+        like_count = str_to_int(self._html_search_regex(
+            r'<strong>Favorited</strong>\s+([^<]+)<',
+            webpage, 'like count', fatal=False))
 
-        upload_date = self._html_search_regex(r'<strong>Uploaded</strong>\s+([^<]+)<', webpage, 'upload_date')
+        upload_date = self._html_search_regex(
+            r'<strong>Uploaded</strong>\s+([^<]+)<', webpage, 'upload date')
         if 'Ago' in upload_date:
             days = int(re.search(r'([0-9]+)', upload_date).group(1))
             upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d')
         else:
             upload_date = unified_strdate(upload_date)
-        like_count = self._html_search_regex(r'<strong>Favorited</strong>\s+([^<]+)<', webpage, 'like_count')
-
+
         comment_count = webpage.count('class="media-comment-contents"')
-        uploader_id = self._html_search_regex(r'"thumb-member-username">\s+<a href="/m/([^"]+)"', webpage, 'uploader_id')
+        uploader_id = self._html_search_regex(
+            r'"thumb-member-username">\s+<a href="/m/([^"]+)"',
+            webpage, 'uploader_id')
 
         categories = self._html_search_meta('keywords', webpage)
         if categories:
@@ -79,8 +97,8 @@ class MotherlessIE(InfoExtractor):
             'uploader_id': uploader_id,
             'thumbnail': self._og_search_thumbnail(webpage),
             'categories': categories,
-            'view_count': int_or_none(view_count.replace(',', '')),
-            'like_count': int_or_none(like_count.replace(',', '')),
+            'view_count': view_count,
+            'like_count': like_count,
             'comment_count': comment_count,
             'age_limit': age_limit,
             'url': video_url,
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index 8f140d626..6118ed5c2 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -80,8 +80,14 @@ class PBSIE(InfoExtractor):
                 'thumbnail': 're:^https?://.*\.jpg$',
                 'upload_date': '20140122',
             }
+        },
+        {
+            'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/',
+            'info_dict': {
+                'id': 'united-states-of-secrets',
+            },
+            'playlist_count': 2,
         }
-
     ]
 
     def _extract_webpage(self, url):
@@ -96,6 +102,12 @@ class PBSIE(InfoExtractor):
                 r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"',
                 webpage, 'upload date', default=None))
 
+            # tabbed frontline videos
+            tabbed_videos = re.findall(
+                r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"', webpage)
+            if tabbed_videos:
+                return tabbed_videos, presumptive_id, upload_date
+
             MEDIA_ID_REGEXES = [
                 r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'",  # frontline video embed
                 r'class="coveplayerid">([^<]+)<',                       # coveplayer
@@ -130,6 +142,12 @@ class PBSIE(InfoExtractor):
     def _real_extract(self, url):
         video_id, display_id, upload_date = self._extract_webpage(url)
 
+        if isinstance(video_id, list):
+            entries = [self.url_result(
+                'http://video.pbs.org/video/%s' % vid_id, 'PBS', vid_id)
+                for vid_id in video_id]
+            return self.playlist_result(entries, display_id)
+
         info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id
         info = self._download_json(info_url, display_id)
 
diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py
new file mode 100644
index 000000000..db9788c18
--- /dev/null
+++ b/youtube_dl/extractor/telecinco.py
@@ -0,0 +1,19 @@
+#coding: utf-8
+from __future__ import unicode_literals
+
+from .mitele import MiTeleIE
+
+
+class TelecincoIE(MiTeleIE):
+    IE_NAME = 'telecinco.es'
+    _VALID_URL = r'https?://www\.telecinco\.es/[^/]+/[^/]+/[^/]+/(?P<episode>.*?)\.html'
+
+    _TEST = {
+        'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html',
+        'info_dict': {
+            'id': 'MDSVID20141015_0058',
+            'ext': 'mp4',
+            'title': 'Con Martín Berasategui, hacer un bacalao al ...',
+            'duration': 662,
+        },
+    }
diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py
index 9328ef4a2..0faa729c6 100644
--- a/youtube_dl/extractor/viddler.py
+++ b/youtube_dl/extractor/viddler.py
@@ -1,55 +1,85 @@
-import json
-import re
+from __future__ import unicode_literals
 
 from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    int_or_none,
+)
 
 
 class ViddlerIE(InfoExtractor):
-    _VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler\.com)/(?:v|embed|player)/(?P<id>[a-z0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?viddler\.com/(?:v|embed|player)/(?P<id>[a-z0-9]+)'
     _TEST = {
-        u"url": u"http://www.viddler.com/v/43903784",
-        u'file': u'43903784.mp4',
-        u'md5': u'fbbaedf7813e514eb7ca30410f439ac9',
-        u'info_dict': {
-            u"title": u"Video Made Easy",
-            u"uploader": u"viddler",
-            u"duration": 100.89,
+        "url": "http://www.viddler.com/v/43903784",
+        'md5': 'ae43ad7cb59431ce043f0ff7fa13cbf4',
+        'info_dict': {
+            'id': '43903784',
+            'ext': 'mp4',
+            "title": "Video Made Easy",
+            'description': 'You don\'t need to be a professional to make high-quality video content. Viddler provides some quick and easy tips on how to produce great video content with limited resources. ',
+            "uploader": "viddler",
+            'timestamp': 1335371429,
+            'upload_date': '20120425',
+            "duration": 100.89,
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'view_count': int,
+            'categories': ['video content', 'high quality video', 'video made easy', 'how to produce video with limited resources', 'viddler'],
         }
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-
-        embed_url = mobj.group('domain') + u'/embed/' + video_id
-        webpage = self._download_webpage(embed_url, video_id)
-
-        video_sources_code = self._search_regex(
-            r"(?ms)sources\s*:\s*(\{.*?\})", webpage, u'video URLs')
-        video_sources = json.loads(video_sources_code.replace("'", '"'))
-
-        formats = [{
-            'url': video_url,
-            'format': format_id,
-        } for video_url, format_id in video_sources.items()]
-
-        title = self._html_search_regex(
-            r"title\s*:\s*'([^']*)'", webpage, u'title')
-        uploader = self._html_search_regex(
-            r"authorName\s*:\s*'([^']*)'", webpage, u'uploader', fatal=False)
-        duration_s = self._html_search_regex(
-            r"duration\s*:\s*([0-9.]*)", webpage, u'duration', fatal=False)
-        duration = float(duration_s) if duration_s else None
-        thumbnail = self._html_search_regex(
-            r"thumbnail\s*:\s*'([^']*)'",
-            webpage, u'thumbnail', fatal=False)
+        video_id = self._match_id(url)
+
+        json_url = (
+            'http://api.viddler.com/api/v2/viddler.videos.getPlaybackDetails.json?video_id=%s&key=v0vhrt7bg2xq1vyxhkct' %
+            video_id)
+        data = self._download_json(json_url, video_id)['video']
+
+        formats = []
+        for filed in data['files']:
+            if filed.get('status', 'ready') != 'ready':
+                continue
+            f = {
+                'format_id': filed['profile_id'],
+                'format_note': filed['profile_name'],
+                'url': self._proto_relative_url(filed['url']),
+                'width': int_or_none(filed.get('width')),
+                'height': int_or_none(filed.get('height')),
+                'filesize': int_or_none(filed.get('size')),
+                'ext': filed.get('ext'),
+                'source_preference': -1,
+            }
+            formats.append(f)
+
+            if filed.get('cdn_url'):
+                f = f.copy()
+                f['url'] = self._proto_relative_url(filed['cdn_url'])
+                f['format_id'] = filed['profile_id'] + '-cdn'
+                f['source_preference'] = 1
+                formats.append(f)
+
+            if filed.get('html5_video_source'):
+                f = f.copy()
+                f['url'] = self._proto_relative_url(
+                    filed['html5_video_source'])
+                f['format_id'] = filed['profile_id'] + '-html5'
+                f['source_preference'] = 0
+                formats.append(f)
+        self._sort_formats(formats)
+
+        categories = [
+            t.get('text') for t in data.get('tags', []) if 'text' in t]
 
         return {
             '_type': 'video',
             'id': video_id,
-            'title': title,
-            'thumbnail': thumbnail,
-            'uploader': uploader,
-            'duration': duration,
+            'title': data['title'],
             'formats': formats,
+            'description': data.get('description'),
+            'timestamp': int_or_none(data.get('upload_time')),
+            'thumbnail': self._proto_relative_url(data.get('thumbnail_url')),
+            'uploader': data.get('author'),
+            'duration': float_or_none(data.get('length')),
+            'view_count': int_or_none(data.get('view_count')),
+            'categories': categories,
         }
diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py
new file mode 100644
index 000000000..669979e13
--- /dev/null
+++ b/youtube_dl/extractor/vidzi.py
@@ -0,0 +1,33 @@
+#coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class VidziIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?P<id>\w+)'
+    _TEST = {
+        'url': 'http://vidzi.tv/cghql9yq6emu.html',
+        'md5': '4f16c71ca0c8c8635ab6932b5f3f1660',
+        'info_dict': {
+            'id': 'cghql9yq6emu',
+            'ext': 'mp4',
+            'title': 'youtube-dl test video  1\\2\'3/4<5\\6ä7↭',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        video_url = self._html_search_regex(
+            r'{\s*file\s*:\s*"([^"]+)"\s*}', webpage, 'video url')
+        title = self._html_search_regex(
+            r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+        }
\ No newline at end of file
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index 649361bde..2ccc63fc5 100644
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -159,6 +159,11 @@ def parseOpts(overrideArguments=None):
         '--ignore-config',
         action='store_true',
         help='Do not read configuration files. When given in the global configuration file /etc/youtube-dl.conf: do not read the user configuration in ~/.config/youtube-dl.conf (%APPDATA%/youtube-dl/config.txt on Windows)')
+    general.add_option(
+        '--flat-playlist',
+        action='store_const', dest='extract_flat', const='in_playlist',
+        default=False,
+        help='Do not extract the videos of a playlist, only list them.')
 
     selection = optparse.OptionGroup(parser, 'Video Selection')
     selection.add_option(
@@ -413,6 +418,10 @@ def parseOpts(overrideArguments=None):
         action='store_true', dest='dumpjson', default=False,
         help='simulate, quiet but print JSON information. See --output for a description of available keys.')
     verbosity.add_option(
+        '-J', '--dump-single-json',
+        action='store_true', dest='dump_single_json', default=False,
+        help='simulate, quiet but print JSON information for each command-line argument. If the URL refers to a playlist, dump the whole playlist information in a single line.')
+    verbosity.add_option(
         '--newline',
         action='store_true', dest='progress_with_newline', default=False,
         help='output progress bar as new lines')
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index e7f6adef1..d822ae330 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@
 
-__version__ = '2014.10.18'
+__version__ = '2014.10.25'
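
Usage note (not part of the commit): the README and options.py hunks above add --flat-playlist and -J/--dump-single-json, which map to the 'extract_flat' and 'dump_single_json' parameters wired through YoutubeDL.py. A minimal, hypothetical sketch of driving the same code path from the Python API; the playlist URL is a placeholder, not a real test case:

    from youtube_dl import YoutubeDL

    ydl_opts = {
        'extract_flat': 'in_playlist',  # the value --flat-playlist sets via options.py
        'dump_single_json': True,       # the flag -J/--dump-single-json sets
        'simulate': True,               # do not download any media
        'quiet': True,
    }

    with YoutubeDL(ydl_opts) as ydl:
        # download() prints the whole (flat) playlist info_dict as a single JSON
        # line, mirroring the new dump_single_json branch added to download().
        ydl.download(['https://www.youtube.com/playlist?list=PLACEHOLDER'])

The command-line equivalent would be: youtube-dl --flat-playlist -J "<playlist URL>".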
