diff options
40 files changed, 563 insertions, 219 deletions
| diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index ffebb4ae5..58cf9c313 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -128,6 +128,18 @@ class TestFormatSelection(unittest.TestCase):          downloaded = ydl.downloaded_info_dicts[0]          self.assertEqual(downloaded['format_id'], u'35') +    def test_add_extra_info(self): +        test_dict = { +            'extractor': 'Foo', +        } +        extra_info = { +            'extractor': 'Bar', +            'playlist': 'funny videos', +        } +        YDL.add_extra_info(test_dict, extra_info) +        self.assertEqual(test_dict['extractor'], 'Foo') +        self.assertEqual(test_dict['playlist'], 'funny videos') +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_download.py b/test/test_download.py index dfb04d010..16f200809 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -31,6 +31,7 @@ from youtube_dl.utils import (      ExtractorError,      UnavailableVideoError,  ) +from youtube_dl.extractor import get_info_extractor  RETRIES = 3 @@ -63,9 +64,10 @@ def generator(test_case):      def test_template(self):          ie = youtube_dl.extractor.get_info_extractor(test_case['name']) +        other_ies = [get_info_extractor(ie_key) for ie_key in test_case.get('add_ie', [])]          def print_skipping(reason):              print('Skipping %s: %s' % (test_case['name'], reason)) -        if not ie._WORKING: +        if not ie.working():              print_skipping('IE marked as not _WORKING')              return          if 'playlist' not in test_case: @@ -77,6 +79,10 @@ def generator(test_case):          if 'skip' in test_case:              print_skipping(test_case['skip'])              return +        for other_ie in other_ies: +            if not other_ie.working(): +                print_skipping(u'test depends on %sIE, marked as not WORKING' % other_ie.ie_key()) +                return          params = get_params(test_case.get('params', {})) @@ -148,6 +154,9 @@ def generator(test_case):                  # Check for the presence of mandatory fields                  for key in ('id', 'url', 'title', 'ext'):                      self.assertTrue(key in info_dict.keys() and info_dict[key]) +                # Check for mandatory fields that are automatically set by YoutubeDL +                for key in ['webpage_url', 'extractor', 'extractor_key']: +                    self.assertTrue(info_dict.get(key), u'Missing field: %s' % key)          finally:              try_rm_tcs_files() diff --git a/test/test_playlists.py b/test/test_playlists.py index de1e8d88e..706b6bdca 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -17,6 +17,7 @@ from youtube_dl.extractor import (      DailymotionUserIE,      VimeoChannelIE,      UstreamChannelIE, +    SoundcloudSetIE,      SoundcloudUserIE,      LivestreamIE,      NHLVideocenterIE, @@ -61,6 +62,14 @@ class TestPlaylists(unittest.TestCase):          self.assertEqual(result['id'], u'5124905')          self.assertTrue(len(result['entries']) >= 11) +    def test_soundcloud_set(self): +        dl = FakeYDL() +        ie = SoundcloudSetIE(dl) +        result = ie.extract('https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep') +        self.assertIsPlaylist(result) +        self.assertEqual(result['title'], u'The Royal Concept EP') +        self.assertTrue(len(result['entries']) >= 6) +      def test_soundcloud_user(self):          dl = FakeYDL()          ie = SoundcloudUserIE(dl) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 8ecabab1a..35fa3ca61 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -4,12 +4,19 @@ import re  import subprocess  import sys  import time -import traceback  if os.name == 'nt':      import ctypes -from .utils import * +from .utils import ( +    compat_urllib_error, +    compat_urllib_request, +    ContentTooShortError, +    determine_ext, +    encodeFilename, +    sanitize_open, +    timeconvert, +)  class FileDownloader(object): @@ -194,7 +201,7 @@ class FileDownloader(object):              if old_filename == new_filename:                  return              os.rename(encodeFilename(old_filename), encodeFilename(new_filename)) -        except (IOError, OSError) as err: +        except (IOError, OSError):              self.report_error(u'unable to rename file')      def try_utime(self, filename, last_modified_hdr): @@ -227,8 +234,14 @@ class FileDownloader(object):          if self.params.get('noprogress', False):              return          clear_line = (u'\x1b[K' if sys.stderr.isatty() and os.name != 'nt' else u'') -        eta_str = self.format_eta(eta) -        percent_str = self.format_percent(percent) +        if eta is not None: +            eta_str = self.format_eta(eta) +        else: +            eta_str = 'Unknown ETA' +        if percent is not None: +            percent_str = self.format_percent(percent) +        else: +            percent_str = 'Unknown %'          speed_str = self.format_speed(speed)          if self.params.get('progress_with_newline', False):              self.to_screen(u'[download] %s of %s at %s ETA %s' % @@ -251,7 +264,7 @@ class FileDownloader(object):          """Report file has already been fully downloaded."""          try:              self.to_screen(u'[download] %s has already been downloaded' % file_name) -        except (UnicodeEncodeError) as err: +        except UnicodeEncodeError:              self.to_screen(u'[download] The file has already been downloaded')      def report_unable_to_resume(self): @@ -366,7 +379,8 @@ class FileDownloader(object):          self.report_destination(filename)          tmpfilename = self.temp_name(filename) -        args = ['ffmpeg', '-y', '-i', url, '-f', 'mp4', tmpfilename] +        args = ['ffmpeg', '-y', '-i', url, '-f', 'mp4', '-c', 'copy', +            '-absf', 'aac_adtstoasc', tmpfilename]          # Check for ffmpeg first          try:              subprocess.call(['ffmpeg', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT) @@ -550,12 +564,11 @@ class FileDownloader(object):              # Progress message              speed = self.calc_speed(start, time.time(), byte_counter - resume_len)              if data_len is None: -                self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA') -                eta = None +                eta = percent = None              else:                  percent = self.calc_percent(byte_counter, data_len)                  eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) -                self.report_progress(percent, data_len_str, speed, eta) +            self.report_progress(percent, data_len_str, speed, eta)              self._hook_progress({                  'downloaded_bytes': byte_counter, diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 7f73ea360..5253c39e1 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -318,6 +318,12 @@ class YoutubeDL(object):                      % info_dict)          return None +    @staticmethod +    def add_extra_info(info_dict, extra_info): +        '''Set the keys from extra_info in info dict if they are missing''' +        for key, value in extra_info.items(): +            info_dict.setdefault(key, value) +      def extract_info(self, url, download=True, ie_key=None, extra_info={}):          '''          Returns a list with a dictionary for each video we find. @@ -344,17 +350,17 @@ class YoutubeDL(object):                      break                  if isinstance(ie_result, list):                      # Backwards compatibility: old IE result format -                    for result in ie_result: -                        result.update(extra_info)                      ie_result = {                          '_type': 'compat_list',                          'entries': ie_result,                      } -                else: -                    ie_result.update(extra_info) -                if 'extractor' not in ie_result: -                    ie_result['extractor'] = ie.IE_NAME -                return self.process_ie_result(ie_result, download=download) +                self.add_extra_info(ie_result, +                    { +                        'extractor': ie.IE_NAME, +                        'webpage_url': url, +                        'extractor_key': ie.ie_key(), +                    }) +                return self.process_ie_result(ie_result, download, extra_info)              except ExtractorError as de: # An error we somewhat expected                  self.report_error(compat_str(de), de.format_traceback())                  break @@ -378,7 +384,7 @@ class YoutubeDL(object):          result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system          if result_type == 'video': -            ie_result.update(extra_info) +            self.add_extra_info(ie_result, extra_info)              return self.process_video_result(ie_result)          elif result_type == 'url':              # We have to add extra_info to the results because it may be @@ -388,6 +394,7 @@ class YoutubeDL(object):                                       ie_key=ie_result.get('ie_key'),                                       extra_info=extra_info)          elif result_type == 'playlist': +            self.add_extra_info(ie_result, extra_info)              # We process each entry in the playlist              playlist = ie_result.get('title', None) or ie_result.get('id', None)              self.to_screen(u'[download] Downloading playlist: %s' % playlist) @@ -413,12 +420,10 @@ class YoutubeDL(object):                  extra = {                      'playlist': playlist,                      'playlist_index': i + playliststart, +                    'extractor': ie_result['extractor'], +                    'webpage_url': ie_result['webpage_url'], +                    'extractor_key': ie_result['extractor_key'],                  } -                if not 'extractor' in entry: -                    # We set the extractor, if it's an url it will be set then to -                    # the new extractor, but if it's already a video we must make -                    # sure it's present: see issue #877 -                    entry['extractor'] = ie_result['extractor']                  entry_result = self.process_ie_result(entry,                                                        download=download,                                                        extra_info=extra) @@ -427,10 +432,15 @@ class YoutubeDL(object):              return ie_result          elif result_type == 'compat_list':              def _fixup(r): -                r.setdefault('extractor', ie_result['extractor']) +                self.add_extra_info(r, +                    { +                        'extractor': ie_result['extractor'], +                        'webpage_url': ie_result['webpage_url'], +                        'extractor_key': ie_result['extractor_key'], +                    })                  return r              ie_result['entries'] = [ -                self.process_ie_result(_fixup(r), download=download) +                self.process_ie_result(_fixup(r), download, extra_info)                  for r in ie_result['entries']              ]              return ie_result @@ -772,7 +782,7 @@ class YoutubeDL(object):      def list_formats(self, info_dict):          def line(format): -            return (u'%-15s%-10s%-12s%s' % ( +            return (u'%-20s%-10s%-12s%s' % (                  format['format_id'],                  format['ext'],                  self.format_resolution(format), diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 48ffcbf8e..ab7879c5d 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -349,7 +349,7 @@ def parseOpts(overrideArguments=None):                    'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .'))      filesystem.add_option('--autonumber-size',              dest='autonumber_size', metavar='NUMBER', -            help='Specifies the number of digits in %(autonumber)s when it is present in output filename template or --autonumber option is given') +            help='Specifies the number of digits in %(autonumber)s when it is present in output filename template or --auto-number option is given')      filesystem.add_option('--restrict-filenames',              action='store_true', dest='restrictfilenames',              help='Restrict filenames to only ASCII characters, and avoid "&" and spaces in filenames', default=False) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a69c08f51..f9caca4ef 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -38,8 +38,10 @@ from .defense import DefenseGouvFrIE  from .ebaumsworld import EbaumsWorldIE  from .ehow import EHowIE  from .eighttracks import EightTracksIE +from .eitb import EitbIE  from .escapist import EscapistIE  from .exfm import ExfmIE +from .extremetube import ExtremeTubeIE  from .facebook import FacebookIE  from .faz import FazIE  from .fktv import ( @@ -82,6 +84,7 @@ from .metacafe import MetacafeIE  from .metacritic import MetacriticIE  from .mit import TechTVMITIE, MITIE  from .mixcloud import MixcloudIE +from .mofosex import MofosexIE  from .mtv import MTVIE  from .muzu import MuzuTVIE  from .myspace import MySpaceIE @@ -113,6 +116,7 @@ from .slideshare import SlideshareIE  from .sohu import SohuIE  from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE  from .southparkstudios import SouthParkStudiosIE +from .space import SpaceIE  from .spankwire import SpankwireIE  from .spiegel import SpiegelIE  from .stanfordoc import StanfordOpenClassroomIE @@ -152,6 +156,7 @@ from .worldstarhiphop import WorldStarHipHopIE  from .xhamster import XHamsterIE  from .xnxx import XNXXIE  from .xvideos import XVideosIE +from .xtube import XTubeIE  from .yahoo import YahooIE, YahooSearchIE  from .youjizz import YouJizzIE  from .youku import YoukuIE @@ -160,6 +165,7 @@ from .youtube import (      YoutubeIE,      YoutubePlaylistIE,      YoutubeSearchIE, +    YoutubeSearchDateIE,      YoutubeUserIE,      YoutubeChannelIE,      YoutubeShowIE, diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index e10c74c11..b35a679e3 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -10,6 +10,7 @@ from ..utils import (      unified_strdate,      determine_ext,      get_element_by_id, +    compat_str,  )  # There are different sources of video in arte.tv, the extraction process  @@ -181,20 +182,30 @@ class ArteTVPlus7IE(InfoExtractor):                  formats = all_formats              else:                  raise ExtractorError(u'The formats list is empty') -        # We order the formats by quality +          if re.match(r'[A-Z]Q', formats[0]['quality']) is not None: -            sort_key = lambda f: ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality']) +            def sort_key(f): +                return ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality'])          else: -            sort_key = lambda f: int(f.get('height',-1)) +            def sort_key(f): +                return ( +                    # Sort first by quality +                    int(f.get('height',-1)), +                    int(f.get('bitrate',-1)), +                    # The original version with subtitles has lower relevance +                    re.match(r'VO-ST(F|A)', f.get('versionCode', '')) is None, +                    # The version with sourds/mal subtitles has also lower relevance +                    re.match(r'VO?(F|A)-STM\1', f.get('versionCode', '')) is None, +                )          formats = sorted(formats, key=sort_key) -        # Prefer videos without subtitles in the same language -        formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f.get('versionCode', '')) is None) -        # Pick the best quality          def _format(format_info): -            quality = format_info['quality'] -            m_quality = re.match(r'\w*? - (\d*)p', quality) -            if m_quality is not None: -                quality = m_quality.group(1) +            quality = '' +            height = format_info.get('height') +            if height is not None: +                quality = compat_str(height) +            bitrate = format_info.get('bitrate') +            if bitrate is not None: +                quality += '-%d' % bitrate              if format_info.get('versionCode') is not None:                  format_id = u'%s-%s' % (quality, format_info['versionCode'])              else: @@ -203,7 +214,7 @@ class ArteTVPlus7IE(InfoExtractor):                  'format_id': format_id,                  'format_note': format_info.get('versionLibelle'),                  'width': format_info.get('width'), -                'height': format_info.get('height'), +                'height': height,              }              if format_info['mediaType'] == u'rtmp':                  info['url'] = format_info['streamer'] diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 1392f382a..d8c35465a 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -9,10 +9,13 @@ from ..utils import (      compat_urllib_parse,      find_xpath_attr,      compat_urlparse, +    compat_str, +    compat_urllib_request,      ExtractorError,  ) +  class BrightcoveIE(InfoExtractor):      _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)'      _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' @@ -23,7 +26,7 @@ class BrightcoveIE(InfoExtractor):              # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/              u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001',              u'file': u'2371591881001.mp4', -            u'md5': u'9e80619e0a94663f0bdc849b4566af19', +            u'md5': u'8eccab865181d29ec2958f32a6a754f5',              u'note': u'Test Brightcove downloads and detection in GenericIE',              u'info_dict': {                  u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', @@ -41,6 +44,17 @@ class BrightcoveIE(InfoExtractor):                  u'uploader': u'Oracle',              },          }, +        { +            # From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/ +            u'url': u'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001', +            u'info_dict': { +                u'id': u'2750934548001', +                u'ext': u'mp4', +                u'title': u'This Bracelet Acts as a Personal Thermostat', +                u'description': u'md5:547b78c64f4112766ccf4e151c20b6a0', +                u'uploader': u'Mashable', +            }, +        },      ]      @classmethod @@ -68,24 +82,48 @@ class BrightcoveIE(InfoExtractor):          videoPlayer = find_xpath_attr(object_doc, './param', 'name', '@videoPlayer')          if videoPlayer is not None:              params['@videoPlayer'] = videoPlayer.attrib['value'] +        linkBase = find_xpath_attr(object_doc, './param', 'name', 'linkBaseURL') +        if linkBase is not None: +            params['linkBaseURL'] = linkBase.attrib['value']          data = compat_urllib_parse.urlencode(params)          return cls._FEDERATED_URL_TEMPLATE % data +    @classmethod +    def _extract_brightcove_url(cls, webpage): +        """Try to extract the brightcove url from the wepbage, returns None +        if it can't be found +        """ +        m_brightcove = re.search( +            r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>', +            webpage, re.DOTALL) +        if m_brightcove is not None: +            return cls._build_brighcove_url(m_brightcove.group()) +        else: +            return None +      def _real_extract(self, url): +        # Change the 'videoId' and others field to '@videoPlayer' +        url = re.sub(r'(?<=[?&])(videoI(d|D)|bctid)', '%40videoPlayer', url) +        # Change bckey (used by bcove.me urls) to playerKey +        url = re.sub(r'(?<=[?&])bckey', 'playerKey', url)          mobj = re.match(self._VALID_URL, url)          query_str = mobj.group('query')          query = compat_urlparse.parse_qs(query_str)          videoPlayer = query.get('@videoPlayer')          if videoPlayer: -            return self._get_video_info(videoPlayer[0], query_str) +            return self._get_video_info(videoPlayer[0], query_str, query)          else:              player_key = query['playerKey']              return self._get_playlist_info(player_key[0]) -    def _get_video_info(self, video_id, query): -        request_url = self._FEDERATED_URL_TEMPLATE % query -        webpage = self._download_webpage(request_url, video_id) +    def _get_video_info(self, video_id, query_str, query): +        request_url = self._FEDERATED_URL_TEMPLATE % query_str +        req = compat_urllib_request.Request(request_url) +        linkBase = query.get('linkBaseURL') +        if linkBase is not None: +            req.add_header('Referer', linkBase[0]) +        webpage = self._download_webpage(req, video_id)          self.report_extraction(video_id)          info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json') @@ -109,7 +147,7 @@ class BrightcoveIE(InfoExtractor):      def _extract_video_info(self, video_info):          info = { -            'id': video_info['id'], +            'id': compat_str(video_info['id']),              'title': video_info['displayName'],              'description': video_info.get('shortDescription'),              'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), @@ -119,15 +157,14 @@ class BrightcoveIE(InfoExtractor):          renditions = video_info.get('renditions')          if renditions:              renditions = sorted(renditions, key=lambda r: r['size']) -            best_format = renditions[-1] -            info.update({ -                'url': best_format['defaultURL'], -                'ext': 'mp4', -            }) +            info['formats'] = [{ +                'url': rend['defaultURL'], +                'height': rend.get('frameHeight'), +                'width': rend.get('frameWidth'), +            } for rend in renditions]          elif video_info.get('FLVFullLengthURL') is not None:              info.update({                  'url': video_info['FLVFullLengthURL'], -                'ext': 'flv',              })          else:              raise ExtractorError(u'Unable to extract video url for %s' % info['id']) diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index e7f4fa9fd..3d8d7f9d2 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -6,7 +6,7 @@ from .common import InfoExtractor  class Canalc2IE(InfoExtractor):      IE_NAME = 'canalc2.tv' -    _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?idVideo=(\d+)&voir=oui' +    _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?.*?idVideo=(?P<id>\d+)'      _TEST = {          u'url': u'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui', @@ -18,7 +18,9 @@ class Canalc2IE(InfoExtractor):      }      def _real_extract(self, url): -        video_id = re.match(self._VALID_URL, url).group(1) +        video_id = re.match(self._VALID_URL, url).group('id') +        # We need to set the voir field for getting the file name +        url = 'http://www.canalc2.tv/video.asp?idVideo=%s&voir=oui' % video_id          webpage = self._download_webpage(url, video_id)          file_name = self._search_regex(              r"so\.addVariable\('file','(.*?)'\);", diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index 2fe1033f0..8f9396d6b 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -41,7 +41,7 @@ class CinemassacreIE(InfoExtractor):          webpage_url = u'http://' + mobj.group('url')          webpage = self._download_webpage(webpage_url, None) # Don't know video id yet          video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d') -        mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/(?:embed|player)\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage) +        mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)          if not mobj:              raise ExtractorError(u'Can\'t extract embed url and video id')          playerdata_url = mobj.group(u'embed_url') diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index a79f881cd..34adf6dda 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -6,7 +6,7 @@ from ..utils import determine_ext  class CNNIE(InfoExtractor): -    _VALID_URL = r'''(?x)https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/ +    _VALID_URL = r'''(?x)https?://((edition|www)\.)?cnn\.com/video/(data/.+?|\?)/          (?P<path>.+?/(?P<title>[^/]+?)(?:\.cnn|(?=&)))'''      _TESTS = [{ diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index cef4dce85..fb2d50a09 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -71,6 +71,9 @@ class InfoExtractor(object):                                  ("3D" or "DASH video")                      * width     Width of the video, if known                      * height    Height of the video, if known +    webpage_url:    The url to the video webpage, if given to youtube-dl it +                    should allow to get the same result again. (It will be set +                    by YoutubeDL if it's missing)      Unless mentioned otherwise, the fields should be Unicode strings. @@ -319,7 +322,9 @@ class InfoExtractor(object):          if name is None:              name = 'OpenGraph %s' % prop          escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs) -        return unescapeHTML(escaped) +        if not escaped is None: +            return unescapeHTML(escaped) +        return None      def _og_search_thumbnail(self, html, **kargs):          return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs) diff --git a/youtube_dl/extractor/depositfiles.py b/youtube_dl/extractor/depositfiles.py index d43348955..2c9fb5f2e 100644 --- a/youtube_dl/extractor/depositfiles.py +++ b/youtube_dl/extractor/depositfiles.py @@ -25,7 +25,7 @@ class DepositFilesIE(InfoExtractor):          url = 'http://depositfiles.com/en/files/' + file_id          # Retrieve file webpage with 'Free download' button pressed -        free_download_indication = { 'gateway_result' : '1' } +        free_download_indication = {'gateway_result' : '1'}          request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))          try:              self.report_download_webpage(file_id) diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py new file mode 100644 index 000000000..4ba323148 --- /dev/null +++ b/youtube_dl/extractor/eitb.py @@ -0,0 +1,37 @@ +# encoding: utf-8 +import re + +from .common import InfoExtractor +from .brightcove import BrightcoveIE +from ..utils import ExtractorError + + +class EitbIE(InfoExtractor): +    IE_NAME = u'eitb.tv' +    _VALID_URL = r'https?://www\.eitb\.tv/(eu/bideoa|es/video)/[^/]+/(?P<playlist_id>\d+)/(?P<chapter_id>\d+)' + +    _TEST = { +        u'add_ie': ['Brightcove'], +        u'url': u'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/2677100210001/2743577154001/lasa-y-zabala-30-anos/', +        u'md5': u'edf4436247185adee3ea18ce64c47998', +        u'info_dict': { +            u'id': u'2743577154001', +            u'ext': u'mp4', +            u'title': u'60 minutos (Lasa y Zabala, 30 años)', +            # All videos from eitb has this description in the brightcove info +            u'description': u'.', +            u'uploader': u'Euskal Telebista', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        chapter_id = mobj.group('chapter_id') +        webpage = self._download_webpage(url, chapter_id) +        bc_url = BrightcoveIE._extract_brightcove_url(webpage) +        if bc_url is None: +            raise ExtractorError(u'Could not extract the Brightcove url') +        # The BrightcoveExperience object doesn't contain the video id, we set +        # it manually +        bc_url += '&%40videoPlayer={0}'.format(chapter_id) +        return self.url_result(bc_url, BrightcoveIE.ie_key()) diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py index c74556579..a51d79b08 100644 --- a/youtube_dl/extractor/exfm.py +++ b/youtube_dl/extractor/exfm.py @@ -21,6 +21,7 @@ class ExfmIE(InfoExtractor):                  u'description': u'Test House \"Love Is Not Enough\" (Extended Mix) DeadJournalist Exclusive',              },              u'note': u'Soundcloud song', +            u'skip': u'The site is down too often',          },          {              u'url': u'http://ex.fm/song/wddt8', @@ -30,6 +31,7 @@ class ExfmIE(InfoExtractor):                  u'title': u'Safe and Sound',                  u'uploader': u'Capital Cities',              }, +            u'skip': u'The site is down too often',          },      ] diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py new file mode 100644 index 000000000..1c20e4364 --- /dev/null +++ b/youtube_dl/extractor/extremetube.py @@ -0,0 +1,50 @@ +import os +import re + +from .common import InfoExtractor +from ..utils import ( +    compat_urllib_parse_urlparse, +    compat_urllib_request, +    compat_urllib_parse, +) + +class ExtremeTubeIE(InfoExtractor): +    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>extremetube\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)' +    _TEST = { +        u'url': u'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', +        u'file': u'652431.mp4', +        u'md5': u'1fb9228f5e3332ec8c057d6ac36f33e0', +        u'info_dict': { +            u"title": u"Music Video 14 british euro brit european cumshots swallow", +            u"uploader": u"unknown", +            u"age_limit": 18, +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('videoid') +        url = 'http://www.' + mobj.group('url') + +        req = compat_urllib_request.Request(url) +        req.add_header('Cookie', 'age_verified=1') +        webpage = self._download_webpage(req, video_id) + +        video_title = self._html_search_regex(r'<h1 [^>]*?title="([^"]+)"[^>]*>\1<', webpage, u'title') +        uploader = self._html_search_regex(r'>Posted by:(?=<)(?:\s|<[^>]*>)*(.+?)\|', webpage, u'uploader', fatal=False) +        video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&', webpage, u'video_url')) +        path = compat_urllib_parse_urlparse(video_url).path +        extension = os.path.splitext(path)[1][1:] +        format = path.split('/')[5].split('_')[:2] +        format = "-".join(format) + +        return { +            'id': video_id, +            'title': video_title, +            'uploader': uploader, +            'url': video_url, +            'ext': extension, +            'format': format, +            'format_id': format, +            'age_limit': 18, +        } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 2c8fcf5ae..c7552fddb 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -33,6 +33,7 @@ class GenericIE(InfoExtractor):          },          # embedded vimeo video          { +            u'add_ie': ['Vimeo'],              u'url': u'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references',              u'file': u'22444065.mp4',              u'md5': u'2903896e23df39722c33f015af0666e2', @@ -44,6 +45,7 @@ class GenericIE(InfoExtractor):          },          # bandcamp page with custom domain          { +            u'add_ie': ['Bandcamp'],              u'url': u'http://bronyrock.com/track/the-pony-mash',              u'file': u'3235767654.mp3',              u'info_dict': { @@ -52,6 +54,23 @@ class GenericIE(InfoExtractor):              },              u'skip': u'There is a limit of 200 free downloads / month for the test song',          }, +        # embedded brightcove video +        # it also tests brightcove videos that need to set the 'Referer' in the +        # http requests +        { +            u'add_ie': ['Brightcove'], +            u'url': u'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', +            u'info_dict': { +                u'id': u'2765128793001', +                u'ext': u'mp4', +                u'title': u'Le cours de bourse : l’analyse technique', +                u'description': u'md5:7e9ad046e968cb2d1114004aba466fd9', +                u'uploader': u'BFM BUSINESS', +            }, +            u'params': { +                u'skip_download': True, +            }, +        },      ]      def report_download_webpage(self, video_id): @@ -144,10 +163,9 @@ class GenericIE(InfoExtractor):          self.report_extraction(video_id)          # Look for BrightCove: -        m_brightcove = re.search(r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL) -        if m_brightcove is not None: +        bc_url = BrightcoveIE._extract_brightcove_url(webpage) +        if bc_url is not None:              self.to_screen(u'Brightcove video detected.') -            bc_url = BrightcoveIE._build_brighcove_url(m_brightcove.group())              return self.url_result(bc_url, 'Brightcove')          # Look for embedded Vimeo player @@ -160,9 +178,9 @@ class GenericIE(InfoExtractor):          # Look for embedded YouTube player          mobj = re.search( -            r'<iframe[^>]+?src="(https?://(?:www\.)?youtube.com/embed/.+?)"', webpage) +            r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?youtube.com/embed/.+?)\1', webpage)          if mobj: -            surl = unescapeHTML(mobj.group(1)) +            surl = unescapeHTML(mobj.group(u'url'))              return self.url_result(surl, 'Youtube')          # Look for Bandcamp pages with custom domain diff --git a/youtube_dl/extractor/hypem.py b/youtube_dl/extractor/hypem.py index ab2b59103..9bd06e7c7 100644 --- a/youtube_dl/extractor/hypem.py +++ b/youtube_dl/extractor/hypem.py @@ -30,7 +30,7 @@ class HypemIE(InfoExtractor):              raise ExtractorError(u'Invalid URL: %s' % url)          track_id = mobj.group(1) -        data = { 'ax': 1, 'ts': time.time() } +        data = {'ax': 1, 'ts': time.time()}          data_encoded = compat_urllib_parse.urlencode(data)          complete_url = url + "?" + data_encoded          request = compat_urllib_request.Request(complete_url) @@ -68,4 +68,4 @@ class HypemIE(InfoExtractor):              'ext':      "mp3",              'title':    title,              'artist':   artist, -        }]
\ No newline at end of file +        }] diff --git a/youtube_dl/extractor/kankan.py b/youtube_dl/extractor/kankan.py index 445d46501..50916f4a6 100644 --- a/youtube_dl/extractor/kankan.py +++ b/youtube_dl/extractor/kankan.py @@ -1,8 +1,10 @@  import re +import hashlib  from .common import InfoExtractor  from ..utils import determine_ext +_md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()  class KankanIE(InfoExtractor):      _VALID_URL = r'https?://(?:.*?\.)?kankan\.com/.+?/(?P<id>\d+)\.shtml' @@ -30,7 +32,10 @@ class KankanIE(InfoExtractor):                                                   video_id, u'Downloading video url info')          ip = self._search_regex(r'ip:"(.+?)"', video_info_page, u'video url ip')          path = self._search_regex(r'path:"(.+?)"', video_info_page, u'video url path') -        video_url = 'http://%s%s' % (ip, path) +        param1 = self._search_regex(r'param1:(\d+)', video_info_page, u'param1') +        param2 = self._search_regex(r'param2:(\d+)', video_info_page, u'param2') +        key = _md5('xl_mp43651' + param1 + param2) +        video_url = 'http://%s%s?key=%s&key1=%s' % (ip, path, key, param2)          return {'id': video_id,                  'title': title, diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py index 5e05900da..29658a7d6 100644 --- a/youtube_dl/extractor/keezmovies.py +++ b/youtube_dl/extractor/keezmovies.py @@ -12,7 +12,7 @@ from ..aes import (  )  class KeezMoviesIE(InfoExtractor): -    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>keezmovies\.com/video/.+?(?P<videoid>[0-9]+))' +    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>keezmovies\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)'      _TEST = {          u'url': u'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711',          u'file': u'1214711.mp4', @@ -43,10 +43,10 @@ class KeezMoviesIE(InfoExtractor):          if webpage.find('encrypted=true')!=-1:              password = self._html_search_regex(r'video_title=(.+?)&', webpage, u'password')              video_url = aes_decrypt_text(video_url, password, 32).decode('utf-8') -        path = compat_urllib_parse_urlparse( video_url ).path -        extension = os.path.splitext( path )[1][1:] +        path = compat_urllib_parse_urlparse(video_url).path +        extension = os.path.splitext(path)[1][1:]          format = path.split('/')[4].split('_')[:2] -        format = "-".join( format ) +        format = "-".join(format)          age_limit = self._rta_search(webpage) diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py new file mode 100644 index 000000000..b9430b09b --- /dev/null +++ b/youtube_dl/extractor/mofosex.py @@ -0,0 +1,49 @@ +import os +import re + +from .common import InfoExtractor +from ..utils import ( +    compat_urllib_parse_urlparse, +    compat_urllib_request, +    compat_urllib_parse, +) + +class MofosexIE(InfoExtractor): +    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>mofosex\.com/videos/(?P<videoid>[0-9]+)/.*?\.html)' +    _TEST = { +        u'url': u'http://www.mofosex.com/videos/5018/japanese-teen-music-video.html', +        u'file': u'5018.mp4', +        u'md5': u'1b2eb47ac33cc75d4a80e3026b613c5a', +        u'info_dict': { +            u"title": u"Japanese Teen Music Video", +            u"age_limit": 18, +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('videoid') +        url = 'http://www.' + mobj.group('url') + +        req = compat_urllib_request.Request(url) +        req.add_header('Cookie', 'age_verified=1') +        webpage = self._download_webpage(req, video_id) + +        video_title = self._html_search_regex(r'<h1>(.+?)<', webpage, u'title') +        video_url = compat_urllib_parse.unquote(self._html_search_regex(r'flashvars.video_url = \'([^\']+)', webpage, u'video_url')) +        path = compat_urllib_parse_urlparse(video_url).path +        extension = os.path.splitext(path)[1][1:] +        format = path.split('/')[5].split('_')[:2] +        format = "-".join(format) + +        age_limit = self._rta_search(webpage) + +        return { +            'id': video_id, +            'title': video_title, +            'url': video_url, +            'ext': extension, +            'format': format, +            'format_id': format, +            'age_limit': age_limit, +        } diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index e96d3952c..24a79ae13 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -26,6 +26,7 @@ class MTVIE(InfoExtractor):              },          },          { +            u'add_ie': ['Vevo'],              u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml',              u'file': u'USCJY1331283.mp4',              u'md5': u'73b4e7fcadd88929292fe52c3ced8caf', diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 5e2454f1b..75cf4bb9f 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -47,10 +47,10 @@ class PornHubIE(InfoExtractor):          formats = []          for video_url in video_urls: -            path = compat_urllib_parse_urlparse( video_url ).path -            extension = os.path.splitext( path )[1][1:] +            path = compat_urllib_parse_urlparse(video_url).path +            extension = os.path.splitext(path)[1][1:]              format = path.split('/')[5].split('_')[:2] -            format = "-".join( format ) +            format = "-".join(format)              formats.append({                  'url': video_url,                  'ext': extension, diff --git a/youtube_dl/extractor/slashdot.py b/youtube_dl/extractor/slashdot.py index 2cba53076..f5003c7f9 100644 --- a/youtube_dl/extractor/slashdot.py +++ b/youtube_dl/extractor/slashdot.py @@ -7,6 +7,7 @@ class SlashdotIE(InfoExtractor):      _VALID_URL = r'https?://tv.slashdot.org/video/\?embed=(?P<id>.*?)(&|$)'      _TEST = { +        u'add_ie': ['Ooyala'],          u'url': u'http://tv.slashdot.org/video/?embed=JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz',          u'file': u'JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz.mp4',          u'md5': u'd2222e7a4a4c1541b3e0cf732fb26735', diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 29cd5617c..4717fbb77 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -29,17 +29,34 @@ class SoundcloudIE(InfoExtractor):                      )                      '''      IE_NAME = u'soundcloud' -    _TEST = { -        u'url': u'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', -        u'file': u'62986583.mp3', -        u'md5': u'ebef0a451b909710ed1d7787dddbf0d7', -        u'info_dict': { -            u"upload_date": u"20121011",  -            u"description": u"No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd",  -            u"uploader": u"E.T. ExTerrestrial Music",  -            u"title": u"Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1" -        } -    } +    _TESTS = [ +        { +            u'url': u'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', +            u'file': u'62986583.mp3', +            u'md5': u'ebef0a451b909710ed1d7787dddbf0d7', +            u'info_dict': { +                u"upload_date": u"20121011",  +                u"description": u"No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd",  +                u"uploader": u"E.T. ExTerrestrial Music",  +                u"title": u"Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1" +            } +        }, +        # not streamable song +        { +            u'url': u'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep', +            u'info_dict': { +                u'id': u'47127627', +                u'ext': u'mp3', +                u'title': u'Goldrushed', +                u'uploader': u'The Royal Concept', +                u'upload_date': u'20120521', +            }, +            u'params': { +                # rtmp +                u'skip_download': True, +            }, +        }, +    ]      _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28' @@ -56,16 +73,16 @@ class SoundcloudIE(InfoExtractor):          return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID      def _extract_info_dict(self, info, full_title=None, quiet=False): -        video_id = info['id'] -        name = full_title or video_id +        track_id = compat_str(info['id']) +        name = full_title or track_id          if quiet == False:              self.report_extraction(name)          thumbnail = info['artwork_url']          if thumbnail is not None:              thumbnail = thumbnail.replace('-large', '-t500x500') -        return { -            'id':       info['id'], +        result = { +            'id':       track_id,              'url':      info['stream_url'] + '?client_id=' + self._CLIENT_ID,              'uploader': info['user']['username'],              'upload_date': unified_strdate(info['created_at']), @@ -74,6 +91,21 @@ class SoundcloudIE(InfoExtractor):              'description': info['description'],              'thumbnail': thumbnail,          } +        if info.get('downloadable', False): +            result['url'] = 'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(track_id, self._CLIENT_ID) +        if not info.get('streamable', False): +            # We have to get the rtmp url +            stream_json = self._download_webpage( +                'http://api.soundcloud.com/i1/tracks/{0}/streams?client_id={1}'.format(track_id, self._CLIENT_ID), +                track_id, u'Downloading track url') +            rtmp_url = json.loads(stream_json)['rtmp_mp3_128_url'] +            # The url doesn't have an rtmp app, we have to extract the playpath +            url, path = rtmp_url.split('mp3:', 1) +            result.update({ +                'url': url, +                'play_path': 'mp3:' + path, +            }) +        return result      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) @@ -106,70 +138,8 @@ class SoundcloudIE(InfoExtractor):  class SoundcloudSetIE(SoundcloudIE):      _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$'      IE_NAME = u'soundcloud:set' -    _TEST = { -        u"url":"https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep", -        u"playlist": [ -            { -                u"file":"30510138.mp3", -                u"md5":"f9136bf103901728f29e419d2c70f55d", -                u"info_dict": { -                    u"upload_date": u"20111213", -                    u"description": u"The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", -                    u"uploader": u"The Royal Concept", -                    u"title": u"D-D-Dance" -                } -            }, -            { -                u"file":"47127625.mp3", -                u"md5":"09b6758a018470570f8fd423c9453dd8", -                u"info_dict": { -                    u"upload_date": u"20120521", -                    u"description": u"The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", -                    u"uploader": u"The Royal Concept", -                    u"title": u"The Royal Concept - Gimme Twice" -                } -            }, -            { -                u"file":"47127627.mp3", -                u"md5":"154abd4e418cea19c3b901f1e1306d9c", -                u"info_dict": { -                    u"upload_date": u"20120521", -                    u"uploader": u"The Royal Concept", -                    u"title": u"Goldrushed" -                } -            }, -            { -                u"file":"47127629.mp3", -                u"md5":"2f5471edc79ad3f33a683153e96a79c1", -                u"info_dict": { -                    u"upload_date": u"20120521", -                    u"description": u"The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", -                    u"uploader": u"The Royal Concept", -                    u"title": u"In the End" -                } -            }, -            { -                u"file":"47127631.mp3", -                u"md5":"f9ba87aa940af7213f98949254f1c6e2", -                u"info_dict": { -                    u"upload_date": u"20120521", -                    u"description": u"The Royal Concept from Stockholm\r\nFilip / David / Povel / Magnus\r\nwww.theroyalconceptband.com", -                    u"uploader": u"The Royal Concept", -                    u"title": u"Knocked Up" -                } -            }, -            { -                u"file":"75206121.mp3", -                u"md5":"f9d1fe9406717e302980c30de4af9353", -                u"info_dict": { -                    u"upload_date": u"20130116", -                    u"description": u"The unreleased track World on Fire premiered on the CW's hit show Arrow (8pm/7pm central).  \r\nAs a gift to our fans we would like to offer you a free download of the track!  ", -                    u"uploader": u"The Royal Concept", -                    u"title": u"World On Fire" -                } -            } -        ] -    } +    # it's in tests/test_playlists.py +    _TESTS = []      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) @@ -208,7 +178,7 @@ class SoundcloudUserIE(SoundcloudIE):      IE_NAME = u'soundcloud:user'      # it's in tests/test_playlists.py -    _TEST = None +    _TESTS = []      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/space.py b/youtube_dl/extractor/space.py new file mode 100644 index 000000000..0d32a0688 --- /dev/null +++ b/youtube_dl/extractor/space.py @@ -0,0 +1,35 @@ +import re + +from .common import InfoExtractor +from .brightcove import BrightcoveIE +from ..utils import RegexNotFoundError, ExtractorError + + +class SpaceIE(InfoExtractor): +    _VALID_URL = r'https?://www\.space\.com/\d+-(?P<title>[^/\.\?]*?)-video.html' +    _TEST = { +        u'add_ie': ['Brightcove'], +        u'url': u'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html', +        u'info_dict': { +            u'id': u'2780937028001', +            u'ext': u'mp4', +            u'title': u'Huge Martian Landforms\' Detail Revealed By European Probe | Video', +            u'description': u'md5:db81cf7f3122f95ed234b631a6ea1e61', +            u'uploader': u'TechMedia Networks', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        title = mobj.group('title') +        webpage = self._download_webpage(url, title) +        try: +            # Some videos require the playerKey field, which isn't define in +            # the BrightcoveExperience object +            brightcove_url = self._og_search_video_url(webpage) +        except RegexNotFoundError: +            # Other videos works fine with the info from the object +            brightcove_url = BrightcoveIE._extract_brightcove_url(webpage) +        if brightcove_url is None: +            raise ExtractorError(u'The webpage does not contain a video', expected=True) +        return self.url_result(brightcove_url, BrightcoveIE.ie_key()) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 32df0a7fb..97f9c268a 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -49,10 +49,10 @@ class SpankwireIE(InfoExtractor):          formats = []          for video_url in video_urls: -            path = compat_urllib_parse_urlparse( video_url ).path -            extension = os.path.splitext( path )[1][1:] +            path = compat_urllib_parse_urlparse(video_url).path +            extension = os.path.splitext(path)[1][1:]              format = path.split('/')[4].split('_')[:2] -            format = "-".join( format ) +            format = "-".join(format)              formats.append({                  'url': video_url,                  'ext': extension, diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index c910110ca..bc48620f0 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -1,4 +1,5 @@  import re +import xml.etree.ElementTree  from .common import InfoExtractor  from ..utils import ( @@ -11,7 +12,7 @@ class TeamcocoIE(InfoExtractor):      _TEST = {          u'url': u'http://teamcoco.com/video/louis-ck-interview-george-w-bush',          u'file': u'19705.mp4', -        u'md5': u'27b6f7527da5acf534b15f21b032656e', +        u'md5': u'cde9ba0fa3506f5f017ce11ead928f9a',          u'info_dict': {              u"description": u"Louis C.K. got starstruck by George W. Bush, so what? Part one.",               u"title": u"Louis C.K. Interview Pt. 1 11/3/11" @@ -31,16 +32,40 @@ class TeamcocoIE(InfoExtractor):          self.report_extraction(video_id)          data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id -        data = self._download_webpage(data_url, video_id, 'Downloading data webpage') +        data_xml = self._download_webpage(data_url, video_id, 'Downloading data webpage') +        data = xml.etree.ElementTree.fromstring(data_xml.encode('utf-8')) -        video_url = self._html_search_regex(r'<file [^>]*type="high".*?>(.*?)</file>', -            data, u'video URL') -        return [{ +        qualities = ['500k', '480p', '1000k', '720p', '1080p'] +        formats = [] +        for file in data.findall('files/file'): +            if file.attrib.get('playmode') == 'all': +                # it just duplicates one of the entries +                break +            file_url = file.text +            m_format = re.search(r'(\d+(k|p))\.mp4', file_url) +            if m_format is not None: +                format_id = m_format.group(1) +            else: +                format_id = file.attrib['bitrate'] +            formats.append({ +                'url': file_url, +                'ext': 'mp4', +                'format_id': format_id, +            }) +        def sort_key(f): +            try: +                return qualities.index(f['format_id']) +            except ValueError: +                return -1 +        formats.sort(key=sort_key) +        if not formats: +            raise RegexNotFoundError(u'Unable to extract video URL') + +        return {              'id':          video_id, -            'url':         video_url, -            'ext':         'mp4', +            'formats': formats,              'title':       self._og_search_title(webpage),              'thumbnail':   self._og_search_thumbnail(webpage),              'description': self._og_search_description(webpage), -        }] +        } diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index aea9d9a24..d4b7603c7 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -46,10 +46,10 @@ class Tube8IE(InfoExtractor):          if webpage.find('"encrypted":true')!=-1:              password = self._html_search_regex(r'"video_title":"([^"]+)', webpage, u'password')              video_url = aes_decrypt_text(video_url, password, 32).decode('utf-8') -        path = compat_urllib_parse_urlparse( video_url ).path -        extension = os.path.splitext( path )[1][1:] +        path = compat_urllib_parse_urlparse(video_url).path +        extension = os.path.splitext(path)[1][1:]          format = path.split('/')[4].split('_')[:2] -        format = "-".join( format ) +        format = "-".join(format)          return {              'id': video_id, diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py index 12c84a985..826804af3 100644 --- a/youtube_dl/extractor/viddler.py +++ b/youtube_dl/extractor/viddler.py @@ -8,7 +8,7 @@ from ..utils import (  class ViddlerIE(InfoExtractor): -    _VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler.com)/(?:v|embed|player)/(?P<id>[0-9]+)' +    _VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler.com)/(?:v|embed|player)/(?P<id>[a-z0-9]+)'      _TEST = {          u"url": u"http://www.viddler.com/v/43903784",          u'file': u'43903784.mp4', diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index c7d864a2b..d465bf20b 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -20,7 +20,7 @@ class VimeoIE(InfoExtractor):      """Information extractor for vimeo.com."""      # _VALID_URL matches Vimeo URLs -    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)/?(?:[?].*)?(?:#.*)?$' +    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|(?P<player>player))\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)/?(?:[?].*)?(?:#.*)?$'      _NETRC_MACHINE = 'vimeo'      IE_NAME = u'vimeo'      _TESTS = [ @@ -128,11 +128,9 @@ class VimeoIE(InfoExtractor):              raise ExtractorError(u'Invalid URL: %s' % url)          video_id = mobj.group('id') -        if not mobj.group('proto'): -            url = 'https://' + url -        elif mobj.group('pro'): +        if mobj.group('pro') or mobj.group('player'):              url = 'http://player.vimeo.com/video/' + video_id -        elif mobj.group('direct_link'): +        else:              url = 'https://vimeo.com/' + video_id          # Retrieve video webpage to extract further information @@ -205,7 +203,7 @@ class VimeoIE(InfoExtractor):          # Vimeo specific: extract video codec and quality information          # First consider quality, then codecs, then take everything          codecs = [('vp6', 'flv'), ('vp8', 'flv'), ('h264', 'mp4')] -        files = { 'hd': [], 'sd': [], 'other': []} +        files = {'hd': [], 'sd': [], 'other': []}          config_files = config["video"].get("files") or config["request"].get("files")          for codec_name, codec_extension in codecs:              for quality in config_files.get(codec_name, []): @@ -234,7 +232,7 @@ class VimeoIE(InfoExtractor):          if len(formats) == 0:              raise ExtractorError(u'No known codec found') -        return [{ +        return {              'id':       video_id,              'uploader': video_uploader,              'uploader_id': video_uploader_id, @@ -243,7 +241,8 @@ class VimeoIE(InfoExtractor):              'thumbnail':    video_thumbnail,              'description':  video_description,              'formats': formats, -        }] +            'webpage_url': url, +        }  class VimeoChannelIE(InfoExtractor): diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 0757495bd..fa784ab99 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -13,6 +13,7 @@ class WeiboIE(InfoExtractor):      _VALID_URL = r'https?://video\.weibo\.com/v/weishipin/t_(?P<id>.+?)\.htm'      _TEST = { +        u'add_ie': ['Sina'],          u'url': u'http://video.weibo.com/v/weishipin/t_zjUw2kZ.htm',          u'file': u'98322879.flv',          u'info_dict': { diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py index 8a0eb1afd..1177a4b14 100644 --- a/youtube_dl/extractor/xnxx.py +++ b/youtube_dl/extractor/xnxx.py @@ -9,7 +9,7 @@ from ..utils import (  class XNXXIE(InfoExtractor): -    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)' +    _VALID_URL = r'^(?:https?://)?(?:video|www)\.xnxx\.com/video([0-9]+)/(.*)'      VIDEO_URL_RE = r'flv_url=(.*?)&'      VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'      VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&' diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py new file mode 100644 index 000000000..03ad88bed --- /dev/null +++ b/youtube_dl/extractor/xtube.py @@ -0,0 +1,55 @@ +import os +import re + +from .common import InfoExtractor +from ..utils import ( +    compat_urllib_parse_urlparse, +    compat_urllib_request, +    compat_urllib_parse, +) + +class XTubeIE(InfoExtractor): +    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))' +    _TEST = { +        u'url': u'http://www.xtube.com/watch.php?v=kVTUy_G222_', +        u'file': u'kVTUy_G222_.mp4', +        u'md5': u'092fbdd3cbe292c920ef6fc6a8a9cdab', +        u'info_dict': { +            u"title": u"strange erotica", +            u"description": u"surreal gay themed erotica...almost an ET kind of thing", +            u"uploader": u"greenshowers", +            u"age_limit": 18, +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('videoid') +        url = 'http://www.' + mobj.group('url') + +        req = compat_urllib_request.Request(url) +        req.add_header('Cookie', 'age_verified=1') +        webpage = self._download_webpage(req, video_id) + +        video_title = self._html_search_regex(r'<div class="p_5px[^>]*>([^<]+)', webpage, u'title') +        video_uploader = self._html_search_regex(r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, u'uploader', fatal=False) +        video_description = self._html_search_regex(r'<p class="video_description">([^<]+)', webpage, u'description', default=None) +        video_url= self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, u'video_url').replace('\\/', '/') +        path = compat_urllib_parse_urlparse(video_url).path +        extension = os.path.splitext(path)[1][1:] +        format = path.split('/')[5].split('_')[:2] +        format[0] += 'p' +        format[1] += 'k' +        format = "-".join(format) + +        return { +            'id': video_id, +            'title': video_title, +            'uploader': video_uploader, +            'description': video_description, +            'url': video_url, +            'ext': extension, +            'format': format, +            'format_id': format, +            'age_limit': 18, +        } diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 464b498f5..34e6afb20 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -132,7 +132,7 @@ class YahooSearchIE(SearchInfoExtractor):                  mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)                  e = self.url_result('http://' + mobj.group('url'), 'Yahoo')                  res['entries'].append(e) -            if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )): +            if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1)):                  break          return res diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 9d88c17f5..a8fd40c83 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -18,7 +18,7 @@ class YoukuIE(InfoExtractor):          u"url": u"http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html",          u"file": u"XNDgyMDQ2NTQw_part00.flv",          u"md5": u"ffe3f2e435663dc2d1eea34faeff5b5b", -        u"params": { u"test": False }, +        u"params": {u"test": False},          u"info_dict": {              u"title": u"youtube-dl test video \"'/\\ä↭𝕐"          } @@ -37,8 +37,8 @@ class YoukuIE(InfoExtractor):          source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")          seed = float(seed)          for i in range(len(source)): -            seed  =  (seed * 211 + 30031 ) % 65536 -            index  =  math.floor(seed / 65536 * len(source) ) +            seed  =  (seed * 211 + 30031) % 65536 +            index  =  math.floor(seed / 65536 * len(source))              mixed.append(source[int(index)])              source.remove(source[int(index)])          #return ''.join(mixed) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index e46a9b4d6..bd0f2cae0 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -81,14 +81,14 @@ class YouPornIE(InfoExtractor):              # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0              # A path looks like this:              # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4 -            video_url = unescapeHTML( link ) -            path = compat_urllib_parse_urlparse( video_url ).path -            extension = os.path.splitext( path )[1][1:] +            video_url = unescapeHTML(link) +            path = compat_urllib_parse_urlparse(video_url).path +            extension = os.path.splitext(path)[1][1:]              format = path.split('/')[4].split('_')[:2]              # size = format[0]              # bitrate = format[1] -            format = "-".join( format ) +            format = "-".join(format)              # title = u'%s-%s-%s' % (video_title, size, bitrate)              formats.append({ diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9053f3ead..c992cba97 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -340,18 +340,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):              }          },          { -            u"url":  u"http://www.youtube.com/watch?v=1ltcDfZMA3U", -            u"file":  u"1ltcDfZMA3U.mp4", -            u"note": u"Test VEVO video (#897)", -            u"info_dict": { -                u"upload_date": u"20070518", -                u"title": u"Maps - It Will Find You", -                u"description": u"Music video by Maps performing It Will Find You.", -                u"uploader": u"MuteUSA", -                u"uploader_id": u"MuteUSA" -            } -        }, -        {              u"url":  u"http://www.youtube.com/watch?v=UxxajLWwzqY",              u"file":  u"UxxajLWwzqY.mp4",              u"note": u"Test generic use_cipher_signature video (#897)", @@ -1497,7 +1485,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                  'subtitles':    video_subtitles,                  'duration':     video_duration,                  'age_limit':    18 if age_gate else 0, -                'annotations':  video_annotations +                'annotations':  video_annotations, +                'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,              })          return results @@ -1583,7 +1572,6 @@ class YoutubePlaylistIE(InfoExtractor):  class YoutubeChannelIE(InfoExtractor):      IE_DESC = u'YouTube.com channels'      _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)" -    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'      _MORE_PAGES_INDICATOR = 'yt-uix-load-more'      _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'      IE_NAME = u'youtube:channel' @@ -1604,30 +1592,20 @@ class YoutubeChannelIE(InfoExtractor):          # Download channel page          channel_id = mobj.group(1)          video_ids = [] -        pagenum = 1 - -        url = self._TEMPLATE_URL % (channel_id, pagenum) -        page = self._download_webpage(url, channel_id, -                                      u'Downloading page #%s' % pagenum) -        # Extract video identifiers -        ids_in_page = self.extract_videos_from_page(page) -        video_ids.extend(ids_in_page) +        # Download all channel pages using the json-based channel_ajax query +        for pagenum in itertools.count(1): +            url = self._MORE_PAGES_URL % (pagenum, channel_id) +            page = self._download_webpage(url, channel_id, +                                          u'Downloading page #%s' % pagenum) -        # Download any subsequent channel pages using the json-based channel_ajax query -        if self._MORE_PAGES_INDICATOR in page: -            for pagenum in itertools.count(1): -                url = self._MORE_PAGES_URL % (pagenum, channel_id) -                page = self._download_webpage(url, channel_id, -                                              u'Downloading page #%s' % pagenum) +            page = json.loads(page) -                page = json.loads(page) - -                ids_in_page = self.extract_videos_from_page(page['content_html']) -                video_ids.extend(ids_in_page) +            ids_in_page = self.extract_videos_from_page(page['content_html']) +            video_ids.extend(ids_in_page) -                if self._MORE_PAGES_INDICATOR  not in page['load_more_widget_html']: -                    break +            if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']: +                break          self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) @@ -1743,6 +1721,10 @@ class YoutubeSearchIE(SearchInfoExtractor):          videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]          return self.playlist_result(videos, query) +class YoutubeSearchDateIE(YoutubeSearchIE): +    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published' +    _SEARCH_KEY = 'ytsearchdate' +    IE_DESC = u'YouTube.com searches, newest videos first'  class YoutubeShowIE(InfoExtractor):      IE_DESC = u'YouTube.com (multi-season) shows' diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 75a46a2d5..84bf0f35c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.02' +__version__ = '2013.11.07' | 
