diff options
| -rw-r--r-- | test/test_utils.py | 32 | ||||
| -rwxr-xr-x | youtube_dl/YoutubeDL.py | 5 | ||||
| -rw-r--r-- | youtube_dl/downloader/f4m.py | 4 | ||||
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/adultswim.py | 44 | ||||
| -rw-r--r-- | youtube_dl/extractor/breakcom.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/gazeta.py | 38 | ||||
| -rw-r--r-- | youtube_dl/extractor/generic.py | 19 | ||||
| -rw-r--r-- | youtube_dl/extractor/pladform.py | 90 | ||||
| -rw-r--r-- | youtube_dl/extractor/teamcoco.py | 6 | ||||
| -rw-r--r-- | youtube_dl/extractor/twitch.py | 7 | ||||
| -rw-r--r-- | youtube_dl/extractor/vidme.py | 6 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 25 | 
13 files changed, 246 insertions, 34 deletions
| diff --git a/test/test_utils.py b/test/test_utils.py index 64fad58ad..28bda654e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -38,6 +38,7 @@ from youtube_dl.utils import (      parse_iso8601,      read_batch_urls,      sanitize_filename, +    sanitize_path,      shell_quote,      smuggle_url,      str_to_int, @@ -131,6 +132,37 @@ class TestUtil(unittest.TestCase):          self.assertEqual(sanitize_filename('_BD_eEpuzXw', is_id=True), '_BD_eEpuzXw')          self.assertEqual(sanitize_filename('N0Y__7-UOdI', is_id=True), 'N0Y__7-UOdI') +    def test_sanitize_path(self): +        if sys.platform != 'win32': +            return + +        self.assertEqual(sanitize_path('abc'), 'abc') +        self.assertEqual(sanitize_path('abc/def'), 'abc\\def') +        self.assertEqual(sanitize_path('abc\\def'), 'abc\\def') +        self.assertEqual(sanitize_path('abc|def'), 'abc#def') +        self.assertEqual(sanitize_path('<>:"|?*'), '#######') +        self.assertEqual(sanitize_path('C:/abc/def'), 'C:\\abc\\def') +        self.assertEqual(sanitize_path('C?:/abc/def'), 'C##\\abc\\def') + +        self.assertEqual(sanitize_path('\\\\?\\UNC\\ComputerName\\abc'), '\\\\?\\UNC\\ComputerName\\abc') +        self.assertEqual(sanitize_path('\\\\?\\UNC/ComputerName/abc'), '\\\\?\\UNC\\ComputerName\\abc') + +        self.assertEqual(sanitize_path('\\\\?\\C:\\abc'), '\\\\?\\C:\\abc') +        self.assertEqual(sanitize_path('\\\\?\\C:/abc'), '\\\\?\\C:\\abc') +        self.assertEqual(sanitize_path('\\\\?\\C:\\ab?c\\de:f'), '\\\\?\\C:\\ab#c\\de#f') +        self.assertEqual(sanitize_path('\\\\?\\C:\\abc'), '\\\\?\\C:\\abc') + +        self.assertEqual( +            sanitize_path('youtube/%(uploader)s/%(autonumber)s-%(title)s-%(upload_date)s.%(ext)s'), +            'youtube\\%(uploader)s\\%(autonumber)s-%(title)s-%(upload_date)s.%(ext)s') + +        self.assertEqual( +            sanitize_path('youtube/TheWreckingYard ./00001-Not bad, Especially for Free! (1987 Yamaha 700)-20141116.mp4.part'), +            'youtube\\TheWreckingYard #\\00001-Not bad, Especially for Free! (1987 Yamaha 700)-20141116.mp4.part') +        self.assertEqual(sanitize_path('abc/def...'), 'abc\\def..#') +        self.assertEqual(sanitize_path('abc.../def'), 'abc..#\\def') +        self.assertEqual(sanitize_path('abc.../def...'), 'abc..#\\def..#') +      def test_ordered_set(self):          self.assertEqual(orderedSet([1, 1, 2, 3, 4, 4, 5, 6, 7, 3, 5]), [1, 2, 3, 4, 5, 6, 7])          self.assertEqual(orderedSet([]), []) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index df2aebb59..bce7587fd 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -61,6 +61,7 @@ from .utils import (      render_table,      SameFileError,      sanitize_filename, +    sanitize_path,      std_headers,      subtitles_filename,      takewhile_inclusive, @@ -562,7 +563,7 @@ class YoutubeDL(object):                                   if v is not None)              template_dict = collections.defaultdict(lambda: 'NA', template_dict) -            outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) +            outtmpl = sanitize_path(self.params.get('outtmpl', DEFAULT_OUTTMPL))              tmpl = compat_expanduser(outtmpl)              filename = tmpl % template_dict              # Temporary fix for #4787 @@ -1261,7 +1262,7 @@ class YoutubeDL(object):              return          try: -            dn = os.path.dirname(encodeFilename(filename)) +            dn = os.path.dirname(sanitize_path(encodeFilename(filename)))              if dn and not os.path.exists(dn):                  os.makedirs(dn)          except (OSError, IOError) as err: diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 3dc796faa..4ab000d67 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -281,7 +281,7 @@ class F4mFD(FileDownloader):              boot_info = self._get_bootstrap_from_url(bootstrap_url)          else:              bootstrap_url = None -            bootstrap = base64.b64decode(node.text) +            bootstrap = base64.b64decode(node.text.encode('ascii'))              boot_info = read_bootstrap_info(bootstrap)          return (boot_info, bootstrap_url) @@ -308,7 +308,7 @@ class F4mFD(FileDownloader):          live = boot_info['live']          metadata_node = media.find(_add_ns('metadata'))          if metadata_node is not None: -            metadata = base64.b64decode(metadata_node.text) +            metadata = base64.b64decode(metadata_node.text.encode('ascii'))          else:              metadata = None diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index b489d5770..14172ca56 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -175,6 +175,7 @@ from .gameone import (  from .gamespot import GameSpotIE  from .gamestar import GameStarIE  from .gametrailers import GametrailersIE +from .gazeta import GazetaIE  from .gdcvault import GDCVaultIE  from .generic import GenericIE  from .giantbomb import GiantBombIE @@ -363,6 +364,7 @@ from .pbs import PBSIE  from .phoenix import PhoenixIE  from .photobucket import PhotobucketIE  from .planetaplay import PlanetaPlayIE +from .pladform import PladformIE  from .played import PlayedIE  from .playfm import PlayFMIE  from .playvid import PlayvidIE diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 34b8b0115..39335b827 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -2,13 +2,12 @@  from __future__ import unicode_literals  import re -import json  from .common import InfoExtractor  from ..utils import (      ExtractorError, -    xpath_text,      float_or_none, +    xpath_text,  ) @@ -60,6 +59,24 @@ class AdultSwimIE(InfoExtractor):              'title': 'American Dad - Putting Francine Out of Business',              'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].'          }, +    }, { +        'url': 'http://www.adultswim.com/videos/tim-and-eric-awesome-show-great-job/dr-steve-brule-for-your-wine/', +        'playlist': [ +            { +                'md5': '3e346a2ab0087d687a05e1e7f3b3e529', +                'info_dict': { +                    'id': 'sY3cMUR_TbuE4YmdjzbIcQ-0', +                    'ext': 'flv', +                    'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', +                    'description': 'Dr. Brule reports live from Wine Country with a special report on wines.  \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', +                }, +            } +        ], +        'info_dict': { +            'id': 'sY3cMUR_TbuE4YmdjzbIcQ', +            'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', +            'description': 'Dr. Brule reports live from Wine Country with a special report on wines.  \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', +        },      }]      @staticmethod @@ -80,6 +97,7 @@ class AdultSwimIE(InfoExtractor):              for video in collection.get('videos'):                  if video.get('slug') == slug:                      return collection, video +        return None, None      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) @@ -90,28 +108,30 @@ class AdultSwimIE(InfoExtractor):          webpage = self._download_webpage(url, episode_path)          # Extract the value of `bootstrappedData` from the Javascript in the page. -        bootstrappedDataJS = self._search_regex(r'var bootstrappedData = ({.*});', webpage, episode_path) - -        try: -            bootstrappedData = json.loads(bootstrappedDataJS) -        except ValueError as ve: -            errmsg = '%s: Failed to parse JSON ' % episode_path -            raise ExtractorError(errmsg, cause=ve) +        bootstrapped_data = self._parse_json(self._search_regex( +            r'var bootstrappedData = ({.*});', webpage, 'bootstraped data'), episode_path)          # Downloading videos from a /videos/playlist/ URL needs to be handled differently.          # NOTE: We are only downloading one video (the current one) not the playlist          if is_playlist: -            collections = bootstrappedData['playlists']['collections'] +            collections = bootstrapped_data['playlists']['collections']              collection = self.find_collection_by_linkURL(collections, show_path)              video_info = self.find_video_info(collection, episode_path)              show_title = video_info['showTitle']              segment_ids = [video_info['videoPlaybackID']]          else: -            collections = bootstrappedData['show']['collections'] +            collections = bootstrapped_data['show']['collections']              collection, video_info = self.find_collection_containing_video(collections, episode_path) -            show = bootstrappedData['show'] +            # Video wasn't found in the collections, let's try `slugged_video`. +            if video_info is None: +                if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path: +                    video_info = bootstrapped_data['slugged_video'] +                else: +                    raise ExtractorError('Unable to find video info') + +            show = bootstrapped_data['show']              show_title = show['title']              segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']] diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 4bcc897c9..809287d14 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -41,7 +41,7 @@ class BreakIE(InfoExtractor):              'tbr': media['bitRate'],              'width': media['width'],              'height': media['height'], -        } for media in info['media']] +        } for media in info['media'] if media.get('mediaPurpose') == 'play']          if not formats:              formats.append({ diff --git a/youtube_dl/extractor/gazeta.py b/youtube_dl/extractor/gazeta.py new file mode 100644 index 000000000..ea32b621c --- /dev/null +++ b/youtube_dl/extractor/gazeta.py @@ -0,0 +1,38 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class GazetaIE(InfoExtractor): +    _VALID_URL = r'(?P<url>https?://(?:www\.)?gazeta\.ru/(?:[^/]+/)?video/(?:(?:main|\d{4}/\d{2}/\d{2})/)?(?P<id>[A-Za-z0-9-_.]+)\.s?html)' +    _TESTS = [{ +        'url': 'http://www.gazeta.ru/video/main/zadaite_vopros_vladislavu_yurevichu.shtml', +        'md5': 'd49c9bdc6e5a7888f27475dc215ee789', +        'info_dict': { +            'id': '205566', +            'ext': 'mp4', +            'title': '«70–80 процентов гражданских в Донецке на грани голода»', +            'description': 'md5:38617526050bd17b234728e7f9620a71', +            'thumbnail': 're:^https?://.*\.jpg', +        }, +    }, { +        'url': 'http://www.gazeta.ru/lifestyle/video/2015/03/08/master-klass_krasivoi_byt._delaem_vesennii_makiyazh.shtml', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) + +        display_id = mobj.group('id') +        embed_url = '%s?p=embed' % mobj.group('url') +        embed_page = self._download_webpage( +            embed_url, display_id, 'Downloading embed page') + +        video_id = self._search_regex( +            r'<div[^>]*?class="eagleplayer"[^>]*?data-id="([^"]+)"', embed_page, 'video id') + +        return self.url_result( +            'eagleplatform:gazeta.media.eagleplatform.com:%s' % video_id, 'EaglePlatform') diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 013198b0d..4e6927b08 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -596,6 +596,19 @@ class GenericIE(InfoExtractor):                  'view_count': int,              },          }, +        # Pladform embed +        { +            'url': 'http://muz-tv.ru/kinozal/view/7400/', +            'info_dict': { +                'id': '100183293', +                'ext': 'mp4', +                'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть', +                'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века', +                'thumbnail': 're:^https?://.*\.jpg$', +                'duration': 694, +                'age_limit': 0, +            }, +        },          # RSS feed with enclosure          {              'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', @@ -1193,6 +1206,12 @@ class GenericIE(InfoExtractor):          if mobj is not None:              return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform') +        # Look for Pladform embeds +        mobj = re.search( +            r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage) +        if mobj is not None: +            return self.url_result(mobj.group('url'), 'Pladform') +          def check_video(vurl):              if YoutubeIE.suitable(vurl):                  return True diff --git a/youtube_dl/extractor/pladform.py b/youtube_dl/extractor/pladform.py new file mode 100644 index 000000000..abde34b94 --- /dev/null +++ b/youtube_dl/extractor/pladform.py @@ -0,0 +1,90 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    int_or_none, +    xpath_text, +    qualities, +) + + +class PladformIE(InfoExtractor): +    _VALID_URL = r'''(?x) +                    https?:// +                        (?: +                            (?: +                                out\.pladform\.ru/player| +                                static\.pladform\.ru/player\.swf +                            ) +                            \?.*\bvideoid=| +                            video\.pladform\.ru/catalog/video/videoid/ +                        ) +                        (?P<id>\d+) +                    ''' +    _TESTS = [{ +        # http://muz-tv.ru/kinozal/view/7400/ +        'url': 'http://out.pladform.ru/player?pl=24822&videoid=100183293', +        'md5': '61f37b575dd27f1bb2e1854777fe31f4', +        'info_dict': { +            'id': '100183293', +            'ext': 'mp4', +            'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть', +            'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века', +            'thumbnail': 're:^https?://.*\.jpg$', +            'duration': 694, +            'age_limit': 0, +        }, +    }, { +        'url': 'http://static.pladform.ru/player.swf?pl=21469&videoid=100183293&vkcid=0', +        'only_matching': True, +    }, { +        'url': 'http://video.pladform.ru/catalog/video/videoid/100183293/vkcid/0', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        video = self._download_xml( +            'http://out.pladform.ru/getVideo?pl=1&videoid=%s' % video_id, +            video_id) + +        if video.tag == 'error': +            raise ExtractorError( +                '%s returned error: %s' % (self.IE_NAME, video.text), +                expected=True) + +        quality = qualities(('ld', 'sd', 'hd')) + +        formats = [{ +            'url': src.text, +            'format_id': src.get('quality'), +            'quality': quality(src.get('quality')), +        } for src in video.findall('./src')] +        self._sort_formats(formats) + +        webpage = self._download_webpage( +            'http://video.pladform.ru/catalog/video/videoid/%s' % video_id, +            video_id) + +        title = self._og_search_title(webpage, fatal=False) or xpath_text( +            video, './/title', 'title', fatal=True) +        description = self._search_regex( +            r'</h3>\s*<p>([^<]+)</p>', webpage, 'description', fatal=False) +        thumbnail = self._og_search_thumbnail(webpage) or xpath_text( +            video, './/cover', 'cover') + +        duration = int_or_none(xpath_text(video, './/time', 'duration')) +        age_limit = int_or_none(xpath_text(video, './/age18', 'age limit')) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'duration': duration, +            'age_limit': age_limit, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 5793dbc10..7cb06f351 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -53,10 +53,10 @@ class TeamcocoIE(InfoExtractor):          embed = self._download_webpage(              embed_url, video_id, 'Downloading embed page') -        encoded_data = self._search_regex( -            r'"preload"\s*:\s*"([^"]+)"', embed, 'encoded data') +        player_data = self._parse_json(self._search_regex( +            r'Y\.Ginger\.Module\.Player\((\{.*?\})\);', embed, 'player data'), video_id)          data = self._parse_json( -            base64.b64decode(encoded_data.encode('ascii')).decode('utf-8'), video_id) +            base64.b64decode(player_data['preload'].encode('ascii')).decode('utf-8'), video_id)          formats = []          get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p']) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index b058891bd..cbdaf9c7a 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -358,13 +358,12 @@ class TwitchStreamIE(TwitchBaseIE):              'p': random.randint(1000000, 10000000),              'player': 'twitchweb',              'segment_preference': '4', -            'sig': access_token['sig'], -            'token': access_token['token'], +            'sig': access_token['sig'].encode('utf-8'), +            'token': access_token['token'].encode('utf-8'),          } -          formats = self._extract_m3u8_formats(              '%s/api/channel/hls/%s.m3u8?%s' -            % (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query).encode('utf-8')), +            % (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query)),              channel_id, 'mp4')          self._prefer_source(formats) diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 339c3d897..bd953fb4c 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -41,13 +41,10 @@ class VidmeIE(InfoExtractor):          duration = float_or_none(self._html_search_regex(              r'data-duration="([^"]+)"', webpage, 'duration', fatal=False))          view_count = str_to_int(self._html_search_regex( -            r'<span class="video_views">\s*([\d,\.]+)\s*plays?', webpage, 'view count', fatal=False)) +            r'<(?:li|span) class="video_views">\s*([\d,\.]+)\s*plays?', webpage, 'view count', fatal=False))          like_count = str_to_int(self._html_search_regex(              r'class="score js-video-vote-score"[^>]+data-score="([\d,\.\s]+)">',              webpage, 'like count', fatal=False)) -        comment_count = str_to_int(self._html_search_regex( -            r'class="js-comment-count"[^>]+data-count="([\d,\.\s]+)">', -            webpage, 'comment count', fatal=False))          return {              'id': video_id, @@ -61,5 +58,4 @@ class VidmeIE(InfoExtractor):              'duration': duration,              'view_count': view_count,              'like_count': like_count, -            'comment_count': comment_count,          } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7426e2a1f..d5597d514 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -252,15 +252,12 @@ def sanitize_open(filename, open_mode):              raise          # In case of error, try to remove win32 forbidden chars -        alt_filename = os.path.join( -            re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part) -            for path_part in os.path.split(filename) -        ) +        alt_filename = sanitize_path(filename)          if alt_filename == filename:              raise          else:              # An exception here should be caught in the caller -            stream = open(encodeFilename(filename), open_mode) +            stream = open(encodeFilename(alt_filename), open_mode)              return (stream, alt_filename) @@ -311,6 +308,24 @@ def sanitize_filename(s, restricted=False, is_id=False):      return result +def sanitize_path(s): +    """Sanitizes and normalizes path on Windows""" +    if sys.platform != 'win32': +        return s +    drive, _ = os.path.splitdrive(s) +    unc, _ = os.path.splitunc(s) +    unc_or_drive = unc or drive +    norm_path = os.path.normpath(remove_start(s, unc_or_drive)).split(os.path.sep) +    if unc_or_drive: +        norm_path.pop(0) +    sanitized_path = [ +        re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part) +        for path_part in norm_path] +    if unc_or_drive: +        sanitized_path.insert(0, unc_or_drive + os.path.sep) +    return os.path.join(*sanitized_path) + +  def orderedSet(iterable):      """ Remove all duplicates from the input iterable """      res = [] | 
