diff options
Diffstat (limited to 'youtube_dl/extractor')
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 10 | ||||
| -rw-r--r-- | youtube_dl/extractor/adultswim.py | 44 | ||||
| -rw-r--r-- | youtube_dl/extractor/breakcom.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/common.py | 13 | ||||
| -rw-r--r-- | youtube_dl/extractor/dailymotion.py | 10 | ||||
| -rw-r--r-- | youtube_dl/extractor/eagleplatform.py | 98 | ||||
| -rw-r--r-- | youtube_dl/extractor/funnyordie.py | 9 | ||||
| -rw-r--r-- | youtube_dl/extractor/gazeta.py | 38 | ||||
| -rw-r--r-- | youtube_dl/extractor/generic.py | 57 | ||||
| -rw-r--r-- | youtube_dl/extractor/niconico.py | 5 | ||||
| -rw-r--r-- | youtube_dl/extractor/npo.py | 4 | ||||
| -rw-r--r-- | youtube_dl/extractor/orf.py | 94 | ||||
| -rw-r--r-- | youtube_dl/extractor/pladform.py | 90 | ||||
| -rw-r--r-- | youtube_dl/extractor/ssa.py | 58 | ||||
| -rw-r--r-- | youtube_dl/extractor/teamcoco.py | 6 | ||||
| -rw-r--r-- | youtube_dl/extractor/twitch.py | 7 | ||||
| -rw-r--r-- | youtube_dl/extractor/vidme.py | 6 | ||||
| -rw-r--r-- | youtube_dl/extractor/vimeo.py | 38 | ||||
| -rw-r--r-- | youtube_dl/extractor/yandexmusic.py | 127 | ||||
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 2 | 
20 files changed, 654 insertions, 64 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5ca534cdf..7adcc4dbf 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -116,6 +116,7 @@ from .defense import DefenseGouvFrIE  from .discovery import DiscoveryIE  from .divxstage import DivxStageIE  from .dropbox import DropboxIE +from .eagleplatform import EaglePlatformIE  from .ebaumsworld import EbaumsWorldIE  from .echomsk import EchoMskIE  from .ehow import EHowIE @@ -174,6 +175,7 @@ from .gameone import (  from .gamespot import GameSpotIE  from .gamestar import GameStarIE  from .gametrailers import GametrailersIE +from .gazeta import GazetaIE  from .gdcvault import GDCVaultIE  from .generic import GenericIE  from .giantbomb import GiantBombIE @@ -354,6 +356,7 @@ from .orf import (      ORFTVthekIE,      ORFOE1IE,      ORFFM4IE, +    ORFIPTVIE,  )  from .parliamentliveuk import ParliamentLiveUKIE  from .patreon import PatreonIE @@ -361,6 +364,7 @@ from .pbs import PBSIE  from .phoenix import PhoenixIE  from .photobucket import PhotobucketIE  from .planetaplay import PlanetaPlayIE +from .pladform import PladformIE  from .played import PlayedIE  from .playfm import PlayFMIE  from .playvid import PlayvidIE @@ -456,6 +460,7 @@ from .sport5 import Sport5IE  from .sportbox import SportBoxIE  from .sportdeutschland import SportDeutschlandIE  from .srmediathek import SRMediathekIE +from .ssa import SSAIE  from .stanfordoc import StanfordOpenClassroomIE  from .steam import SteamIE  from .streamcloud import StreamcloudIE @@ -607,6 +612,11 @@ from .yahoo import (      YahooSearchIE,  )  from .yam import YamIE +from .yandexmusic import ( +    YandexMusicTrackIE, +    YandexMusicAlbumIE, +    YandexMusicPlaylistIE, +)  from .yesjapan import YesJapanIE  from .ynet import YnetIE  from .youjizz import YouJizzIE diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 34b8b0115..39335b827 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -2,13 +2,12 @@  from __future__ import unicode_literals  import re -import json  from .common import InfoExtractor  from ..utils import (      ExtractorError, -    xpath_text,      float_or_none, +    xpath_text,  ) @@ -60,6 +59,24 @@ class AdultSwimIE(InfoExtractor):              'title': 'American Dad - Putting Francine Out of Business',              'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].'          }, +    }, { +        'url': 'http://www.adultswim.com/videos/tim-and-eric-awesome-show-great-job/dr-steve-brule-for-your-wine/', +        'playlist': [ +            { +                'md5': '3e346a2ab0087d687a05e1e7f3b3e529', +                'info_dict': { +                    'id': 'sY3cMUR_TbuE4YmdjzbIcQ-0', +                    'ext': 'flv', +                    'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', +                    'description': 'Dr. Brule reports live from Wine Country with a special report on wines.  \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', +                }, +            } +        ], +        'info_dict': { +            'id': 'sY3cMUR_TbuE4YmdjzbIcQ', +            'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', +            'description': 'Dr. Brule reports live from Wine Country with a special report on wines.  \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', +        },      }]      @staticmethod @@ -80,6 +97,7 @@ class AdultSwimIE(InfoExtractor):              for video in collection.get('videos'):                  if video.get('slug') == slug:                      return collection, video +        return None, None      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) @@ -90,28 +108,30 @@ class AdultSwimIE(InfoExtractor):          webpage = self._download_webpage(url, episode_path)          # Extract the value of `bootstrappedData` from the Javascript in the page. -        bootstrappedDataJS = self._search_regex(r'var bootstrappedData = ({.*});', webpage, episode_path) - -        try: -            bootstrappedData = json.loads(bootstrappedDataJS) -        except ValueError as ve: -            errmsg = '%s: Failed to parse JSON ' % episode_path -            raise ExtractorError(errmsg, cause=ve) +        bootstrapped_data = self._parse_json(self._search_regex( +            r'var bootstrappedData = ({.*});', webpage, 'bootstraped data'), episode_path)          # Downloading videos from a /videos/playlist/ URL needs to be handled differently.          # NOTE: We are only downloading one video (the current one) not the playlist          if is_playlist: -            collections = bootstrappedData['playlists']['collections'] +            collections = bootstrapped_data['playlists']['collections']              collection = self.find_collection_by_linkURL(collections, show_path)              video_info = self.find_video_info(collection, episode_path)              show_title = video_info['showTitle']              segment_ids = [video_info['videoPlaybackID']]          else: -            collections = bootstrappedData['show']['collections'] +            collections = bootstrapped_data['show']['collections']              collection, video_info = self.find_collection_containing_video(collections, episode_path) -            show = bootstrappedData['show'] +            # Video wasn't found in the collections, let's try `slugged_video`. +            if video_info is None: +                if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path: +                    video_info = bootstrapped_data['slugged_video'] +                else: +                    raise ExtractorError('Unable to find video info') + +            show = bootstrapped_data['show']              show_title = show['title']              segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']] diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 4bcc897c9..809287d14 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -41,7 +41,7 @@ class BreakIE(InfoExtractor):              'tbr': media['bitRate'],              'width': media['width'],              'height': media['height'], -        } for media in info['media']] +        } for media in info['media'] if media.get('mediaPurpose') == 'play']          if not formats:              formats.append({ diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 8ff76342f..f9e8e2bad 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -839,7 +839,7 @@ class InfoExtractor(object):                                m3u8_id=None):          formats = [{ -            'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-meta'])), +            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),              'url': m3u8_url,              'ext': ext,              'protocol': 'm3u8', @@ -883,12 +883,13 @@ class InfoExtractor(object):                      formats.append({'url': format_url(line)})                      continue                  tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000) -                format_id = last_media.get('NAME') -                if not format_id: -                    format_id = '-'.join(filter(None, [ -                        m3u8_id, 'm3u8-%d' % (tbr if tbr else len(formats))])) +                format_id = [] +                if m3u8_id: +                    format_id.append(m3u8_id) +                last_media_name = last_media.get('NAME') if last_media else None +                format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))                  f = { -                    'format_id': format_id, +                    'format_id': '-'.join(format_id),                      'url': format_url(line.strip()),                      'tbr': tbr,                      'ext': ext, diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 42b20a46d..4f67c3aac 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -46,13 +46,13 @@ class DailymotionIE(DailymotionBaseInfoExtractor):      _TESTS = [          { -            'url': 'http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech', -            'md5': '392c4b85a60a90dc4792da41ce3144eb', +            'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', +            'md5': '2137c41a8e78554bb09225b8eb322406',              'info_dict': { -                'id': 'x33vw9', +                'id': 'x2iuewm',                  'ext': 'mp4', -                'uploader': 'Amphora Alex and Van .', -                'title': 'Tutoriel de Youtubeur"DL DES VIDEO DE YOUTUBE"', +                'uploader': 'IGN', +                'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News',              }          },          # Vevo video diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py new file mode 100644 index 000000000..7173371ee --- /dev/null +++ b/youtube_dl/extractor/eagleplatform.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    int_or_none, +) + + +class EaglePlatformIE(InfoExtractor): +    _VALID_URL = r'''(?x) +                    (?: +                        eagleplatform:(?P<custom_host>[^/]+):| +                        https?://(?P<host>.+?\.media\.eagleplatform\.com)/index/player\?.*\brecord_id= +                    ) +                    (?P<id>\d+) +                ''' +    _TESTS = [{ +        # http://lenta.ru/news/2015/03/06/navalny/ +        'url': 'http://lentaru.media.eagleplatform.com/index/player?player=new&record_id=227304&player_template_id=5201', +        'md5': '0b7994faa2bd5c0f69a3db6db28d078d', +        'info_dict': { +            'id': '227304', +            'ext': 'mp4', +            'title': 'Навальный вышел на свободу', +            'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5', +            'thumbnail': 're:^https?://.*\.jpg$', +            'duration': 87, +            'view_count': int, +            'age_limit': 0, +        }, +    }, { +        # http://muz-tv.ru/play/7129/ +        # http://media.clipyou.ru/index/player?record_id=12820&width=730&height=415&autoplay=true +        'url': 'eagleplatform:media.clipyou.ru:12820', +        'md5': '6c2ebeab03b739597ce8d86339d5a905', +        'info_dict': { +            'id': '12820', +            'ext': 'mp4', +            'title': "'O Sole Mio", +            'thumbnail': 're:^https?://.*\.jpg$', +            'duration': 216, +            'view_count': int, +        }, +    }] + +    def _handle_error(self, response): +        status = int_or_none(response.get('status', 200)) +        if status != 200: +            raise ExtractorError(' '.join(response['errors']), expected=True) + +    def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata'): +        response = super(EaglePlatformIE, self)._download_json(url_or_request, video_id, note) +        self._handle_error(response) +        return response + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        host, video_id = mobj.group('custom_host') or mobj.group('host'), mobj.group('id') + +        player_data = self._download_json( +            'http://%s/api/player_data?id=%s' % (host, video_id), video_id) + +        media = player_data['data']['playlist']['viewports'][0]['medialist'][0] + +        title = media['title'] +        description = media.get('description') +        thumbnail = media.get('snapshot') +        duration = int_or_none(media.get('duration')) +        view_count = int_or_none(media.get('views')) + +        age_restriction = media.get('age_restriction') +        age_limit = None +        if age_restriction: +            age_limit = 0 if age_restriction == 'allow_all' else 18 + +        m3u8_data = self._download_json( +            media['sources']['secure_m3u8']['auto'], +            video_id, 'Downloading m3u8 JSON') + +        formats = self._extract_m3u8_formats( +            m3u8_data['data'][0], video_id, +            'mp4', entry_protocol='m3u8_native') +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'duration': duration, +            'view_count': view_count, +            'age_limit': age_limit, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index a49fc1151..dd87257c4 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -50,7 +50,6 @@ class FunnyOrDieIE(InfoExtractor):          bitrates.sort()          formats = [] -          for bitrate in bitrates:              for link in links:                  formats.append({ @@ -59,6 +58,13 @@ class FunnyOrDieIE(InfoExtractor):                      'vbr': bitrate,                  }) +        subtitles = {} +        for src, src_lang in re.findall(r'<track kind="captions" src="([^"]+)" srclang="([^"]+)"', webpage): +            subtitles[src_lang] = [{ +                'ext': src.split('/')[-1], +                'url': 'http://www.funnyordie.com%s' % src, +            }] +          post_json = self._search_regex(              r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details')          post = json.loads(post_json) @@ -69,4 +75,5 @@ class FunnyOrDieIE(InfoExtractor):              'description': post.get('description'),              'thumbnail': post.get('picture'),              'formats': formats, +            'subtitles': subtitles,          } diff --git a/youtube_dl/extractor/gazeta.py b/youtube_dl/extractor/gazeta.py new file mode 100644 index 000000000..ea32b621c --- /dev/null +++ b/youtube_dl/extractor/gazeta.py @@ -0,0 +1,38 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class GazetaIE(InfoExtractor): +    _VALID_URL = r'(?P<url>https?://(?:www\.)?gazeta\.ru/(?:[^/]+/)?video/(?:(?:main|\d{4}/\d{2}/\d{2})/)?(?P<id>[A-Za-z0-9-_.]+)\.s?html)' +    _TESTS = [{ +        'url': 'http://www.gazeta.ru/video/main/zadaite_vopros_vladislavu_yurevichu.shtml', +        'md5': 'd49c9bdc6e5a7888f27475dc215ee789', +        'info_dict': { +            'id': '205566', +            'ext': 'mp4', +            'title': '«70–80 процентов гражданских в Донецке на грани голода»', +            'description': 'md5:38617526050bd17b234728e7f9620a71', +            'thumbnail': 're:^https?://.*\.jpg', +        }, +    }, { +        'url': 'http://www.gazeta.ru/lifestyle/video/2015/03/08/master-klass_krasivoi_byt._delaem_vesennii_makiyazh.shtml', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) + +        display_id = mobj.group('id') +        embed_url = '%s?p=embed' % mobj.group('url') +        embed_page = self._download_webpage( +            embed_url, display_id, 'Downloading embed page') + +        video_id = self._search_regex( +            r'<div[^>]*?class="eagleplayer"[^>]*?data-id="([^"]+)"', embed_page, 'video id') + +        return self.url_result( +            'eagleplatform:gazeta.media.eagleplatform.com:%s' % video_id, 'EaglePlatform') diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 5dc53685c..4e6927b08 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -570,6 +570,45 @@ class GenericIE(InfoExtractor):                  'title': 'John Carlson Postgame 2/25/15',              },          }, +        # Eagle.Platform embed (generic URL) +        { +            'url': 'http://lenta.ru/news/2015/03/06/navalny/', +            'info_dict': { +                'id': '227304', +                'ext': 'mp4', +                'title': 'Навальный вышел на свободу', +                'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5', +                'thumbnail': 're:^https?://.*\.jpg$', +                'duration': 87, +                'view_count': int, +                'age_limit': 0, +            }, +        }, +        # ClipYou (Eagle.Platform) embed (custom URL) +        { +            'url': 'http://muz-tv.ru/play/7129/', +            'info_dict': { +                'id': '12820', +                'ext': 'mp4', +                'title': "'O Sole Mio", +                'thumbnail': 're:^https?://.*\.jpg$', +                'duration': 216, +                'view_count': int, +            }, +        }, +        # Pladform embed +        { +            'url': 'http://muz-tv.ru/kinozal/view/7400/', +            'info_dict': { +                'id': '100183293', +                'ext': 'mp4', +                'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть', +                'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века', +                'thumbnail': 're:^https?://.*\.jpg$', +                'duration': 694, +                'age_limit': 0, +            }, +        },          # RSS feed with enclosure          {              'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', @@ -1155,6 +1194,24 @@ class GenericIE(InfoExtractor):          if mobj is not None:              return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura') +        # Look for Eagle.Platform embeds +        mobj = re.search( +            r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage) +        if mobj is not None: +            return self.url_result(mobj.group('url'), 'EaglePlatform') + +        # Look for ClipYou (uses Eagle.Platform) embeds +        mobj = re.search( +            r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage) +        if mobj is not None: +            return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform') + +        # Look for Pladform embeds +        mobj = re.search( +            r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage) +        if mobj is not None: +            return self.url_result(mobj.group('url'), 'Pladform') +          def check_video(vurl):              if YoutubeIE.suitable(vurl):                  return True diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 4c1890416..7fb4e57df 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -41,7 +41,7 @@ class NiconicoIE(InfoExtractor):          },      } -    _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/((?:[a-z]{2})?[0-9]+)' +    _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)'      _NETRC_MACHINE = 'niconico'      # Determine whether the downloader used authentication to download video      _AUTHENTICATED = False @@ -76,8 +76,7 @@ class NiconicoIE(InfoExtractor):          return True      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group(1) +        video_id = self._match_id(url)          # Get video webpage. We are not actually interested in it, but need          # the cookies in order to be able to download the info webpage diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 9c01eb0af..557dffa46 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -219,7 +219,8 @@ class NPOLiveIE(NPOBaseIE):          if streams:              for stream in streams:                  stream_type = stream.get('type').lower() -                if stream_type == 'ss': +                # smooth streaming is not supported +                if stream_type in ['ss', 'ms']:                      continue                  stream_info = self._download_json(                      'http://ida.omroep.nl/aapi/?stream=%s&token=%s&type=jsonp' @@ -242,6 +243,7 @@ class NPOLiveIE(NPOBaseIE):                  else:                      formats.append({                          'url': stream_url, +                        'preference': -10,                      })          self._sort_formats(formats) diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 4e293392b..ca1a5bb3c 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -11,6 +11,11 @@ from ..utils import (      HEADRequest,      unified_strdate,      ExtractorError, +    strip_jsonp, +    int_or_none, +    float_or_none, +    determine_ext, +    remove_end,  ) @@ -197,3 +202,92 @@ class ORFFM4IE(InfoExtractor):              'description': data['subtitle'],              'entries': entries          } + + +class ORFIPTVIE(InfoExtractor): +    IE_NAME = 'orf:iptv' +    IE_DESC = 'iptv.ORF.at' +    _VALID_URL = r'http://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)' + +    _TEST = { +        'url': 'http://iptv.orf.at/stories/2267952', +        'md5': '26ffa4bab6dbce1eee78bbc7021016cd', +        'info_dict': { +            'id': '339775', +            'ext': 'flv', +            'title': 'Kreml-Kritiker Nawalny wieder frei', +            'description': 'md5:6f24e7f546d364dacd0e616a9e409236', +            'duration': 84.729, +            'thumbnail': 're:^https?://.*\.jpg$', +            'upload_date': '20150306', +        }, +    } + +    def _real_extract(self, url): +        story_id = self._match_id(url) + +        webpage = self._download_webpage( +            'http://iptv.orf.at/stories/%s' % story_id, story_id) + +        video_id = self._search_regex( +            r'data-video(?:id)?="(\d+)"', webpage, 'video id') + +        data = self._download_json( +            'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id, +            video_id)[0] + +        duration = float_or_none(data['duration'], 1000) + +        video = data['sources']['default'] +        load_balancer_url = video['loadBalancerUrl'] +        abr = int_or_none(video.get('audioBitrate')) +        vbr = int_or_none(video.get('bitrate')) +        fps = int_or_none(video.get('videoFps')) +        width = int_or_none(video.get('videoWidth')) +        height = int_or_none(video.get('videoHeight')) +        thumbnail = video.get('preview') + +        rendition = self._download_json( +            load_balancer_url, video_id, transform_source=strip_jsonp) + +        f = { +            'abr': abr, +            'vbr': vbr, +            'fps': fps, +            'width': width, +            'height': height, +        } + +        formats = [] +        for format_id, format_url in rendition['redirect'].items(): +            if format_id == 'rtmp': +                ff = f.copy() +                ff.update({ +                    'url': format_url, +                    'format_id': format_id, +                }) +                formats.append(ff) +            elif determine_ext(format_url) == 'f4m': +                formats.extend(self._extract_f4m_formats( +                    format_url, video_id, f4m_id=format_id)) +            elif determine_ext(format_url) == 'm3u8': +                formats.extend(self._extract_m3u8_formats( +                    format_url, video_id, 'mp4', m3u8_id=format_id)) +            else: +                continue +        self._sort_formats(formats) + +        title = remove_end(self._og_search_title(webpage), ' - iptv.ORF.at') +        description = self._og_search_description(webpage) +        upload_date = unified_strdate(self._html_search_meta( +            'dc.date', webpage, 'upload date')) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'duration': duration, +            'thumbnail': thumbnail, +            'upload_date': upload_date, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/pladform.py b/youtube_dl/extractor/pladform.py new file mode 100644 index 000000000..abde34b94 --- /dev/null +++ b/youtube_dl/extractor/pladform.py @@ -0,0 +1,90 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    int_or_none, +    xpath_text, +    qualities, +) + + +class PladformIE(InfoExtractor): +    _VALID_URL = r'''(?x) +                    https?:// +                        (?: +                            (?: +                                out\.pladform\.ru/player| +                                static\.pladform\.ru/player\.swf +                            ) +                            \?.*\bvideoid=| +                            video\.pladform\.ru/catalog/video/videoid/ +                        ) +                        (?P<id>\d+) +                    ''' +    _TESTS = [{ +        # http://muz-tv.ru/kinozal/view/7400/ +        'url': 'http://out.pladform.ru/player?pl=24822&videoid=100183293', +        'md5': '61f37b575dd27f1bb2e1854777fe31f4', +        'info_dict': { +            'id': '100183293', +            'ext': 'mp4', +            'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть', +            'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века', +            'thumbnail': 're:^https?://.*\.jpg$', +            'duration': 694, +            'age_limit': 0, +        }, +    }, { +        'url': 'http://static.pladform.ru/player.swf?pl=21469&videoid=100183293&vkcid=0', +        'only_matching': True, +    }, { +        'url': 'http://video.pladform.ru/catalog/video/videoid/100183293/vkcid/0', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        video = self._download_xml( +            'http://out.pladform.ru/getVideo?pl=1&videoid=%s' % video_id, +            video_id) + +        if video.tag == 'error': +            raise ExtractorError( +                '%s returned error: %s' % (self.IE_NAME, video.text), +                expected=True) + +        quality = qualities(('ld', 'sd', 'hd')) + +        formats = [{ +            'url': src.text, +            'format_id': src.get('quality'), +            'quality': quality(src.get('quality')), +        } for src in video.findall('./src')] +        self._sort_formats(formats) + +        webpage = self._download_webpage( +            'http://video.pladform.ru/catalog/video/videoid/%s' % video_id, +            video_id) + +        title = self._og_search_title(webpage, fatal=False) or xpath_text( +            video, './/title', 'title', fatal=True) +        description = self._search_regex( +            r'</h3>\s*<p>([^<]+)</p>', webpage, 'description', fatal=False) +        thumbnail = self._og_search_thumbnail(webpage) or xpath_text( +            video, './/cover', 'cover') + +        duration = int_or_none(xpath_text(video, './/time', 'duration')) +        age_limit = int_or_none(xpath_text(video, './/age18', 'age limit')) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'duration': duration, +            'age_limit': age_limit, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/ssa.py b/youtube_dl/extractor/ssa.py new file mode 100644 index 000000000..13101c714 --- /dev/null +++ b/youtube_dl/extractor/ssa.py @@ -0,0 +1,58 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    unescapeHTML, +    parse_duration, +) + + +class SSAIE(InfoExtractor): +    _VALID_URL = r'http://ssa\.nls\.uk/film/(?P<id>\d+)' +    _TEST = { +        'url': 'http://ssa.nls.uk/film/3561', +        'info_dict': { +            'id': '3561', +            'ext': 'flv', +            'title': 'SHETLAND WOOL', +            'description': 'md5:c5afca6871ad59b4271e7704fe50ab04', +            'duration': 900, +            'thumbnail': 're:^https?://.*\.jpg$', +        }, +        'params': { +            # rtmp download +            'skip_download': True, +        }, +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        streamer = self._search_regex( +            r"'streamer'\s*,\S*'(rtmp[^']+)'", webpage, 'streamer') +        play_path = self._search_regex( +            r"'file'\s*,\s*'([^']+)'", webpage, 'file').rpartition('.')[0] + +        def search_field(field_name, fatal=False): +            return self._search_regex( +                r'<span\s+class="field_title">%s:</span>\s*<span\s+class="field_content">([^<]+)</span>' % field_name, +                webpage, 'title', fatal=fatal) + +        title = unescapeHTML(search_field('Title', fatal=True)).strip('()[]') +        description = unescapeHTML(search_field('Description')) +        duration = parse_duration(search_field('Running time')) +        thumbnail = self._search_regex( +            r"'image'\s*,\s*'([^']+)'", webpage, 'thumbnails', fatal=False) + +        return { +            'id': video_id, +            'url': streamer, +            'play_path': play_path, +            'ext': 'flv', +            'title': title, +            'description': description, +            'duration': duration, +            'thumbnail': thumbnail, +        } diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 5793dbc10..7cb06f351 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -53,10 +53,10 @@ class TeamcocoIE(InfoExtractor):          embed = self._download_webpage(              embed_url, video_id, 'Downloading embed page') -        encoded_data = self._search_regex( -            r'"preload"\s*:\s*"([^"]+)"', embed, 'encoded data') +        player_data = self._parse_json(self._search_regex( +            r'Y\.Ginger\.Module\.Player\((\{.*?\})\);', embed, 'player data'), video_id)          data = self._parse_json( -            base64.b64decode(encoded_data.encode('ascii')).decode('utf-8'), video_id) +            base64.b64decode(player_data['preload'].encode('ascii')).decode('utf-8'), video_id)          formats = []          get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p']) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index b058891bd..cbdaf9c7a 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -358,13 +358,12 @@ class TwitchStreamIE(TwitchBaseIE):              'p': random.randint(1000000, 10000000),              'player': 'twitchweb',              'segment_preference': '4', -            'sig': access_token['sig'], -            'token': access_token['token'], +            'sig': access_token['sig'].encode('utf-8'), +            'token': access_token['token'].encode('utf-8'),          } -          formats = self._extract_m3u8_formats(              '%s/api/channel/hls/%s.m3u8?%s' -            % (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query).encode('utf-8')), +            % (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query)),              channel_id, 'mp4')          self._prefer_source(formats) diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 339c3d897..bd953fb4c 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -41,13 +41,10 @@ class VidmeIE(InfoExtractor):          duration = float_or_none(self._html_search_regex(              r'data-duration="([^"]+)"', webpage, 'duration', fatal=False))          view_count = str_to_int(self._html_search_regex( -            r'<span class="video_views">\s*([\d,\.]+)\s*plays?', webpage, 'view count', fatal=False)) +            r'<(?:li|span) class="video_views">\s*([\d,\.]+)\s*plays?', webpage, 'view count', fatal=False))          like_count = str_to_int(self._html_search_regex(              r'class="score js-video-vote-score"[^>]+data-score="([\d,\.\s]+)">',              webpage, 'like count', fatal=False)) -        comment_count = str_to_int(self._html_search_regex( -            r'class="js-comment-count"[^>]+data-count="([\d,\.\s]+)">', -            webpage, 'comment count', fatal=False))          return {              'id': video_id, @@ -61,5 +58,4 @@ class VidmeIE(InfoExtractor):              'duration': duration,              'view_count': view_count,              'like_count': like_count, -            'comment_count': comment_count,          } diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 8f540f578..b84a83ba6 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals  import json  import re  import itertools -import hashlib  from .common import InfoExtractor  from ..compat import ( @@ -20,6 +19,7 @@ from ..utils import (      RegexNotFoundError,      smuggle_url,      std_headers, +    unified_strdate,      unsmuggle_url,      urlencode_postdata,  ) @@ -140,6 +140,7 @@ class VimeoIE(VimeoBaseInfoExtractor):                  'description': 'md5:8678b246399b070816b12313e8b4eb5c',                  'uploader_id': 'atencio',                  'uploader': 'Peter Atencio', +                'upload_date': '20130927',                  'duration': 187,              },          }, @@ -176,17 +177,15 @@ class VimeoIE(VimeoBaseInfoExtractor):          password = self._downloader.params.get('videopassword', None)          if password is None:              raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) -        token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token') -        data = compat_urllib_parse.urlencode({ +        token = self._search_regex(r'xsrft = \'(.*?)\'', webpage, 'login token') +        data = urlencode_postdata({              'password': password,              'token': token,          }) -        # I didn't manage to use the password with https -        if url.startswith('https'): -            pass_url = url.replace('https', 'http') -        else: -            pass_url = url -        password_request = compat_urllib_request.Request(pass_url + '/password', data) +        if url.startswith('http://'): +            # vimeo only supports https now, but the user can give an http url +            url = url.replace('http://', 'https://') +        password_request = compat_urllib_request.Request(url + '/password', data)          password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')          password_request.add_header('Cookie', 'xsrft=%s' % token)          return self._download_webpage( @@ -223,12 +222,7 @@ class VimeoIE(VimeoBaseInfoExtractor):          video_id = mobj.group('id')          orig_url = url          if mobj.group('pro') or mobj.group('player'): -            url = 'http://player.vimeo.com/video/' + video_id - -        password = self._downloader.params.get('videopassword', None) -        if password: -            headers['Cookie'] = '%s_password=%s' % ( -                video_id, hashlib.md5(password.encode('utf-8')).hexdigest()) +            url = 'https://player.vimeo.com/video/' + video_id          # Retrieve video webpage to extract further information          request = compat_urllib_request.Request(url, None, headers) @@ -323,9 +317,9 @@ class VimeoIE(VimeoBaseInfoExtractor):          # Extract upload date          video_upload_date = None -        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage) +        mobj = re.search(r'<time[^>]+datetime="([^"]+)"', webpage)          if mobj is not None: -            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3) +            video_upload_date = unified_strdate(mobj.group(1))          try:              view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count')) @@ -435,10 +429,10 @@ class VimeoChannelIE(InfoExtractor):              name="([^"]+)"\s+              value="([^"]*)"              ''', login_form)) -        token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token') +        token = self._search_regex(r'xsrft = \'(.*?)\'', webpage, 'login token')          fields['token'] = token          fields['password'] = password -        post = compat_urllib_parse.urlencode(fields) +        post = urlencode_postdata(fields)          password_path = self._search_regex(              r'action="([^"]+)"', login_form, 'password URL')          password_url = compat_urlparse.urljoin(page_url, password_path) @@ -500,10 +494,10 @@ class VimeoUserIE(VimeoChannelIE):  class VimeoAlbumIE(VimeoChannelIE):      IE_NAME = 'vimeo:album' -    _VALID_URL = r'https?://vimeo\.com/album/(?P<id>\d+)' +    _VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)'      _TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>'      _TESTS = [{ -        'url': 'http://vimeo.com/album/2632481', +        'url': 'https://vimeo.com/album/2632481',          'info_dict': {              'id': '2632481',              'title': 'Staff Favorites: November 2013', @@ -527,7 +521,7 @@ class VimeoAlbumIE(VimeoChannelIE):      def _real_extract(self, url):          album_id = self._match_id(url) -        return self._extract_videos(album_id, 'http://vimeo.com/album/%s' % album_id) +        return self._extract_videos(album_id, 'https://vimeo.com/album/%s' % album_id)  class VimeoGroupsIE(VimeoAlbumIE): diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py new file mode 100644 index 000000000..b47aecb15 --- /dev/null +++ b/youtube_dl/extractor/yandexmusic.py @@ -0,0 +1,127 @@ +# coding=utf-8 +from __future__ import unicode_literals + +import re +import hashlib + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( +    int_or_none, +    float_or_none, +) + + +class YandexMusicBaseIE(InfoExtractor): +    def _get_track_url(self, storage_dir, track_id): +        data = self._download_json( +            'http://music.yandex.ru/api/v1.5/handlers/api-jsonp.jsx?action=getTrackSrc&p=download-info/%s' +            % storage_dir, +            track_id, 'Downloading track location JSON') + +        key = hashlib.md5(('XGRlBW9FXlekgbPrRHuSiA' + data['path'][1:] + data['s']).encode('utf-8')).hexdigest() +        storage = storage_dir.split('.') + +        return ('http://%s/get-mp3/%s/%s?track-id=%s&from=service-10-track&similarities-experiment=default' +                % (data['host'], key, data['ts'] + data['path'], storage[1])) + +    def _get_track_info(self, track): +        return { +            'id': track['id'], +            'ext': 'mp3', +            'url': self._get_track_url(track['storageDir'], track['id']), +            'title': '%s - %s' % (track['artists'][0]['name'], track['title']), +            'filesize': int_or_none(track.get('fileSize')), +            'duration': float_or_none(track.get('durationMs'), 1000), +        } + + +class YandexMusicTrackIE(YandexMusicBaseIE): +    IE_NAME = 'yandexmusic:track' +    IE_DESC = 'Яндекс.Музыка - Трек' +    _VALID_URL = r'https?://music\.yandex\.ru/album/(?P<album_id>\d+)/track/(?P<id>\d+)' + +    _TEST = { +        'url': 'http://music.yandex.ru/album/540508/track/4878838', +        'md5': 'f496818aa2f60b6c0062980d2e00dc20', +        'info_dict': { +            'id': '4878838', +            'ext': 'mp3', +            'title': 'Carlo Ambrosio - Gypsy Eyes 1', +            'filesize': 4628061, +            'duration': 193.04, +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        album_id, track_id = mobj.group('album_id'), mobj.group('id') + +        track = self._download_json( +            'http://music.yandex.ru/handlers/track.jsx?track=%s:%s' % (track_id, album_id), +            track_id, 'Downloading track JSON')['track'] + +        return self._get_track_info(track) + + +class YandexMusicAlbumIE(YandexMusicBaseIE): +    IE_NAME = 'yandexmusic:album' +    IE_DESC = 'Яндекс.Музыка - Альбом' +    _VALID_URL = r'https?://music\.yandex\.ru/album/(?P<id>\d+)/?(\?|$)' + +    _TEST = { +        'url': 'http://music.yandex.ru/album/540508', +        'info_dict': { +            'id': '540508', +            'title': 'Carlo Ambrosio - Gypsy Soul (2009)', +        }, +        'playlist_count': 50, +    } + +    def _real_extract(self, url): +        album_id = self._match_id(url) + +        album = self._download_json( +            'http://music.yandex.ru/handlers/album.jsx?album=%s' % album_id, +            album_id, 'Downloading album JSON') + +        entries = [self._get_track_info(track) for track in album['volumes'][0]] + +        title = '%s - %s' % (album['artists'][0]['name'], album['title']) +        year = album.get('year') +        if year: +            title += ' (%s)' % year + +        return self.playlist_result(entries, compat_str(album['id']), title) + + +class YandexMusicPlaylistIE(YandexMusicBaseIE): +    IE_NAME = 'yandexmusic:playlist' +    IE_DESC = 'Яндекс.Музыка - Плейлист' +    _VALID_URL = r'https?://music\.yandex\.ru/users/[^/]+/playlists/(?P<id>\d+)' + +    _TEST = { +        'url': 'http://music.yandex.ru/users/music.partners/playlists/1245', +        'info_dict': { +            'id': '1245', +            'title': 'Что слушают Enter Shikari', +            'description': 'md5:3b9f27b0efbe53f2ee1e844d07155cc9', +        }, +        'playlist_count': 6, +    } + +    def _real_extract(self, url): +        playlist_id = self._match_id(url) + +        webpage = self._download_webpage(url, playlist_id) + +        playlist = self._parse_json( +            self._search_regex( +                r'var\s+Mu\s*=\s*({.+?});\s*</script>', webpage, 'player'), +            playlist_id)['pageData']['playlist'] + +        entries = [self._get_track_info(track) for track in playlist['tracks']] + +        return self.playlist_result( +            entries, compat_str(playlist_id), +            playlist['title'], playlist.get('description')) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 3690f8021..27c8c4453 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1532,7 +1532,7 @@ class YoutubeSearchURLIE(InfoExtractor):          webpage = self._download_webpage(url, query)          result_code = self._search_regex( -            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML') +            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')          part_codes = re.findall(              r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)  | 
