diff options
Diffstat (limited to 'youtube_dl/extractor/generic.py')
| -rw-r--r-- | youtube_dl/extractor/generic.py | 155 | 
1 files changed, 149 insertions, 6 deletions
| diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 27e2bc300..2ff002643 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -26,8 +26,10 @@ from ..utils import (      unsmuggle_url,      UnsupportedError,      url_basename, +    xpath_text,  )  from .brightcove import BrightcoveIE +from .nbc import NBCSportsVPlayerIE  from .ooyala import OoyalaIE  from .rutv import RUTVIE  from .smotri import SmotriIE @@ -526,6 +528,17 @@ class GenericIE(InfoExtractor):              },              'add_ie': ['Viddler'],          }, +        # Libsyn embed +        { +            'url': 'http://thedailyshow.cc.com/podcast/episodetwelve', +            'info_dict': { +                'id': '3377616', +                'ext': 'mp3', +                'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart", +                'description': 'md5:601cb790edd05908957dae8aaa866465', +                'upload_date': '20150220', +            }, +        },          # jwplayer YouTube          {              'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/', @@ -569,6 +582,75 @@ class GenericIE(InfoExtractor):                  'title': 'John Carlson Postgame 2/25/15',              },          }, +        # Eagle.Platform embed (generic URL) +        { +            'url': 'http://lenta.ru/news/2015/03/06/navalny/', +            'info_dict': { +                'id': '227304', +                'ext': 'mp4', +                'title': 'Навальный вышел на свободу', +                'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5', +                'thumbnail': 're:^https?://.*\.jpg$', +                'duration': 87, +                'view_count': int, +                'age_limit': 0, +            }, +        }, +        # ClipYou (Eagle.Platform) embed (custom URL) +        { +            'url': 'http://muz-tv.ru/play/7129/', +            'info_dict': { +                'id': '12820', +                'ext': 'mp4', +                'title': "'O Sole Mio", +                'thumbnail': 're:^https?://.*\.jpg$', +                'duration': 216, +                'view_count': int, +            }, +        }, +        # Pladform embed +        { +            'url': 'http://muz-tv.ru/kinozal/view/7400/', +            'info_dict': { +                'id': '100183293', +                'ext': 'mp4', +                'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть', +                'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века', +                'thumbnail': 're:^https?://.*\.jpg$', +                'duration': 694, +                'age_limit': 0, +            }, +        }, +        # 5min embed +        { +            'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/', +            'md5': '4c6f127a30736b59b3e2c19234ee2bf7', +            'info_dict': { +                'id': '518726732', +                'ext': 'mp4', +                'title': 'Facebook Creates "On This Day" | Crunch Report', +            }, +        }, +        # RSS feed with enclosure +        { +            'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', +            'info_dict': { +                'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', +                'ext': 'm4v', +                'upload_date': '20150228', +                'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', +            } +        }, +        # NBC Sports vplayer embed +        { +            'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a', +            'info_dict': { +                'id': 'ln7x1qSThw4k', +                'ext': 'flv', +                'title': "PFT Live: New leader in the 'new-look' defense", +                'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e', +            }, +        }      ]      def report_following_redirect(self, new_url): @@ -580,11 +662,24 @@ class GenericIE(InfoExtractor):          playlist_desc_el = doc.find('./channel/description')          playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text -        entries = [{ -            '_type': 'url', -            'url': e.find('link').text, -            'title': e.find('title').text, -        } for e in doc.findall('./channel/item')] +        entries = [] +        for it in doc.findall('./channel/item'): +            next_url = xpath_text(it, 'link', fatal=False) +            if not next_url: +                enclosure_nodes = it.findall('./enclosure') +                for e in enclosure_nodes: +                    next_url = e.attrib.get('url') +                    if next_url: +                        break + +            if not next_url: +                continue + +            entries.append({ +                '_type': 'url', +                'url': next_url, +                'title': it.find('title').text, +            })          return {              '_type': 'playlist', @@ -943,6 +1038,19 @@ class GenericIE(InfoExtractor):          if mobj is not None:              return self.url_result(mobj.group('url')) +        # Look for NYTimes player +        mobj = re.search( +            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>', +            webpage) +        if mobj is not None: +            return self.url_result(mobj.group('url')) + +        # Look for Libsyn player +        mobj = re.search( +            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage) +        if mobj is not None: +            return self.url_result(mobj.group('url')) +          # Look for Ooyala videos          mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or                  re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or @@ -1131,6 +1239,35 @@ class GenericIE(InfoExtractor):          if mobj is not None:              return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura') +        # Look for Eagle.Platform embeds +        mobj = re.search( +            r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage) +        if mobj is not None: +            return self.url_result(mobj.group('url'), 'EaglePlatform') + +        # Look for ClipYou (uses Eagle.Platform) embeds +        mobj = re.search( +            r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage) +        if mobj is not None: +            return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform') + +        # Look for Pladform embeds +        mobj = re.search( +            r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage) +        if mobj is not None: +            return self.url_result(mobj.group('url'), 'Pladform') + +        # Look for 5min embeds +        mobj = re.search( +            r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage) +        if mobj is not None: +            return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin') + +        # Look for NBC Sports VPlayer embeds +        nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) +        if nbc_sports_url: +            return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') +          def check_video(vurl):              if YoutubeIE.suitable(vurl):                  return True @@ -1187,10 +1324,16 @@ class GenericIE(InfoExtractor):              # HTML5 video              found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)          if not found: +            REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'              found = re.search(                  r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")' -                r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'?([^\'"]+)', +                r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,                  webpage) +            if not found: +                # Look also in Refresh HTTP header +                refresh_header = head_response.headers.get('Refresh') +                if refresh_header: +                    found = re.search(REDIRECT_REGEX, refresh_header)              if found:                  new_url = found.group(1)                  self.report_following_redirect(new_url) | 
