diff options
Diffstat (limited to 'youtube_dl/extractor/generic.py')
| -rw-r--r-- | youtube_dl/extractor/generic.py | 146 | 
1 files changed, 114 insertions, 32 deletions
| diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 0f1eb7fa6..90575ab0e 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -61,6 +61,9 @@ from .jwplatform import JWPlatformIE  from .digiteka import DigitekaIE  from .instagram import InstagramIE  from .liveleak import LiveLeakIE +from .threeqsdn import ThreeQSDNIE +from .theplatform import ThePlatformIE +from .vessel import VesselIE  class GenericIE(InfoExtractor): @@ -716,15 +719,18 @@ class GenericIE(InfoExtractor):          },          # Wistia embed          { -            'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', -            'md5': '8788b683c777a5cf25621eaf286d0c23', +            'url': 'http://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', +            'md5': '1953f3a698ab51cfc948ed3992a0b7ff',              'info_dict': { -                'id': '1cfaf6b7ea', +                'id': '6e2wtrbdaf',                  'ext': 'mov', -                'title': 'md5:51364a8d3d009997ba99656004b5e20d', -                'duration': 643.0, -                'filesize': 182808282, -                'uploader': 'education-portal.com', +                'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england', +                'description': 'a Paywall Videos video from Remilon', +                'duration': 644.072, +                'uploader': 'study.com', +                'timestamp': 1459678540, +                'upload_date': '20160403', +                'filesize': 24687186,              },          },          { @@ -733,13 +739,29 @@ class GenericIE(InfoExtractor):              'info_dict': {                  'id': 'uxjb0lwrcz',                  'ext': 'mp4', -                'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks', +                'title': 'Conversation about Hexagonal Rails Part 1',                  'description': 'a Martin Fowler video from ThoughtWorks',                  'duration': 1715.0,                  'uploader': 'thoughtworks.wistia.com', -                'upload_date': '20140603',                  'timestamp': 1401832161, +                'upload_date': '20140603', +            }, +        }, +        # Wistia standard embed (async) +        { +            'url': 'https://www.getdrip.com/university/brennan-dunn-drip-workshop/', +            'info_dict': { +                'id': '807fafadvk', +                'ext': 'mp4', +                'title': 'Drip Brennan Dunn Workshop', +                'description': 'a JV Webinars video from getdrip-1', +                'duration': 4986.95, +                'timestamp': 1463607249, +                'upload_date': '20160518',              }, +            'params': { +                'skip_download': True, +            }          },          # Soundcloud embed          { @@ -763,6 +785,19 @@ class GenericIE(InfoExtractor):                  'title': 'Rosetta #CometLanding webcast HL 10',              }          }, +        # Another Livestream embed, without 'new.' in URL +        { +            'url': 'https://www.freespeech.org/', +            'info_dict': { +                'id': '123537347', +                'ext': 'mp4', +                'title': 're:^FSTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', +            }, +            'params': { +                # Live stream +                'skip_download': True, +            }, +        },          # LazyYT          {              'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986', @@ -847,18 +882,6 @@ class GenericIE(InfoExtractor):                  'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',              }          }, -        # Kaltura embed -        { -            'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15', -            'info_dict': { -                'id': '1_eergr3h1', -                'ext': 'mp4', -                'upload_date': '20150226', -                'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com', -                'timestamp': int, -                'title': 'John Carlson Postgame 2/25/15', -            }, -        },          # Kaltura embed (different embed code)          {              'url': 'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014', @@ -884,6 +907,19 @@ class GenericIE(InfoExtractor):                  'uploader_id': 'echojecka',              },          }, +        # Kaltura embed with single quotes +        { +            'url': 'http://fod.infobase.com/p_ViewPlaylist.aspx?AssignmentID=NUN8ZY', +            'info_dict': { +                'id': '0_izeg5utt', +                'ext': 'mp4', +                'title': '35871', +                'timestamp': 1355743100, +                'upload_date': '20121217', +                'uploader_id': 'batchUser', +            }, +            'add_ie': ['Kaltura'], +        },          # Eagle.Platform embed (generic URL)          {              'url': 'http://lenta.ru/news/2015/03/06/navalny/', @@ -998,14 +1034,18 @@ class GenericIE(InfoExtractor):          },          # UDN embed          { -            'url': 'http://www.udn.com/news/story/7314/822787', +            'url': 'https://video.udn.com/news/300346',              'md5': 'fd2060e988c326991037b9aff9df21a6',              'info_dict': {                  'id': '300346',                  'ext': 'mp4',                  'title': '中一中男師變性 全校師生力挺',                  'thumbnail': 're:^https?://.*\.jpg$', -            } +            }, +            'params': { +                # m3u8 download +                'skip_download': True, +            },          },          # Ooyala embed          { @@ -1173,6 +1213,16 @@ class GenericIE(InfoExtractor):                  'uploader': 'Lake8737',              }          }, +        # Duplicated embedded video URLs +        { +            'url': 'http://www.hudl.com/athlete/2538180/highlights/149298443', +            'info_dict': { +                'id': '149298443_480_16c25b74_2', +                'ext': 'mp4', +                'title': 'vs. Blue Orange Spring Game', +                'uploader': 'www.hudl.com', +            }, +        },      ]      def report_following_redirect(self, new_url): @@ -1427,7 +1477,8 @@ class GenericIE(InfoExtractor):          #   Site Name | Video Title          #   Video Title - Tagline | Site Name          # and so on and so forth; it's just not practical -        video_title = self._html_search_regex( +        video_title = self._og_search_title( +            webpage, default=None) or self._html_search_regex(              r'(?s)<title>(.*?)</title>', webpage, 'video title',              default='video') @@ -1445,6 +1496,9 @@ class GenericIE(InfoExtractor):          video_uploader = self._search_regex(              r'^(?:https?://)?([^/]*)/.*', url, 'video uploader') +        video_description = self._og_search_description(webpage, default=None) +        video_thumbnail = self._og_search_thumbnail(webpage, default=None) +          # Helper method          def _playlist_from_matches(matches, getter=None, ie=None):              urlrs = orderedSet( @@ -1475,6 +1529,16 @@ class GenericIE(InfoExtractor):          if bc_urls:              return _playlist_from_matches(bc_urls, ie='BrightcoveNew') +        # Look for ThePlatform embeds +        tp_urls = ThePlatformIE._extract_urls(webpage) +        if tp_urls: +            return _playlist_from_matches(tp_urls, ie='ThePlatform') + +        # Look for Vessel embeds +        vessel_urls = VesselIE._extract_urls(webpage) +        if vessel_urls: +            return _playlist_from_matches(vessel_urls, ie=VesselIE.ie_key()) +          # Look for embedded rtl.nl player          matches = re.findall(              r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', @@ -1543,21 +1607,26 @@ class GenericIE(InfoExtractor):                  'url': embed_url,                  'ie_key': 'Wistia',                  'uploader': video_uploader, -                'title': video_title, -                'id': video_id,              }          match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)          if match:              return {                  '_type': 'url_transparent', -                'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')), +                'url': 'wistia:%s' % match.group('id'),                  'ie_key': 'Wistia',                  'uploader': video_uploader, -                'title': video_title, -                'id': match.group('id')              } +        match = re.search( +            r'''(?sx) +                <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*? +                <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2 +            ''', webpage) +        if match: +            return self.url_result(self._proto_relative_url( +                'wistia:%s' % match.group('id')), 'Wistia') +          # Look for SVT player          svt_url = SVTIE._extract_url(webpage)          if svt_url: @@ -1833,7 +1902,7 @@ class GenericIE(InfoExtractor):              return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')          mobj = re.search( -            r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"', +            r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"',              webpage)          if mobj is not None:              return self.url_result(mobj.group('url'), 'Livestream') @@ -1845,7 +1914,7 @@ class GenericIE(InfoExtractor):              return self.url_result(mobj.group('url'), 'Zapiks')          # Look for Kaltura embeds -        mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_?[Ii]d'\s*:\s*'(?P<id>[^']+)',", webpage) or +        mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?(?P<q1>['\"])wid(?P=q1)\s*:\s*(?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*?(?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s*(?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4),", webpage) or                  re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', webpage))          if mobj is not None:              return self.url_result(smuggle_url( @@ -1983,6 +2052,19 @@ class GenericIE(InfoExtractor):          if liveleak_url:              return self.url_result(liveleak_url, 'LiveLeak') +        # Look for 3Q SDN embeds +        threeqsdn_url = ThreeQSDNIE._extract_url(webpage) +        if threeqsdn_url: +            return { +                '_type': 'url_transparent', +                'ie_key': ThreeQSDNIE.ie_key(), +                'url': self._proto_relative_url(threeqsdn_url), +                'title': video_title, +                'description': video_description, +                'thumbnail': video_thumbnail, +                'uploader': video_uploader, +            } +          def check_video(vurl):              if YoutubeIE.suitable(vurl):                  return True @@ -2063,7 +2145,7 @@ class GenericIE(InfoExtractor):              raise UnsupportedError(url)          entries = [] -        for video_url in found: +        for video_url in orderedSet(found):              video_url = unescapeHTML(video_url)              video_url = video_url.replace('\\/', '/')              video_url = compat_urlparse.urljoin(url, video_url) | 
