Diffstat (limited to 'youtube_dl/InfoExtractors.py')
-rwxr-xr-x   youtube_dl/InfoExtractors.py   96
1 file changed, 75 insertions, 21 deletions
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index a7fdf1607..ae36558d7 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -115,7 +115,8 @@ class InfoExtractor(object):
         """ Returns the response handle """
         if note is None:
             note = u'Downloading video webpage'
-        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
+        if note is not False:
+            self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
         try:
             return compat_urllib_request.urlopen(url_or_request)
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
@@ -133,6 +134,14 @@ class InfoExtractor(object):
         else:
             encoding = 'utf-8'
         webpage_bytes = urlh.read()
+        if self._downloader.params.get('dump_intermediate_pages', False):
+            try:
+                url = url_or_request.get_full_url()
+            except AttributeError:
+                url = url_or_request
+            self._downloader.to_screen(u'Dumping request to ' + url)
+            dump = base64.b64encode(webpage_bytes).decode('ascii')
+            self._downloader.to_screen(dump)
         return webpage_bytes.decode(encoding, 'replace')
 
     #Methods for following #608
@@ -485,18 +494,14 @@ class YoutubeIE(InfoExtractor):
         # Get video info
         self.report_video_info_webpage_download(video_id)
         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
-            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
+            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                     % (video_id, el_type))
-            request = compat_urllib_request.Request(video_info_url)
-            try:
-                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
-                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
-                video_info = compat_parse_qs(video_info_webpage)
-                if 'token' in video_info:
-                    break
-            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-                self._downloader.report_error(u'unable to download video info webpage: %s' % compat_str(err))
-                return
+            video_info_webpage = self._download_webpage(video_info_url, video_id,
+                                    note=False,
+                                    errnote='unable to download video info webpage')
+            video_info = compat_parse_qs(video_info_webpage)
+            if 'token' in video_info:
+                break
         if 'token' not in video_info:
             if 'reason' in video_info:
                 self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
@@ -1151,7 +1156,7 @@ class VimeoIE(InfoExtractor):
         # Extract video description
         video_description = get_element_by_attribute("itemprop", "description", webpage)
         if video_description: video_description = clean_html(video_description)
-        else: video_description = ''
+        else: video_description = u''
 
         # Extract upload date
         video_upload_date = None
@@ -1794,9 +1799,13 @@ class YoutubePlaylistIE(InfoExtractor):
                 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                 return
 
-            if not 'feed' in response or not 'entry' in response['feed']:
+            if 'feed' not in response:
                 self._downloader.report_error(u'Got a malformed response from YouTube API')
                 return
+            if 'entry' not in response['feed']:
+                # Number of videos is a multiple of self._MAX_RESULTS
+                break
+
             videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                         for entry in response['feed']['entry']
                         if 'content' in entry ]
@@ -2144,7 +2153,7 @@ class FacebookIE(InfoExtractor):
         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
         webpage = self._download_webpage(url, video_id)
 
-        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
+        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
         if not m:
@@ -2152,12 +2161,14 @@ class FacebookIE(InfoExtractor):
         data = dict(json.loads(m.group(1)))
         params_raw = compat_urllib_parse.unquote(data['params'])
         params = json.loads(params_raw)
-        video_url = params['hd_src']
+        video_data = params['video_data'][0]
+        video_url = video_data.get('hd_src')
         if not video_url:
-            video_url = params['sd_src']
+            video_url = video_data['sd_src']
         if not video_url:
             raise ExtractorError(u'Cannot find video URL')
-        video_duration = int(params['video_duration'])
+        video_duration = int(video_data['video_duration'])
+        thumbnail = video_data['thumbnail_src']
 
         m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
         if not m:
@@ -2170,7 +2181,7 @@ class FacebookIE(InfoExtractor):
             'url': video_url,
             'ext': 'mp4',
             'duration': video_duration,
-            'thumbnail': params['thumbnail_src'],
+            'thumbnail': thumbnail,
         }
 
         return [info]
@@ -3685,7 +3696,9 @@ class FunnyOrDieIE(InfoExtractor):
 
         m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
         if not m:
-            self._downloader.trouble(u'Cannot find video title')
+            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
+            if not m:
+                self._downloader.trouble(u'Cannot find video title')
         title = clean_html(m.group('title'))
 
         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
@@ -4119,7 +4132,7 @@ class KeekIE(InfoExtractor):
         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
         webpage = self._download_webpage(url, video_id)
-        m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
+        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
         title = unescapeHTML(m.group('title'))
         m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
         uploader = clean_html(m.group('uploader'))
@@ -4344,6 +4357,46 @@ class LiveLeakIE(InfoExtractor):
 
         return [info]
 
+class ARDIE(InfoExtractor):
+    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
+    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
+    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
+
+    def _real_extract(self, url):
+        # determine video id from url
+        m = re.match(self._VALID_URL, url)
+
+        numid = re.search(r'documentId=([0-9]+)', url)
+        if numid:
+            video_id = numid.group(1)
+        else:
+            video_id = m.group('video_id')
+
+        # determine title and media streams from webpage
+        html = self._download_webpage(url, video_id)
+        title = re.search(self._TITLE, html).group('title')
+        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
+        if not streams:
+            assert '"fsk"' in html
+            self._downloader.report_error(u'this video is only available after 8:00 pm')
+            return
+
+        # choose default media type and highest quality for now
+        stream = max([s for s in streams if int(s["media_type"]) == 0],
+                     key=lambda s: int(s["quality"]))
+
+        # there's two possibilities: RTMP stream or HTTP download
+        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
+        if stream['rtmp_url']:
+            self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
+            assert stream['video_url'].startswith('mp4:')
+            info["url"] = stream["rtmp_url"]
+            info["play_path"] = stream['video_url']
+        else:
+            assert stream["video_url"].endswith('.mp4')
+            info["url"] = stream["video_url"]
+        return [info]
+
 
 def gen_extractors():
     """ Return a list of an instance of every supported extractor.
@@ -4397,5 +4450,6 @@ def gen_extractors():
         MySpassIE(),
         SpiegelIE(),
         LiveLeakIE(),
+        ARDIE(),
         GenericIE()
     ]
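For context, a minimal sketch (not part of the diff above): the new dump_intermediate_pages branch prints the downloaded page as a base64 string, so the raw page bytes can be recovered later with the standard library. The sample page content below is invented for illustration.

    import base64

    # Invented sample: stands in for bytes fetched by the extractor.
    page_bytes = b'<html>example page</html>'

    # This mirrors the encoding done in the hunk above ...
    dump = base64.b64encode(page_bytes).decode('ascii')

    # ... and decoding the printed dump gives the original bytes back.
    assert base64.b64decode(dump) == page_bytes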
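Similarly, a minimal sketch (not part of the commit) of the stream selection done by the new ARDIE: among the regex matches it keeps only media type 0 and picks the highest quality. The dict keys below mirror the named groups of _MEDIA_STREAM; the values are made up.

    # Hypothetical matches, as produced by re.finditer(_MEDIA_STREAM, html).
    streams = [
        {'media_type': '0', 'quality': '1', 'rtmp_url': '', 'video_url': 'http://example.invalid/low.mp4'},
        {'media_type': '0', 'quality': '2', 'rtmp_url': '', 'video_url': 'http://example.invalid/high.mp4'},
        {'media_type': '1', 'quality': '3', 'rtmp_url': '', 'video_url': 'http://example.invalid/other.mp4'},
    ]

    # Same selection as in ARDIE._real_extract: default media type (0), highest quality.
    stream = max([s for s in streams if int(s['media_type']) == 0],
                 key=lambda s: int(s['quality']))
    assert stream['video_url'].endswith('high.mp4')

Note that max() over an empty sequence raises ValueError, so the extractor implicitly assumes at least one media-type-0 stream is present once streams is non-empty.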
