diff options
Diffstat (limited to 'youtube_dl/extractor')
| -rw-r--r-- | youtube_dl/extractor/aftonbladet.py | 15 | ||||
| -rw-r--r-- | youtube_dl/extractor/bandcamp.py | 38 | ||||
| -rw-r--r-- | youtube_dl/extractor/blinkx.py | 30 | ||||
| -rw-r--r-- | youtube_dl/extractor/generic.py | 11 | ||||
| -rw-r--r-- | youtube_dl/extractor/mailru.py | 5 | ||||
| -rw-r--r-- | youtube_dl/extractor/ndr.py | 24 | ||||
| -rw-r--r-- | youtube_dl/extractor/nowness.py | 9 | 
7 files changed, 62 insertions, 70 deletions
| diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py index 6a8cd14c9..cfc7370ae 100644 --- a/youtube_dl/extractor/aftonbladet.py +++ b/youtube_dl/extractor/aftonbladet.py @@ -1,7 +1,6 @@  # encoding: utf-8  from __future__ import unicode_literals -import datetime  import re  from .common import InfoExtractor @@ -16,6 +15,7 @@ class AftonbladetIE(InfoExtractor):              'ext': 'mp4',              'title': 'Vulkanutbrott i rymden - nu släpper NASA bilderna',              'description': 'Jupiters måne mest aktiv av alla himlakroppar', +            'timestamp': 1394142732,              'upload_date': '20140306',          },      } @@ -27,17 +27,17 @@ class AftonbladetIE(InfoExtractor):          webpage = self._download_webpage(url, video_id)          # find internal video meta data -        META_URL = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json' +        meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json'          internal_meta_id = self._html_search_regex(              r'data-aptomaId="([\w\d]+)"', webpage, 'internal_meta_id') -        internal_meta_url = META_URL % internal_meta_id +        internal_meta_url = meta_url % internal_meta_id          internal_meta_json = self._download_json(              internal_meta_url, video_id, 'Downloading video meta data')          # find internal video formats -        FORMATS_URL = 'http://aftonbladet-play.videodata.drvideo.aptoma.no/actions/video/?id=%s' +        format_url = 'http://aftonbladet-play.videodata.drvideo.aptoma.no/actions/video/?id=%s'          internal_video_id = internal_meta_json['videoId'] -        internal_formats_url = FORMATS_URL % internal_video_id +        internal_formats_url = format_url % internal_video_id          internal_formats_json = self._download_json(              internal_formats_url, video_id, 'Downloading video formats') @@ -54,16 +54,13 @@ class AftonbladetIE(InfoExtractor):              })          self._sort_formats(formats) -        timestamp = datetime.datetime.fromtimestamp(internal_meta_json['timePublished']) -        upload_date = timestamp.strftime('%Y%m%d') -          return {              'id': video_id,              'title': internal_meta_json['title'],              'formats': formats,              'thumbnail': internal_meta_json['imageUrl'],              'description': internal_meta_json['shortPreamble'], -            'upload_date': upload_date, +            'timestamp': internal_meta_json['timePublished'],              'duration': internal_meta_json['duration'],              'view_count': internal_meta_json['views'],          } diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 929aafdff..dcbbdef43 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -19,7 +19,7 @@ class BandcampIE(InfoExtractor):          'md5': 'c557841d5e50261777a6585648adf439',          'info_dict': {              "title": "youtube-dl  \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", -            "duration": 10, +            "duration": 9.8485,          },          '_skip': 'There is a limit of 200 free downloads / month for the test song'      }] @@ -28,36 +28,32 @@ class BandcampIE(InfoExtractor):          mobj = re.match(self._VALID_URL, url)          title = mobj.group('title')          webpage = self._download_webpage(url, title) -        # We get the link to the free download page          m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage) -        if m_download is None: +        if not m_download:              m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)              if m_trackinfo:                  json_code = m_trackinfo.group(1) -                data = json.loads(json_code) -                d = data[0] +                data = json.loads(json_code)[0] -                duration = int(round(d['duration']))                  formats = [] -                for format_id, format_url in d['file'].items(): -                    ext, _, abr_str = format_id.partition('-') - +                for format_id, format_url in data['file'].items(): +                    ext, abr_str = format_id.split('-', 1)                      formats.append({                          'format_id': format_id,                          'url': format_url, -                        'ext': format_id.partition('-')[0], +                        'ext': ext,                          'vcodec': 'none', -                        'acodec': format_id.partition('-')[0], -                        'abr': int(format_id.partition('-')[2]), +                        'acodec': ext, +                        'abr': int(abr_str),                      })                  self._sort_formats(formats)                  return { -                    'id': compat_str(d['id']), -                    'title': d['title'], +                    'id': compat_str(data['id']), +                    'title': data['title'],                      'formats': formats, -                    'duration': duration, +                    'duration': float(data['duration']),                  }              else:                  raise ExtractorError('No free songs found') @@ -67,11 +63,9 @@ class BandcampIE(InfoExtractor):              r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',              webpage, re.MULTILINE | re.DOTALL).group('id') -        download_webpage = self._download_webpage(download_link, video_id, -                                                  'Downloading free downloads page') -        # We get the dictionary of the track from some javascrip code -        info = re.search(r'items: (.*?),$', -                         download_webpage, re.MULTILINE).group(1) +        download_webpage = self._download_webpage(download_link, video_id, 'Downloading free downloads page') +        # We get the dictionary of the track from some javascript code +        info = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE).group(1)          info = json.loads(info)[0]          # We pick mp3-320 for now, until format selection can be easily implemented.          mp3_info = info['downloads']['mp3-320'] @@ -100,7 +94,7 @@ class BandcampIE(InfoExtractor):  class BandcampAlbumIE(InfoExtractor):      IE_NAME = 'Bandcamp:album' -    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<title>[^?#]+))?' +    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<title>[^?#]+))'      _TEST = {          'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', @@ -123,7 +117,7 @@ class BandcampAlbumIE(InfoExtractor):          'params': {              'playlistend': 2          }, -        'skip': 'Bancamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test' +        'skip': 'Bandcamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test'      }      def _real_extract(self, url): diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py index 96408e4e0..38ccd957f 100644 --- a/youtube_dl/extractor/blinkx.py +++ b/youtube_dl/extractor/blinkx.py @@ -1,6 +1,5 @@  from __future__ import unicode_literals -import datetime  import json  import re @@ -19,15 +18,16 @@ class BlinkxIE(InfoExtractor):          'file': '8aQUy7GV.mp4',          'md5': '2e9a07364af40163a908edbf10bb2492',          'info_dict': { -            "title": "Police Car Rolls Away", -            "uploader": "stupidvideos.com", -            "upload_date": "20131215", -            "description": "A police car gently rolls away from a fight. Maybe it felt weird being around a confrontation and just had to get out of there!", -            "duration": 14.886, -            "thumbnails": [{ -                "width": 100, -                "height": 76, -                "url": "http://cdn.blinkx.com/stream/b/41/StupidVideos/20131215/1873969261/1873969261_tn_0.jpg", +            'title': 'Police Car Rolls Away', +            'uploader': 'stupidvideos.com', +            'upload_date': '20131215', +            'timestamp': 1387068000, +            'description': 'A police car gently rolls away from a fight. Maybe it felt weird being around a confrontation and just had to get out of there!', +            'duration': 14.886, +            'thumbnails': [{ +                'width': 100, +                'height': 76, +                'url': 'http://cdn.blinkx.com/stream/b/41/StupidVideos/20131215/1873969261/1873969261_tn_0.jpg',              }],          },      } @@ -41,9 +41,6 @@ class BlinkxIE(InfoExtractor):                     'video=%s' % video_id)          data_json = self._download_webpage(api_url, display_id)          data = json.loads(data_json)['api']['results'][0] -        dt = datetime.datetime.fromtimestamp(data['pubdate_epoch']) -        pload_date = dt.strftime('%Y%m%d') -          duration = None          thumbnails = []          formats = [] @@ -64,10 +61,7 @@ class BlinkxIE(InfoExtractor):                  vcodec = remove_start(m['vcodec'], 'ff')                  acodec = remove_start(m['acodec'], 'ff')                  tbr = (int(m['vbr']) + int(m['abr'])) // 1000 -                format_id = (u'%s-%sk-%s' % -                             (vcodec, -                              tbr, -                              m['w'])) +                format_id = u'%s-%sk-%s' % (vcodec, tbr, m['w'])                  formats.append({                      'format_id': format_id,                      'url': m['link'], @@ -88,7 +82,7 @@ class BlinkxIE(InfoExtractor):              'title': data['title'],              'formats': formats,              'uploader': data['channel_name'], -            'upload_date': pload_date, +            'timestamp': data['pubdate_epoch'],              'description': data.get('description'),              'thumbnails': thumbnails,              'duration': duration, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 286133282..38a357d3b 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -363,8 +363,13 @@ class GenericIE(InfoExtractor):                      return self.url_result('http://' + url)                  else:                      if default_search == 'auto_warning': -                        self._downloader.report_warning( -                            'Falling back to youtube search for  %s . Set --default-search to "auto" to suppress this warning.' % url) +                        if re.match(r'^(?:url|URL)$', url): +                            raise ExtractorError( +                                'Invalid URL:  %r . Call youtube-dl like this:  youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc"  ' % url, +                                expected=True) +                        else: +                            self._downloader.report_warning( +                                'Falling back to youtube search for  %s . Set --default-search to "auto" to suppress this warning.' % url)                      return self.url_result('ytsearch:' + url)              else:                  assert ':' in default_search @@ -560,7 +565,7 @@ class GenericIE(InfoExtractor):          # Look for embedded NovaMov-based player          mobj = re.search( -            r'''(?x)<iframe[^>]+?src=(["\']) +            r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])                      (?P<url>http://(?:(?:embed|www)\.)?                          (?:novamov\.com|                             nowvideo\.(?:ch|sx|eu|at|ag|co)| diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py index f819c09b3..5016989cc 100644 --- a/youtube_dl/extractor/mailru.py +++ b/youtube_dl/extractor/mailru.py @@ -2,7 +2,6 @@  from __future__ import unicode_literals  import re -import datetime  from .common import InfoExtractor @@ -19,6 +18,7 @@ class MailRuIE(InfoExtractor):              'id': '46301138',              'ext': 'mp4',              'title': 'Новый Человек-Паук. Высокое напряжение. Восстание Электро', +            'timestamp': 1393232740,              'upload_date': '20140224',              'uploader': 'sonypicturesrus',              'uploader_id': 'sonypicturesrus@mail.ru', @@ -43,7 +43,6 @@ class MailRuIE(InfoExtractor):          thumbnail = movie['poster']          duration = movie['duration'] -        upload_date = datetime.datetime.fromtimestamp(video_data['timestamp']).strftime('%Y%m%d')          view_count = video_data['views_count']          formats = [ @@ -57,7 +56,7 @@ class MailRuIE(InfoExtractor):              'id': content_id,              'title': title,              'thumbnail': thumbnail, -            'upload_date': upload_date, +            'timestamp': video_data['timestamp'],              'uploader': uploader,              'uploader_id': uploader_id,              'duration': duration, diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 0650f9564..53b34f5e6 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( +    ExtractorError, +    int_or_none, +)  class NDRIE(InfoExtractor): @@ -45,13 +48,12 @@ class NDRIE(InfoExtractor):          page = self._download_webpage(url, video_id, 'Downloading page') -        title = self._og_search_title(page) +        title = self._og_search_title(page).strip()          description = self._og_search_description(page) +        if description: +            description = description.strip() -        mobj = re.search( -            r'<div class="duration"><span class="min">(?P<minutes>\d+)</span>:<span class="sec">(?P<seconds>\d+)</span></div>', -            page) -        duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None +        duration = int_or_none(self._html_search_regex(r'duration: (\d+),\n', page, 'duration', fatal=False))          formats = [] @@ -66,10 +68,12 @@ class NDRIE(InfoExtractor):          video_url = re.search(r'''3: {src:'(?P<video>.+?)\.hi\.mp4', type:"video/mp4"},''', page)          if video_url: -            thumbnail = self._html_search_regex(r'(?m)title: "NDR PLAYER",\s*poster: "([^"]+)",', -                page, 'thumbnail', fatal=False) -            if thumbnail: -                thumbnail = 'http://www.ndr.de' + thumbnail +            thumbnails = re.findall(r'''\d+: {src: "([^"]+)"(?: \|\| '[^']+')?, quality: '([^']+)'}''', page) +            if thumbnails: +                QUALITIES = ['xs', 's', 'm', 'l', 'xl'] +                thumbnails.sort(key=lambda thumb: QUALITIES.index(thumb[1]) if thumb[1] in QUALITIES else -1) +                thumbnail = 'http://www.ndr.de' + thumbnails[-1][0] +              for format_id in ['lo', 'hi', 'hq']:                  formats.append({                      'url': '%s.%s.mp4' % (video_url.group('video'), format_id), diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py index b1bcb7e54..1c5e9401f 100644 --- a/youtube_dl/extractor/nowness.py +++ b/youtube_dl/extractor/nowness.py @@ -4,9 +4,7 @@ import re  from .brightcove import BrightcoveIE  from .common import InfoExtractor -from ..utils import ( -    ExtractorError, -) +from ..utils import ExtractorError  class NownessIE(InfoExtractor): @@ -14,9 +12,10 @@ class NownessIE(InfoExtractor):      _TEST = {          'url': 'http://www.nowness.com/day/2013/6/27/3131/candor--the-art-of-gesticulation', -        'file': '2520295746001.mp4', -        'md5': '0ece2f70a7bd252c7b00f3070182d418', +        'md5': '068bc0202558c2e391924cb8cc470676',          'info_dict': { +            'id': '2520295746001', +            'ext': 'mp4',              'description': 'Candor: The Art of Gesticulation',              'uploader': 'Nowness',              'title': 'Candor: The Art of Gesticulation', | 
