diff options
49 files changed, 926 insertions, 281 deletions
| diff --git a/CHANGELOG b/CHANGELOG deleted file mode 100644 index 3fa116733..000000000 --- a/CHANGELOG +++ /dev/null @@ -1,14 +0,0 @@ -2013.01.02  Codename: GIULIA - -    * Add support for ComedyCentral clips <nto> -    * Corrected Vimeo description fetching <Nick Daniels> -    * Added the --no-post-overwrites argument <Barbu Paul - Gheorghe> -    * --verbose offers more environment info -    * New info_dict field: uploader_id -    * New updates system, with signature checking -    * New IEs: NBA, JustinTV, FunnyOrDie, TweetReel, Steam, Ustream -    * Fixed IEs: BlipTv -    * Fixed for Python 3 IEs: Xvideo, Youku, XNXX, Dailymotion, Vimeo, InfoQ -    * Simplified IEs and test code -    * Various (Python 3 and other) fixes -    * Revamped and expanded tests @@ -77,6 +77,6 @@ youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-  		--exclude 'docs/_build' \  		-- \  		bin devscripts test youtube_dl docs \ -		CHANGELOG LICENSE README.md README.txt \ +		LICENSE README.md README.txt \  		Makefile MANIFEST.in youtube-dl.1 youtube-dl.bash-completion setup.py \  		youtube-dl diff --git a/devscripts/release.sh b/devscripts/release.sh index 2974a7c3e..453087e5f 100755 --- a/devscripts/release.sh +++ b/devscripts/release.sh @@ -45,9 +45,9 @@ fi  /bin/echo -e "\n### Changing version in version.py..."  sed -i "s/__version__ = '.*'/__version__ = '$version'/" youtube_dl/version.py -/bin/echo -e "\n### Committing CHANGELOG README.md and youtube_dl/version.py..." +/bin/echo -e "\n### Committing README.md and youtube_dl/version.py..."  make README.md -git add CHANGELOG README.md youtube_dl/version.py +git add README.md youtube_dl/version.py  git commit -m "release $version"  /bin/echo -e "\n### Now tagging, signing and pushing..." diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 8735013f7..e794cc97f 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -67,7 +67,7 @@ class TestFormatSelection(unittest.TestCase):          downloaded = ydl.downloaded_info_dicts[0]          self.assertEqual(downloaded['ext'], 'mp4') -        # No prefer_free_formats => prefer mp4 and flv for greater compatibilty +        # No prefer_free_formats => prefer mp4 and flv for greater compatibility          ydl = YDL()          ydl.params['prefer_free_formats'] = False          formats = [ @@ -279,7 +279,7 @@ class TestFormatSelection(unittest.TestCase):          self.assertEqual(ydl._format_note({}), '')          assertRegexpMatches(self, ydl._format_note({              'vbr': 10, -        }), '^x\s*10k$') +        }), '^\s*10k$')  if __name__ == '__main__':      unittest.main() diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py index c9cdb96cb..71e80b037 100644 --- a/test/test_age_restriction.py +++ b/test/test_age_restriction.py @@ -13,7 +13,7 @@ from youtube_dl import YoutubeDL  def _download_restricted(url, filename, age): -    """ Returns true iff the file has been downloaded """ +    """ Returns true if the file has been downloaded """      params = {          'age_limit': age, diff --git a/test/test_playlists.py b/test/test_playlists.py index cc871698a..465b07b9e 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -28,6 +28,7 @@ from youtube_dl.extractor import (      SoundcloudSetIE,      SoundcloudUserIE,      SoundcloudPlaylistIE, +    TeacherTubeClassroomIE,      LivestreamIE,      NHLVideocenterIE,      BambuserChannelIE, @@ -209,20 +210,20 @@ class TestPlaylists(unittest.TestCase):      def test_ivi_compilation(self):          dl = FakeYDL()          ie = IviCompilationIE(dl) -        result = ie.extract('http://www.ivi.ru/watch/dezhurnyi_angel') +        result = ie.extract('http://www.ivi.ru/watch/dvoe_iz_lartsa')          self.assertIsPlaylist(result) -        self.assertEqual(result['id'], 'dezhurnyi_angel') -        self.assertEqual(result['title'], 'Дежурный ангел (2010 - 2012)') -        self.assertTrue(len(result['entries']) >= 23) +        self.assertEqual(result['id'], 'dvoe_iz_lartsa') +        self.assertEqual(result['title'], 'Двое из ларца (2006 - 2008)') +        self.assertTrue(len(result['entries']) >= 24)      def test_ivi_compilation_season(self):          dl = FakeYDL()          ie = IviCompilationIE(dl) -        result = ie.extract('http://www.ivi.ru/watch/dezhurnyi_angel/season2') +        result = ie.extract('http://www.ivi.ru/watch/dvoe_iz_lartsa/season1')          self.assertIsPlaylist(result) -        self.assertEqual(result['id'], 'dezhurnyi_angel/season2') -        self.assertEqual(result['title'], 'Дежурный ангел (2010 - 2012) 2 сезон') -        self.assertTrue(len(result['entries']) >= 7) +        self.assertEqual(result['id'], 'dvoe_iz_lartsa/season1') +        self.assertEqual(result['title'], 'Двое из ларца (2006 - 2008) 1 сезон') +        self.assertTrue(len(result['entries']) >= 12)      def test_imdb_list(self):          dl = FakeYDL() @@ -360,5 +361,13 @@ class TestPlaylists(unittest.TestCase):              result['title'], 'Brace Yourself - Today\'s Weirdest News')          self.assertTrue(len(result['entries']) >= 10) +    def test_TeacherTubeClassroom(self): +        dl = FakeYDL() +        ie = TeacherTubeClassroomIE(dl) +        result = ie.extract('http://www.teachertube.com/view_classroom.php?user=rbhagwati2') +        self.assertIsPlaylist(result) +        self.assertEqual(result['id'], 'rbhagwati2') +        self.assertTrue(len(result['entries']) >= 20) +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 7d3b9c705..3aadedd64 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -112,11 +112,11 @@ class TestYoutubeLists(unittest.TestCase):      def test_youtube_mix(self):          dl = FakeYDL()          ie = YoutubePlaylistIE(dl) -        result = ie.extract('http://www.youtube.com/watch?v=lLJf9qJHR3E&list=RDrjFaenf1T-Y') +        result = ie.extract('https://www.youtube.com/watch?v=W01L70IGBgE&index=2&list=RDOQpdSVF_k_w')          entries = result['entries']          self.assertTrue(len(entries) >= 20)          original_video = entries[0] -        self.assertEqual(original_video['id'], 'rjFaenf1T-Y') +        self.assertEqual(original_video['id'], 'OQpdSVF_k_w')      def test_youtube_toptracks(self):          print('Skipping: The playlist page gives error 500') diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index f3666573a..dc0ba986a 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -717,6 +717,17 @@ class YoutubeDL(object):              info_dict['playlist'] = None              info_dict['playlist_index'] = None +        thumbnails = info_dict.get('thumbnails') +        if thumbnails: +            thumbnails.sort(key=lambda t: ( +                t.get('width'), t.get('height'), t.get('url'))) +            for t in thumbnails: +                if 'width' in t and 'height' in t: +                    t['resolution'] = '%dx%d' % (t['width'], t['height']) + +        if thumbnails and 'thumbnail' not in info_dict: +            info_dict['thumbnail'] = thumbnails[-1]['url'] +          if 'display_id' not in info_dict and 'id' in info_dict:              info_dict['display_id'] = info_dict['id'] diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 4e657e297..e2a4c04da 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -56,6 +56,8 @@ __authors__  = (      'Nicolas Évrard',      'Jason Normore',      'Hoje Lee', +    'Adam Thalhammer', +    'Georg Jähnig',  )  __license__ = 'Public Domain' diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index 78b1e7cd2..cc6a84106 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -96,6 +96,7 @@ class RtmpFD(FileDownloader):          flash_version = info_dict.get('flash_version', None)          live = info_dict.get('rtmp_live', False)          conn = info_dict.get('rtmp_conn', None) +        protocol = info_dict.get('rtmp_protocol', None)          self.report_destination(filename)          tmpfilename = self.temp_name(filename) @@ -133,6 +134,8 @@ class RtmpFD(FileDownloader):                  basic_args += ['--conn', entry]          elif isinstance(conn, compat_str):              basic_args += ['--conn', conn] +        if protocol is not None: +            basic_args += ['--protocol', protocol]          args = basic_args + [[], ['--resume', '--skip', '1']][not live and self.params.get('continuedl', False)]          if sys.platform == 'win32' and sys.version_info < (3, 0): diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3e3d99b3e..01c21189b 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -142,6 +142,7 @@ from .khanacademy import KhanAcademyIE  from .kickstarter import KickStarterIE  from .keek import KeekIE  from .kontrtube import KontrTubeIE +from .ku6 import Ku6IE  from .la7 import LA7IE  from .lifenews import LifeNewsIE  from .liveleak import LiveLeakIE @@ -194,7 +195,10 @@ from .normalboots import NormalbootsIE  from .novamov import NovaMovIE  from .nowness import NownessIE  from .nowvideo import NowVideoIE -from .nrk import NRKIE +from .nrk import ( +    NRKIE, +    NRKTVIE, +)  from .ntv import NTVIE  from .nytimes import NYTimesIE  from .nuvid import NuvidIE @@ -255,13 +259,21 @@ from .southparkstudios import (  from .space import SpaceIE  from .spankwire import SpankwireIE  from .spiegel import SpiegelIE +from .spiegeltv import SpiegeltvIE  from .spike import SpikeIE  from .stanfordoc import StanfordOpenClassroomIE  from .steam import SteamIE  from .streamcloud import StreamcloudIE  from .streamcz import StreamCZIE +from .swrmediathek import SWRMediathekIE  from .syfy import SyfyIE  from .sztvhu import SztvHuIE +from .tagesschau import TagesschauIE +from .teachertube import ( +    TeacherTubeIE, +    TeacherTubeClassroomIE, +) +from .teachingchannel import TeachingChannelIE  from .teamcoco import TeamcocoIE  from .techtalks import TechTalksIE  from .ted import TEDIE diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py index 6a8cd14c9..cfc7370ae 100644 --- a/youtube_dl/extractor/aftonbladet.py +++ b/youtube_dl/extractor/aftonbladet.py @@ -1,7 +1,6 @@  # encoding: utf-8  from __future__ import unicode_literals -import datetime  import re  from .common import InfoExtractor @@ -16,6 +15,7 @@ class AftonbladetIE(InfoExtractor):              'ext': 'mp4',              'title': 'Vulkanutbrott i rymden - nu släpper NASA bilderna',              'description': 'Jupiters måne mest aktiv av alla himlakroppar', +            'timestamp': 1394142732,              'upload_date': '20140306',          },      } @@ -27,17 +27,17 @@ class AftonbladetIE(InfoExtractor):          webpage = self._download_webpage(url, video_id)          # find internal video meta data -        META_URL = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json' +        meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json'          internal_meta_id = self._html_search_regex(              r'data-aptomaId="([\w\d]+)"', webpage, 'internal_meta_id') -        internal_meta_url = META_URL % internal_meta_id +        internal_meta_url = meta_url % internal_meta_id          internal_meta_json = self._download_json(              internal_meta_url, video_id, 'Downloading video meta data')          # find internal video formats -        FORMATS_URL = 'http://aftonbladet-play.videodata.drvideo.aptoma.no/actions/video/?id=%s' +        format_url = 'http://aftonbladet-play.videodata.drvideo.aptoma.no/actions/video/?id=%s'          internal_video_id = internal_meta_json['videoId'] -        internal_formats_url = FORMATS_URL % internal_video_id +        internal_formats_url = format_url % internal_video_id          internal_formats_json = self._download_json(              internal_formats_url, video_id, 'Downloading video formats') @@ -54,16 +54,13 @@ class AftonbladetIE(InfoExtractor):              })          self._sort_formats(formats) -        timestamp = datetime.datetime.fromtimestamp(internal_meta_json['timePublished']) -        upload_date = timestamp.strftime('%Y%m%d') -          return {              'id': video_id,              'title': internal_meta_json['title'],              'formats': formats,              'thumbnail': internal_meta_json['imageUrl'],              'description': internal_meta_json['shortPreamble'], -            'upload_date': upload_date, +            'timestamp': internal_meta_json['timePublished'],              'duration': internal_meta_json['duration'],              'view_count': internal_meta_json['views'],          } diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index b88f71bc4..c6d22c029 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -38,15 +38,19 @@ class ARDIE(InfoExtractor):          webpage = self._download_webpage(url, video_id)          title = self._html_search_regex( -            r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>', webpage, 'title') +            [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>', +             r'<meta name="dcterms.title" content="(.*?)"/>', +             r'<h4 class="headline">(.*?)</h4>'], +            webpage, 'title')          description = self._html_search_meta(              'dcterms.abstract', webpage, 'description')          thumbnail = self._og_search_thumbnail(webpage) -        streams = [ -            mo.groupdict() -            for mo in re.finditer( -                r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)', webpage)] + +        media_info = self._download_json( +            'http://www.ardmediathek.de/play/media/%s' % video_id, video_id) +        # The second element of the _mediaArray contains the standard http urls +        streams = media_info['_mediaArray'][1]['_mediaStreamArray']          if not streams:              if '"fsk"' in webpage:                  raise ExtractorError('This video is only available after 20:00') @@ -54,21 +58,12 @@ class ARDIE(InfoExtractor):          formats = []          for s in streams:              format = { -                'quality': int(s['quality']), +                'quality': s['_quality'], +                'url': s['_stream'],              } -            if s.get('rtmp_url'): -                format['protocol'] = 'rtmp' -                format['url'] = s['rtmp_url'] -                format['playpath'] = s['video_url'] -            else: -                format['url'] = s['video_url'] - -            quality_name = self._search_regex( -                r'[,.]([a-zA-Z0-9_-]+),?\.mp4', format['url'], -                'quality name', default='NA') -            format['format_id'] = '%s-%s-%s-%s' % ( -                determine_ext(format['url']), quality_name, s['media_type'], -                s['quality']) + +            format['format_id'] = '%s-%s' % ( +                determine_ext(format['url']), format['quality'])              formats.append(format) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 929aafdff..dcbbdef43 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -19,7 +19,7 @@ class BandcampIE(InfoExtractor):          'md5': 'c557841d5e50261777a6585648adf439',          'info_dict': {              "title": "youtube-dl  \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", -            "duration": 10, +            "duration": 9.8485,          },          '_skip': 'There is a limit of 200 free downloads / month for the test song'      }] @@ -28,36 +28,32 @@ class BandcampIE(InfoExtractor):          mobj = re.match(self._VALID_URL, url)          title = mobj.group('title')          webpage = self._download_webpage(url, title) -        # We get the link to the free download page          m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage) -        if m_download is None: +        if not m_download:              m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)              if m_trackinfo:                  json_code = m_trackinfo.group(1) -                data = json.loads(json_code) -                d = data[0] +                data = json.loads(json_code)[0] -                duration = int(round(d['duration']))                  formats = [] -                for format_id, format_url in d['file'].items(): -                    ext, _, abr_str = format_id.partition('-') - +                for format_id, format_url in data['file'].items(): +                    ext, abr_str = format_id.split('-', 1)                      formats.append({                          'format_id': format_id,                          'url': format_url, -                        'ext': format_id.partition('-')[0], +                        'ext': ext,                          'vcodec': 'none', -                        'acodec': format_id.partition('-')[0], -                        'abr': int(format_id.partition('-')[2]), +                        'acodec': ext, +                        'abr': int(abr_str),                      })                  self._sort_formats(formats)                  return { -                    'id': compat_str(d['id']), -                    'title': d['title'], +                    'id': compat_str(data['id']), +                    'title': data['title'],                      'formats': formats, -                    'duration': duration, +                    'duration': float(data['duration']),                  }              else:                  raise ExtractorError('No free songs found') @@ -67,11 +63,9 @@ class BandcampIE(InfoExtractor):              r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',              webpage, re.MULTILINE | re.DOTALL).group('id') -        download_webpage = self._download_webpage(download_link, video_id, -                                                  'Downloading free downloads page') -        # We get the dictionary of the track from some javascrip code -        info = re.search(r'items: (.*?),$', -                         download_webpage, re.MULTILINE).group(1) +        download_webpage = self._download_webpage(download_link, video_id, 'Downloading free downloads page') +        # We get the dictionary of the track from some javascript code +        info = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE).group(1)          info = json.loads(info)[0]          # We pick mp3-320 for now, until format selection can be easily implemented.          mp3_info = info['downloads']['mp3-320'] @@ -100,7 +94,7 @@ class BandcampIE(InfoExtractor):  class BandcampAlbumIE(InfoExtractor):      IE_NAME = 'Bandcamp:album' -    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<title>[^?#]+))?' +    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<title>[^?#]+))'      _TEST = {          'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', @@ -123,7 +117,7 @@ class BandcampAlbumIE(InfoExtractor):          'params': {              'playlistend': 2          }, -        'skip': 'Bancamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test' +        'skip': 'Bandcamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test'      }      def _real_extract(self, url): diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py index 96408e4e0..38ccd957f 100644 --- a/youtube_dl/extractor/blinkx.py +++ b/youtube_dl/extractor/blinkx.py @@ -1,6 +1,5 @@  from __future__ import unicode_literals -import datetime  import json  import re @@ -19,15 +18,16 @@ class BlinkxIE(InfoExtractor):          'file': '8aQUy7GV.mp4',          'md5': '2e9a07364af40163a908edbf10bb2492',          'info_dict': { -            "title": "Police Car Rolls Away", -            "uploader": "stupidvideos.com", -            "upload_date": "20131215", -            "description": "A police car gently rolls away from a fight. Maybe it felt weird being around a confrontation and just had to get out of there!", -            "duration": 14.886, -            "thumbnails": [{ -                "width": 100, -                "height": 76, -                "url": "http://cdn.blinkx.com/stream/b/41/StupidVideos/20131215/1873969261/1873969261_tn_0.jpg", +            'title': 'Police Car Rolls Away', +            'uploader': 'stupidvideos.com', +            'upload_date': '20131215', +            'timestamp': 1387068000, +            'description': 'A police car gently rolls away from a fight. Maybe it felt weird being around a confrontation and just had to get out of there!', +            'duration': 14.886, +            'thumbnails': [{ +                'width': 100, +                'height': 76, +                'url': 'http://cdn.blinkx.com/stream/b/41/StupidVideos/20131215/1873969261/1873969261_tn_0.jpg',              }],          },      } @@ -41,9 +41,6 @@ class BlinkxIE(InfoExtractor):                     'video=%s' % video_id)          data_json = self._download_webpage(api_url, display_id)          data = json.loads(data_json)['api']['results'][0] -        dt = datetime.datetime.fromtimestamp(data['pubdate_epoch']) -        pload_date = dt.strftime('%Y%m%d') -          duration = None          thumbnails = []          formats = [] @@ -64,10 +61,7 @@ class BlinkxIE(InfoExtractor):                  vcodec = remove_start(m['vcodec'], 'ff')                  acodec = remove_start(m['acodec'], 'ff')                  tbr = (int(m['vbr']) + int(m['abr'])) // 1000 -                format_id = (u'%s-%sk-%s' % -                             (vcodec, -                              tbr, -                              m['w'])) +                format_id = u'%s-%sk-%s' % (vcodec, tbr, m['w'])                  formats.append({                      'format_id': format_id,                      'url': m['link'], @@ -88,7 +82,7 @@ class BlinkxIE(InfoExtractor):              'title': data['title'],              'formats': formats,              'uploader': data['channel_name'], -            'upload_date': pload_date, +            'timestamp': data['pubdate_epoch'],              'description': data.get('description'),              'thumbnails': thumbnails,              'duration': duration, diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index 2301f61b6..496271be4 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -1,10 +1,12 @@  # encoding: utf-8  from __future__ import unicode_literals +  import re  from .common import InfoExtractor  from ..utils import (      ExtractorError, +    int_or_none,  ) @@ -13,9 +15,10 @@ class CinemassacreIE(InfoExtractor):      _TESTS = [          {              'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', -            'file': '19911.mp4', -            'md5': '782f8504ca95a0eba8fc9177c373eec7', +            'md5': 'fde81fbafaee331785f58cd6c0d46190',              'info_dict': { +                'id': '19911', +                'ext': 'mp4',                  'upload_date': '20121110',                  'title': '“Angry Video Game Nerd: The Movie” – Trailer',                  'description': 'md5:fb87405fcb42a331742a0dce2708560b', @@ -23,9 +26,10 @@ class CinemassacreIE(InfoExtractor):          },          {              'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', -            'file': '521be8ef82b16.mp4', -            'md5': 'dec39ee5118f8d9cc067f45f9cbe3a35', +            'md5': 'd72f10cd39eac4215048f62ab477a511',              'info_dict': { +                'id': '521be8ef82b16', +                'ext': 'mp4',                  'upload_date': '20131002',                  'title': 'The Mummy’s Hand (1940)',              }, @@ -50,29 +54,40 @@ class CinemassacreIE(InfoExtractor):              r'<div class="entry-content">(?P<description>.+?)</div>',              webpage, 'description', flags=re.DOTALL, fatal=False) -        playerdata = self._download_webpage(playerdata_url, video_id) +        playerdata = self._download_webpage(playerdata_url, video_id, 'Downloading player webpage') +        video_thumbnail = self._search_regex( +            r'image: \'(?P<thumbnail>[^\']+)\'', playerdata, 'thumbnail', fatal=False) +        sd_url = self._search_regex(r'file: \'([^\']+)\', label: \'SD\'', playerdata, 'sd_file') +        videolist_url = self._search_regex(r'file: \'([^\']+\.smil)\'}', playerdata, 'videolist_url') -        sd_url = self._html_search_regex(r'file: \'([^\']+)\', label: \'SD\'', playerdata, 'sd_file') -        hd_url = self._html_search_regex( -            r'file: \'([^\']+)\', label: \'HD\'', playerdata, 'hd_file', -            default=None) -        video_thumbnail = self._html_search_regex(r'image: \'(?P<thumbnail>[^\']+)\'', playerdata, 'thumbnail', fatal=False) +        videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML') -        formats = [{ -            'url': sd_url, -            'ext': 'mp4', -            'format': 'sd', -            'format_id': 'sd', -            'quality': 1, -        }] -        if hd_url: -            formats.append({ -                'url': hd_url, -                'ext': 'mp4', -                'format': 'hd', -                'format_id': 'hd', -                'quality': 2, -            }) +        formats = [] +        baseurl = sd_url[:sd_url.rfind('/')+1] +        for video in videolist.findall('.//video'): +            src = video.get('src') +            if not src: +                continue +            file_ = src.partition(':')[-1] +            width = int_or_none(video.get('width')) +            height = int_or_none(video.get('height')) +            bitrate = int_or_none(video.get('system-bitrate')) +            format = { +                'url': baseurl + file_, +                'format_id': src.rpartition('.')[0].rpartition('_')[-1], +            } +            if width or height: +                format.update({ +                    'tbr': bitrate // 1000 if bitrate else None, +                    'width': width, +                    'height': height, +                }) +            else: +                format.update({ +                    'abr': bitrate // 1000 if bitrate else None, +                    'vcodec': 'none', +                }) +            formats.append(format)          self._sort_formats(formats)          return { diff --git a/youtube_dl/extractor/cmt.py b/youtube_dl/extractor/cmt.py index 88e0e9aba..e96c59f71 100644 --- a/youtube_dl/extractor/cmt.py +++ b/youtube_dl/extractor/cmt.py @@ -1,19 +1,19 @@ +from __future__ import unicode_literals  from .mtv import MTVIE +  class CMTIE(MTVIE): -    IE_NAME = u'cmt.com' +    IE_NAME = 'cmt.com'      _VALID_URL = r'https?://www\.cmt\.com/videos/.+?/(?P<videoid>[^/]+)\.jhtml'      _FEED_URL = 'http://www.cmt.com/sitewide/apps/player/embed/rss/' -    _TESTS = [ -        { -            u'url': u'http://www.cmt.com/videos/garth-brooks/989124/the-call-featuring-trisha-yearwood.jhtml#artist=30061', -            u'md5': u'e6b7ef3c4c45bbfae88061799bbba6c2', -            u'info_dict': { -                u'id': u'989124', -                u'ext': u'mp4', -                u'title': u'Garth Brooks - "The Call (featuring Trisha Yearwood)"', -                u'description': u'Blame It All On My Roots', -            }, +    _TESTS = [{ +        'url': 'http://www.cmt.com/videos/garth-brooks/989124/the-call-featuring-trisha-yearwood.jhtml#artist=30061', +        'md5': 'e6b7ef3c4c45bbfae88061799bbba6c2', +        'info_dict': { +            'id': '989124', +            'ext': 'mp4', +            'title': 'Garth Brooks - "The Call (featuring Trisha Yearwood)"', +            'description': 'Blame It All On My Roots',          }, -    ] +    }] diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index b32cb8980..dae40c136 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -79,8 +79,11 @@ class CNNIE(InfoExtractor):          self._sort_formats(formats) -        thumbnails = sorted([((int(t.attrib['height']),int(t.attrib['width'])), t.text) for t in info.findall('images/image')]) -        thumbs_dict = [{'resolution': res, 'url': t_url} for (res, t_url) in thumbnails] +        thumbnails = [{ +            'height': int(t.attrib['height']), +            'width': int(t.attrib['width']), +            'url': t.text, +        } for t in info.findall('images/image')]          metas_el = info.find('metas')          upload_date = ( @@ -93,8 +96,7 @@ class CNNIE(InfoExtractor):              'id': info.attrib['id'],              'title': info.find('headline').text,              'formats': formats, -            'thumbnail': thumbnails[-1][1], -            'thumbnails': thumbs_dict, +            'thumbnails': thumbnails,              'description': info.find('description').text,              'duration': duration,              'upload_date': upload_date, diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 6e3a316c6..ba4d73ab8 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -188,7 +188,7 @@ class ComedyCentralShowsIE(InfoExtractor):                  })                  formats.append({                      'format_id': 'rtmp-%s' % format, -                    'url': rtmp_video_url, +                    'url': rtmp_video_url.replace('viacomccstrm', 'viacommtvstrm'),                      'ext': self._video_extensions.get(format, 'mp4'),                      'height': h,                      'width': w, diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index db472aace..49e75405e 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -92,8 +92,12 @@ class InfoExtractor(object):                      unique, but available before title. Typically, id is                      something like "4234987", title "Dancing naked mole rats",                      and display_id "dancing-naked-mole-rats" -    thumbnails:     A list of dictionaries (with the entries "resolution" and -                    "url") for the varying thumbnails +    thumbnails:     A list of dictionaries, with the following entries: +                        * "url" +                        * "width" (optional, int) +                        * "height" (optional, int) +                        * "resolution" (optional, string "{width}x{height"}, +                                        deprecated)      thumbnail:      Full URL to a video thumbnail image.      description:    One-line video description.      uploader:       Full name of the video uploader. diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py index eaeee5a51..e6952588f 100644 --- a/youtube_dl/extractor/empflix.py +++ b/youtube_dl/extractor/empflix.py @@ -3,20 +3,18 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..utils import ( -    ExtractorError, -)  class EmpflixIE(InfoExtractor):      _VALID_URL = r'^https?://www\.empflix\.com/videos/.*?-(?P<id>[0-9]+)\.html'      _TEST = {          'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', -        'md5': '5e5cc160f38ca9857f318eb97146e13e', +        'md5': 'b1bc15b6412d33902d6e5952035fcabc',          'info_dict': {              'id': '33051', -            'ext': 'flv', +            'ext': 'mp4',              'title': 'Amateur Finger Fuck', +            'description': 'Amateur solo finger fucking.',              'age_limit': 18,          }      } @@ -30,6 +28,8 @@ class EmpflixIE(InfoExtractor):          video_title = self._html_search_regex(              r'name="title" value="(?P<title>[^"]*)"', webpage, 'title') +        video_description = self._html_search_regex( +            r'name="description" value="([^"]*)"', webpage, 'description', fatal=False)          cfg_url = self._html_search_regex(              r'flashvars\.config = escape\("([^"]+)"', @@ -37,12 +37,18 @@ class EmpflixIE(InfoExtractor):          cfg_xml = self._download_xml(              cfg_url, video_id, note='Downloading metadata') -        video_url = cfg_xml.find('videoLink').text + +        formats = [ +            { +                'url': item.find('videoLink').text, +                'format_id': item.find('res').text, +            } for item in cfg_xml.findall('./quality/item') +        ]          return {              'id': video_id, -            'url': video_url, -            'ext': 'flv',              'title': video_title, +            'description': video_description, +            'formats': formats,              'age_limit': age_limit,          } diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index ff7c0cd3e..14a196ffc 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -37,7 +37,7 @@ class ExtremeTubeIE(InfoExtractor):          webpage = self._download_webpage(req, video_id)          video_title = self._html_search_regex( -            r'<h1 [^>]*?title="([^"]+)"[^>]*>\1<', webpage, 'title') +            r'<h1 [^>]*?title="([^"]+)"[^>]*>', webpage, 'title')          uploader = self._html_search_regex(              r'>Posted by:(?=<)(?:\s|<[^>]*>)*(.+?)\|', webpage, 'uploader',              fatal=False) diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py index ca8993241..18f91efac 100644 --- a/youtube_dl/extractor/fc2.py +++ b/youtube_dl/extractor/fc2.py @@ -13,7 +13,7 @@ from ..utils import (  class FC2IE(InfoExtractor): -    _VALID_URL = r'^http://video\.fc2\.com/(?P<lang>[^/]+)/content/(?P<id>[^/]+)' +    _VALID_URL = r'^http://video\.fc2\.com/((?P<lang>[^/]+)/)?content/(?P<id>[^/]+)'      IE_NAME = 'fc2'      _TEST = {          'url': 'http://video.fc2.com/en/content/20121103kUan1KHs', @@ -36,7 +36,7 @@ class FC2IE(InfoExtractor):          thumbnail = self._og_search_thumbnail(webpage)          refer = url.replace('/content/', '/a/content/') -        mimi = hashlib.md5(video_id + '_gGddgPfeaf_gzyr').hexdigest() +        mimi = hashlib.md5((video_id + '_gGddgPfeaf_gzyr').encode('utf-8')).hexdigest()          info_url = (              "http://video.fc2.com/ginfo.php?mimi={1:s}&href={2:s}&v={0:s}&fversion=WIN%2011%2C6%2C602%2C180&from=2&otag=0&upid={0:s}&tk=null&". diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py index 233398966..11fee3d31 100644 --- a/youtube_dl/extractor/gamekings.py +++ b/youtube_dl/extractor/gamekings.py @@ -15,7 +15,7 @@ class GamekingsIE(InfoExtractor):              'id': '20130811',              'ext': 'mp4',              'title': 'Phoenix Wright: Ace Attorney \u2013 Dual Destinies Review', -            'description': 'md5:632e61a9f97d700e83f43d77ddafb6a4', +            'description': 'md5:36fd701e57e8c15ac8682a2374c99731',          }      } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 286133282..38a357d3b 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -363,8 +363,13 @@ class GenericIE(InfoExtractor):                      return self.url_result('http://' + url)                  else:                      if default_search == 'auto_warning': -                        self._downloader.report_warning( -                            'Falling back to youtube search for  %s . Set --default-search to "auto" to suppress this warning.' % url) +                        if re.match(r'^(?:url|URL)$', url): +                            raise ExtractorError( +                                'Invalid URL:  %r . Call youtube-dl like this:  youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc"  ' % url, +                                expected=True) +                        else: +                            self._downloader.report_warning( +                                'Falling back to youtube search for  %s . Set --default-search to "auto" to suppress this warning.' % url)                      return self.url_result('ytsearch:' + url)              else:                  assert ':' in default_search @@ -560,7 +565,7 @@ class GenericIE(InfoExtractor):          # Look for embedded NovaMov-based player          mobj = re.search( -            r'''(?x)<iframe[^>]+?src=(["\']) +            r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])                      (?P<url>http://(?:(?:embed|www)\.)?                          (?:novamov\.com|                             nowvideo\.(?:ch|sx|eu|at|ag|co)| diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 1ba4966c7..528be1524 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -33,14 +33,14 @@ class IviIE(InfoExtractor):          },          # Serial's serie          { -            'url': 'http://www.ivi.ru/watch/dezhurnyi_angel/74791', -            'md5': '3e6cc9a848c1d2ebcc6476444967baa9', +            'url': 'http://www.ivi.ru/watch/dvoe_iz_lartsa/9549', +            'md5': '221f56b35e3ed815fde2df71032f4b3e',              'info_dict': { -                'id': '74791', +                'id': '9549',                  'ext': 'mp4', -                'title': 'Дежурный ангел - 1 серия', -                'duration': 2490, -                'thumbnail': 'http://thumbs.ivi.ru/f7.vcp.digitalaccess.ru/contents/8/e/bc2f6c2b6e5d291152fdd32c059141.jpg', +                'title': 'Двое из ларца - Серия 1', +                'duration': 2655, +                'thumbnail': 'http://thumbs.ivi.ru/f15.vcp.digitalaccess.ru/contents/8/4/0068dc0677041f3336b7c2baad8fc0.jpg',              },              'skip': 'Only works from Russia',           } diff --git a/youtube_dl/extractor/ku6.py b/youtube_dl/extractor/ku6.py new file mode 100644 index 000000000..484239b19 --- /dev/null +++ b/youtube_dl/extractor/ku6.py @@ -0,0 +1,35 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class Ku6IE(InfoExtractor): +    _VALID_URL = r'http://v\.ku6\.com/show/(?P<id>[a-zA-Z0-9\-\_]+)(?:\.)*html' +    _TEST = { +        'url': 'http://v.ku6.com/show/JG-8yS14xzBr4bCn1pu0xw...html', +        'md5': '01203549b9efbb45f4b87d55bdea1ed1', +        'info_dict': { +            'id': 'JG-8yS14xzBr4bCn1pu0xw', +            'ext': 'f4v', +            'title': 'techniques test', +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) +        title = self._search_regex(r'<h1 title=.*>(.*?)</h1>', webpage, 'title') +        dataUrl = 'http://v.ku6.com/fetchVideo4Player/%s.html' % video_id +        jsonData = self._download_json(dataUrl, video_id) +        downloadUrl = jsonData['data']['f'] + +        return { +            'id': video_id, +            'title': title, +            'url': downloadUrl +        } + diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py index f819c09b3..7460d81cd 100644 --- a/youtube_dl/extractor/mailru.py +++ b/youtube_dl/extractor/mailru.py @@ -2,7 +2,6 @@  from __future__ import unicode_literals  import re -import datetime  from .common import InfoExtractor @@ -10,28 +9,48 @@ from .common import InfoExtractor  class MailRuIE(InfoExtractor):      IE_NAME = 'mailru'      IE_DESC = 'Видео@Mail.Ru' -    _VALID_URL = r'http://(?:www\.)?my\.mail\.ru/video/.*#video=/?(?P<id>[^/]+/[^/]+/[^/]+/\d+)' +    _VALID_URL = r'http://(?:www\.)?my\.mail\.ru/(?:video/.*#video=/?(?P<idv1>(?:[^/]+/){3}\d+)|(?:(?P<idv2prefix>(?:[^/]+/){2})video/(?P<idv2suffix>[^/]+/\d+))\.html)' -    _TEST = { -        'url': 'http://my.mail.ru/video/top#video=/mail/sonypicturesrus/75/76', -        'md5': 'dea205f03120046894db4ebb6159879a', -        'info_dict': { -            'id': '46301138', -            'ext': 'mp4', -            'title': 'Новый Человек-Паук. Высокое напряжение. Восстание Электро', -            'upload_date': '20140224', -            'uploader': 'sonypicturesrus', -            'uploader_id': 'sonypicturesrus@mail.ru', -            'duration': 184, -        } -    } +    _TESTS = [ +        { +            'url': 'http://my.mail.ru/video/top#video=/mail/sonypicturesrus/75/76', +            'md5': 'dea205f03120046894db4ebb6159879a', +            'info_dict': { +                'id': '46301138', +                'ext': 'mp4', +                'title': 'Новый Человек-Паук. Высокое напряжение. Восстание Электро', +                'timestamp': 1393232740, +                'upload_date': '20140224', +                'uploader': 'sonypicturesrus', +                'uploader_id': 'sonypicturesrus@mail.ru', +                'duration': 184, +            }, +        }, +        { +            'url': 'http://my.mail.ru/corp/hitech/video/news_hi-tech_mail_ru/1263.html', +            'md5': '00a91a58c3402204dcced523777b475f', +            'info_dict': { +                'id': '46843144', +                'ext': 'mp4', +                'title': 'Samsung Galaxy S5 Hammer Smash Fail Battery Explosion', +                'timestamp': 1397217632, +                'upload_date': '20140411', +                'uploader': 'hitech', +                'uploader_id': 'hitech@corp.mail.ru', +                'duration': 245, +            }, +        }, +    ]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = mobj.group('idv1') + +        if not video_id: +            video_id = mobj.group('idv2prefix') + mobj.group('idv2suffix')          video_data = self._download_json( -            'http://videoapi.my.mail.ru/videos/%s.json?new=1' % video_id, video_id, 'Downloading video JSON') +            'http://api.video.mail.ru/videos/%s.json?new=1' % video_id, video_id, 'Downloading video JSON')          author = video_data['author']          uploader = author['name'] @@ -40,10 +59,11 @@ class MailRuIE(InfoExtractor):          movie = video_data['movie']          content_id = str(movie['contentId'])          title = movie['title'] +        if title.endswith('.mp4'): +            title = title[:-4]          thumbnail = movie['poster']          duration = movie['duration'] -        upload_date = datetime.datetime.fromtimestamp(video_data['timestamp']).strftime('%Y%m%d')          view_count = video_data['views_count']          formats = [ @@ -57,7 +77,7 @@ class MailRuIE(InfoExtractor):              'id': content_id,              'title': title,              'thumbnail': thumbnail, -            'upload_date': upload_date, +            'timestamp': video_data['timestamp'],              'uploader': uploader,              'uploader_id': uploader_id,              'duration': duration, diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index 4cab30631..c0231c197 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -1,4 +1,6 @@  # encoding: utf-8 +from __future__ import unicode_literals +  import re  from .common import InfoExtractor @@ -12,12 +14,13 @@ class NaverIE(InfoExtractor):      _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)'      _TEST = { -        u'url': u'http://tvcast.naver.com/v/81652', -        u'file': u'81652.mp4', -        u'info_dict': { -            u'title': u'[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번', -            u'description': u'합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.', -            u'upload_date': u'20130903', +        'url': 'http://tvcast.naver.com/v/81652', +        'info_dict': { +            'id': '81652', +            'ext': 'mp4', +            'title': '[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번', +            'description': '합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.', +            'upload_date': '20130903',          },      } @@ -28,7 +31,7 @@ class NaverIE(InfoExtractor):          m_id = re.search(r'var rmcPlayer = new nhn.rmcnmv.RMCVideoPlayer\("(.+?)", "(.+?)"',              webpage)          if m_id is None: -            raise ExtractorError(u'couldn\'t extract vid and key') +            raise ExtractorError('couldn\'t extract vid and key')          vid = m_id.group(1)          key = m_id.group(2)          query = compat_urllib_parse.urlencode({'vid': vid, 'inKey': key,}) @@ -39,22 +42,27 @@ class NaverIE(InfoExtractor):          })          info = self._download_xml(              'http://serviceapi.rmcnmv.naver.com/flash/videoInfo.nhn?' + query, -            video_id, u'Downloading video info') +            video_id, 'Downloading video info')          urls = self._download_xml(              'http://serviceapi.rmcnmv.naver.com/flash/playableEncodingOption.nhn?' + query_urls, -            video_id, u'Downloading video formats info') +            video_id, 'Downloading video formats info')          formats = []          for format_el in urls.findall('EncodingOptions/EncodingOption'):              domain = format_el.find('Domain').text -            if domain.startswith('rtmp'): -                continue -            formats.append({ +            f = {                  'url': domain + format_el.find('uri').text,                  'ext': 'mp4',                  'width': int(format_el.find('width').text),                  'height': int(format_el.find('height').text), -            }) +            } +            if domain.startswith('rtmp'): +                f.update({ +                    'ext': 'flv', +                    'rtmp_protocol': '1', # rtmpt +                }) +            formats.append(f) +        self._sort_formats(formats)          return {              'id': video_id, diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 1a63ab56a..aa34665d1 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -1,6 +1,7 @@  from __future__ import unicode_literals  import re +import json  from .common import InfoExtractor  from ..utils import find_xpath_attr, compat_str @@ -31,30 +32,68 @@ class NBCIE(InfoExtractor):  class NBCNewsIE(InfoExtractor): -    _VALID_URL = r'https?://www\.nbcnews\.com/video/.+?/(?P<id>\d+)' +    _VALID_URL = r'''(?x)https?://www\.nbcnews\.com/ +        ((video/.+?/(?P<id>\d+))| +        (feature/[^/]+/(?P<title>.+))) +        ''' -    _TEST = { -        'url': 'http://www.nbcnews.com/video/nbc-news/52753292', -        'md5': '47abaac93c6eaf9ad37ee6c4463a5179', -        'info_dict': { -            'id': '52753292', -            'ext': 'flv', -            'title': 'Crew emerges after four-month Mars food study', -            'description': 'md5:24e632ffac72b35f8b67a12d1b6ddfc1', +    _TESTS = [ +        { +            'url': 'http://www.nbcnews.com/video/nbc-news/52753292', +            'md5': '47abaac93c6eaf9ad37ee6c4463a5179', +            'info_dict': { +                'id': '52753292', +                'ext': 'flv', +                'title': 'Crew emerges after four-month Mars food study', +                'description': 'md5:24e632ffac72b35f8b67a12d1b6ddfc1', +            },          }, -    } +        { +            'url': 'http://www.nbcnews.com/feature/edward-snowden-interview/how-twitter-reacted-snowden-interview-n117236', +            'md5': 'b2421750c9f260783721d898f4c42063', +            'info_dict': { +                'id': 'I1wpAI_zmhsQ', +                'ext': 'flv', +                'title': 'How Twitter Reacted To The Snowden Interview', +                'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', +            }, +            'add_ie': ['ThePlatform'], +        }, +    ]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id') -        all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id) -        info = all_info.find('video') +        if video_id is not None: +            all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id) +            info = all_info.find('video') -        return { -            'id': video_id, -            'title': info.find('headline').text, -            'ext': 'flv', -            'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text, -            'description': compat_str(info.find('caption').text), -            'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text, -        } +            return { +                'id': video_id, +                'title': info.find('headline').text, +                'ext': 'flv', +                'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text, +                'description': compat_str(info.find('caption').text), +                'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text, +            } +        else: +            # "feature" pages use theplatform.com +            title = mobj.group('title') +            webpage = self._download_webpage(url, title) +            bootstrap_json = self._search_regex( +                r'var bootstrapJson = ({.+})\s*$', webpage, 'bootstrap json', +                flags=re.MULTILINE) +            bootstrap = json.loads(bootstrap_json) +            info = bootstrap['results'][0]['video'] +            playlist_url = info['fallbackPlaylistUrl'] + '?form=MPXNBCNewsAPI' +            mpxid = info['mpxId'] +            all_videos = self._download_json(playlist_url, title)['videos'] +            # The response contains additional videos +            info = next(v for v in all_videos if v['mpxId'] == mpxid) + +            return { +                '_type': 'url', +                # We get the best quality video +                'url': info['videoAssets'][-1]['publicUrl'], +                'ie_key': 'ThePlatform', +            } diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 0650f9564..3d6096e46 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -4,7 +4,11 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( +    ExtractorError, +    int_or_none, +    qualities, +)  class NDRIE(InfoExtractor): @@ -45,17 +49,16 @@ class NDRIE(InfoExtractor):          page = self._download_webpage(url, video_id, 'Downloading page') -        title = self._og_search_title(page) +        title = self._og_search_title(page).strip()          description = self._og_search_description(page) +        if description: +            description = description.strip() -        mobj = re.search( -            r'<div class="duration"><span class="min">(?P<minutes>\d+)</span>:<span class="sec">(?P<seconds>\d+)</span></div>', -            page) -        duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None +        duration = int_or_none(self._html_search_regex(r'duration: (\d+),\n', page, 'duration', fatal=False))          formats = [] -        mp3_url = re.search(r'''{src:'(?P<audio>[^']+)', type:"audio/mp3"},''', page) +        mp3_url = re.search(r'''\{src:'(?P<audio>[^']+)', type:"audio/mp3"},''', page)          if mp3_url:              formats.append({                  'url': mp3_url.group('audio'), @@ -64,13 +67,15 @@ class NDRIE(InfoExtractor):          thumbnail = None -        video_url = re.search(r'''3: {src:'(?P<video>.+?)\.hi\.mp4', type:"video/mp4"},''', page) +        video_url = re.search(r'''3: \{src:'(?P<video>.+?)\.hi\.mp4', type:"video/mp4"},''', page)          if video_url: -            thumbnail = self._html_search_regex(r'(?m)title: "NDR PLAYER",\s*poster: "([^"]+)",', -                page, 'thumbnail', fatal=False) -            if thumbnail: -                thumbnail = 'http://www.ndr.de' + thumbnail -            for format_id in ['lo', 'hi', 'hq']: +            thumbnails = re.findall(r'''\d+: \{src: "([^"]+)"(?: \|\| '[^']+')?, quality: '([^']+)'}''', page) +            if thumbnails: +                quality_key = qualities(['xs', 's', 'm', 'l', 'xl']) +                largest = max(thumbnails, key=lambda thumb: quality_key(thumb[1])) +                thumbnail = 'http://www.ndr.de' + largest[0] + +            for format_id in 'lo', 'hi', 'hq':                  formats.append({                      'url': '%s.%s.mp4' % (video_url.group('video'), format_id),                      'format_id': format_id, diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py index b1bcb7e54..1c5e9401f 100644 --- a/youtube_dl/extractor/nowness.py +++ b/youtube_dl/extractor/nowness.py @@ -4,9 +4,7 @@ import re  from .brightcove import BrightcoveIE  from .common import InfoExtractor -from ..utils import ( -    ExtractorError, -) +from ..utils import ExtractorError  class NownessIE(InfoExtractor): @@ -14,9 +12,10 @@ class NownessIE(InfoExtractor):      _TEST = {          'url': 'http://www.nowness.com/day/2013/6/27/3131/candor--the-art-of-gesticulation', -        'file': '2520295746001.mp4', -        'md5': '0ece2f70a7bd252c7b00f3070182d418', +        'md5': '068bc0202558c2e391924cb8cc470676',          'info_dict': { +            'id': '2520295746001', +            'ext': 'mp4',              'description': 'Candor: The Art of Gesticulation',              'uploader': 'Nowness',              'title': 'Candor: The Art of Gesticulation', diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index e6d68b836..3a6a7883e 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -4,7 +4,11 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( +    ExtractorError, +    int_or_none, +    unified_strdate, +)  class NRKIE(InfoExtractor): @@ -64,4 +68,78 @@ class NRKIE(InfoExtractor):              'title': data['title'],              'description': data['description'],              'thumbnail': thumbnail, +        } + + +class NRKTVIE(InfoExtractor): +    _VALID_URL = r'http://tv\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-z]{4}\d{8})' + +    _TESTS = [ +        { +            'url': 'http://tv.nrk.no/serie/20-spoersmaal-tv/muhh48000314/23-05-2014', +            'md5': '7b96112fbae1faf09a6f9ae1aff6cb84', +            'info_dict': { +                'id': 'muhh48000314', +                'ext': 'flv', +                'title': '20 spørsmål', +                'description': 'md5:bdea103bc35494c143c6a9acdd84887a', +                'upload_date': '20140523', +                'duration': 1741.52, +            } +        }, +        { +            'url': 'http://tv.nrk.no/program/mdfp15000514', +            'md5': '383650ece2b25ecec996ad7b5bb2a384', +            'info_dict': { +                'id': 'mdfp15000514', +                'ext': 'flv', +                'title': 'Kunnskapskanalen: Grunnlovsjubiléet - Stor ståhei for ingenting', +                'description': 'md5:654c12511f035aed1e42bdf5db3b206a', +                'upload_date': '20140524', +                'duration': 4605.0, +            } +        }, +    ] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        page = self._download_webpage(url, video_id) + +        title = self._html_search_meta('title', page, 'title') +        description = self._html_search_meta('description', page, 'description') +        thumbnail = self._html_search_regex(r'data-posterimage="([^"]+)"', page, 'thumbnail', fatal=False) +        upload_date = unified_strdate(self._html_search_meta('rightsfrom', page, 'upload date', fatal=False)) +        duration = self._html_search_regex(r'data-duration="([^"]+)"', page, 'duration', fatal=False) +        if duration: +            duration = float(duration) + +        formats = [] + +        f4m_url = re.search(r'data-media="([^"]+)"', page) +        if f4m_url: +            formats.append({ +                'url': f4m_url.group(1) + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', +                'format_id': 'f4m', +                'ext': 'flv', +            }) + +        m3u8_url = re.search(r'data-hls-media="([^"]+)"', page) +        if m3u8_url: +            formats.append({ +                'url': m3u8_url.group(1), +                'format_id': 'm3u8', +            }) + +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'upload_date': upload_date, +            'duration': duration, +            'formats': formats,          }
\ No newline at end of file diff --git a/youtube_dl/extractor/nuvid.py b/youtube_dl/extractor/nuvid.py index f0befa116..e3db9fe8c 100644 --- a/youtube_dl/extractor/nuvid.py +++ b/youtube_dl/extractor/nuvid.py @@ -30,7 +30,7 @@ class NuvidIE(InfoExtractor):              webpage, 'title').strip()          url_end = self._html_search_regex( -            r'href="(/mp4/[^"]+)"[^>]*data-link_type="mp4"', +            r'href="(/[^"]+)"[^>]*data-link_type="mp4"',              webpage, 'video_url')          video_url = 'http://m.nuvid.com' + url_end diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 7dd3dca0d..4118ee956 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -45,7 +45,7 @@ class PornHubIE(InfoExtractor):          video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')          video_uploader = self._html_search_regex( -            r'(?s)<div class="video-info-row">\s*From: .+?<(?:a href="/users/|<span class="username)[^>]+>(.+?)<', +            r'(?s)From: .+?<(?:a href="/users/|<span class="username)[^>]+>(.+?)<',              webpage, 'uploader', fatal=False)          thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)          if thumbnail: diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index d6f453fb9..25515f068 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -12,6 +12,7 @@ from ..utils import (      compat_urllib_parse,      ExtractorError, +    int_or_none,      unified_strdate,  ) @@ -44,7 +45,8 @@ class SoundcloudIE(InfoExtractor):                  "upload_date": "20121011",                  "description": "No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd",                  "uploader": "E.T. ExTerrestrial Music", -                "title": "Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1" +                "title": "Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1", +                "duration": 143,              }          },          # not streamable song @@ -57,6 +59,7 @@ class SoundcloudIE(InfoExtractor):                  'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com',                  'uploader': 'The Royal Concept',                  'upload_date': '20120521', +                'duration': 227,              },              'params': {                  # rtmp @@ -74,6 +77,7 @@ class SoundcloudIE(InfoExtractor):                  'uploader': 'jaimeMF',                  'description': 'test chars:  \"\'/\\ä↭',                  'upload_date': '20131209', +                'duration': 9,              },          },          # downloadable song @@ -87,6 +91,7 @@ class SoundcloudIE(InfoExtractor):                  'description': 'Vocals',                  'uploader': 'Sim Gretina',                  'upload_date': '20130815', +                #'duration': 42,              },          },      ] @@ -119,6 +124,7 @@ class SoundcloudIE(InfoExtractor):              'title': info['title'],              'description': info['description'],              'thumbnail': thumbnail, +            'duration': int_or_none(info.get('duration'), 1000),          }          formats = []          if info.get('downloadable', False): diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py new file mode 100644 index 000000000..7f388aced --- /dev/null +++ b/youtube_dl/extractor/spiegeltv.py @@ -0,0 +1,81 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +from .common import InfoExtractor + + +class SpiegeltvIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/filme/(?P<id>[\-a-z0-9]+)' +    _TEST = { +        'url': 'http://www.spiegel.tv/filme/flug-mh370/', +        'info_dict': { +            'id': 'flug-mh370', +            'ext': 'm4v', +            'title': 'Flug MH370', +            'description': 'Das Rätsel um die Boeing 777 der Malaysia-Airlines', +            'thumbnail': 're:http://.*\.jpg$', +        }, +        'params': { +            # rtmp download +            'skip_download': True, +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) +        title = self._html_search_regex(r'<h1.*?>(.*?)</h1>', webpage, 'title') + +        apihost = 'http://spiegeltv-ivms2-restapi.s3.amazonaws.com' +        version_json = self._download_json( +            '%s/version.json' % apihost, video_id, +            note='Downloading version information') +        version_name = version_json['version_name'] + +        slug_json = self._download_json( +            '%s/%s/restapi/slugs/%s.json' % (apihost, version_name, video_id), +            video_id, +            note='Downloading object information') +        oid = slug_json['object_id'] + +        media_json = self._download_json( +            '%s/%s/restapi/media/%s.json' % (apihost, version_name, oid), +            video_id, note='Downloading media information') +        uuid = media_json['uuid'] +        is_wide = media_json['is_wide'] + +        server_json = self._download_json( +            'http://www.spiegel.tv/streaming_servers/', video_id, +            note='Downloading server information') +        server = server_json[0]['endpoint'] + +        thumbnails = [] +        for image in media_json['images']: +            thumbnails.append({ +                'url': image['url'], +                'width': image['width'], +                'height': image['height'], +            }) + +        description = media_json['subtitle'] +        duration = media_json['duration_in_ms'] / 1000. + +        if is_wide: +            format = '16x9' +        else: +            format = '4x3' + +        url = server + 'mp4:' + uuid + '_spiegeltv_0500_' + format + '.m4v' + +        return { +            'id': video_id, +            'title': title, +            'url': url, +            'ext': 'm4v', +            'description': description, +            'duration': duration, +            'thumbnails': thumbnails +        }
\ No newline at end of file diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py index 7362904db..73efe9542 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/youtube_dl/extractor/streamcz.py @@ -5,13 +5,16 @@ import re  import json  from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( +    int_or_none, +    compat_str, +)  class StreamCZIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?stream\.cz/.+/(?P<videoid>.+)' -    _TEST = { +    _TESTS = [{          'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti',          'md5': '6d3ca61a8d0633c9c542b92fcb936b0c',          'info_dict': { @@ -22,7 +25,18 @@ class StreamCZIE(InfoExtractor):              'thumbnail': 'http://im.stream.cz/episode/52961d7e19d423f8f06f0100',              'duration': 256,          }, -    } +    }, { +        'url': 'http://www.stream.cz/blanik/10002447-tri-roky-pro-mazanka', +        'md5': '246272e753e26bbace7fcd9deca0650c', +        'info_dict': { +            'id': '10002447', +            'ext': 'mp4', +            'title': 'Kancelář Blaník: Tři roky pro Mazánka', +            'description': 'md5:9177695a8b756a0a8ab160de4043b392', +            'thumbnail': 'http://im.stream.cz/episode/537f838c50c11f8d21320000', +            'duration': 368, +        }, +    }]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) @@ -57,7 +71,7 @@ class StreamCZIE(InfoExtractor):          self._sort_formats(formats)          return { -            'id': str(jsonData['id']), +            'id': compat_str(jsonData['episode_id']),              'title': self._og_search_title(webpage),              'thumbnail': jsonData['episode_image_original_url'].replace('//', 'http://'),              'formats': formats, diff --git a/youtube_dl/extractor/swrmediathek.py b/youtube_dl/extractor/swrmediathek.py new file mode 100644 index 000000000..6c688c520 --- /dev/null +++ b/youtube_dl/extractor/swrmediathek.py @@ -0,0 +1,104 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import parse_duration + + +class SWRMediathekIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?swrmediathek\.de/player\.htm\?show=(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + +    _TESTS = [{ +        'url': 'http://swrmediathek.de/player.htm?show=849790d0-dab8-11e3-a953-0026b975f2e6', +        'md5': '8c5f6f0172753368547ca8413a7768ac', +        'info_dict': { +            'id': '849790d0-dab8-11e3-a953-0026b975f2e6', +            'ext': 'mp4', +            'title': 'SWR odysso', +            'description': 'md5:2012e31baad36162e97ce9eb3f157b8a', +            'thumbnail': 're:^http:.*\.jpg$', +            'duration': 2602, +            'upload_date': '20140515', +            'uploader': 'SWR Fernsehen', +            'uploader_id': '990030', +        }, +    }, { +        'url': 'http://swrmediathek.de/player.htm?show=0e1a8510-ddf2-11e3-9be3-0026b975f2e6', +        'md5': 'b10ab854f912eecc5a6b55cd6fc1f545', +        'info_dict': { +            'id': '0e1a8510-ddf2-11e3-9be3-0026b975f2e6', +            'ext': 'mp4', +            'title': 'Nachtcafé - Alltagsdroge Alkohol - zwischen Sektempfang und Komasaufen', +            'description': 'md5:e0a3adc17e47db2c23aab9ebc36dbee2', +            'thumbnail': 're:http://.*\.jpg', +            'duration': 5305, +            'upload_date': '20140516', +            'uploader': 'SWR Fernsehen', +            'uploader_id': '990030', +        }, +    }, { +        'url': 'http://swrmediathek.de/player.htm?show=bba23e10-cb93-11e3-bf7f-0026b975f2e6', +        'md5': '4382e4ef2c9d7ce6852535fa867a0dd3', +        'info_dict': { +            'id': 'bba23e10-cb93-11e3-bf7f-0026b975f2e6', +            'ext': 'mp3', +            'title': 'Saša Stanišic: Vor dem Fest', +            'description': 'md5:5b792387dc3fbb171eb709060654e8c9', +            'thumbnail': 're:http://.*\.jpg', +            'duration': 3366, +            'upload_date': '20140520', +            'uploader': 'SWR 2', +            'uploader_id': '284670', +        } +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        video = self._download_json( +            'http://swrmediathek.de/AjaxEntry?ekey=%s' % video_id, video_id, 'Downloading video JSON') + +        attr = video['attr'] +        media_type = attr['entry_etype'] + +        formats = [] +        for entry in video['sub']: +            if entry['name'] != 'entry_media': +                continue + +            entry_attr = entry['attr'] +            codec = entry_attr['val0'] +            quality = int(entry_attr['val1']) + +            fmt = { +                'url': entry_attr['val2'], +                'quality': quality, +            } + +            if media_type == 'Video': +                fmt.update({ +                    'format_note': ['144p', '288p', '544p'][quality-1], +                    'vcodec': codec, +                }) +            elif media_type == 'Audio': +                fmt.update({ +                    'acodec': codec, +                }) +            formats.append(fmt) + +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': attr['entry_title'], +            'description': attr['entry_descl'], +            'thumbnail': attr['entry_image_16_9'], +            'duration': parse_duration(attr['entry_durat']), +            'upload_date': attr['entry_pdatet'][:-4], +            'uploader': attr['channel_title'], +            'uploader_id': attr['channel_idkey'], +            'formats': formats, +        }
\ No newline at end of file diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py new file mode 100644 index 000000000..36331529e --- /dev/null +++ b/youtube_dl/extractor/tagesschau.py @@ -0,0 +1,79 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class TagesschauIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/video/video(?P<id>-?[0-9]+)\.html' + +    _TESTS = [{ +        'url': 'http://www.tagesschau.de/multimedia/video/video1399128.html', +        'md5': 'bcdeac2194fb296d599ce7929dfa4009', +        'info_dict': { +            'id': '1399128', +            'ext': 'mp4', +            'title': 'Harald Range, Generalbundesanwalt, zu den Ermittlungen', +            'description': 'md5:69da3c61275b426426d711bde96463ab', +            'thumbnail': 're:^http:.*\.jpg$', +        }, +    }, { +        'url': 'http://www.tagesschau.de/multimedia/video/video-196.html', +        'md5': '8aaa8bf3ae1ca2652309718c03019128', +        'info_dict': { +            'id': '196', +            'ext': 'mp4', +            'title': 'Ukraine-Konflikt: Klitschko in Kiew als Bürgermeister vereidigt', +            'description': 'md5:f22e4af75821d174fa6c977349682691', +            'thumbnail': 're:http://.*\.jpg', +        }, +    }] + +    _FORMATS = { +        's': {'width': 256, 'height': 144, 'quality': 1}, +        'm': {'width': 512, 'height': 288, 'quality': 2}, +        'l': {'width': 960, 'height': 544, 'quality': 3}, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        if video_id.startswith('-'): +            display_id = video_id.strip('-') +        else: +            display_id = video_id + +        webpage = self._download_webpage(url, display_id) + +        playerpage = self._download_webpage( +            'http://www.tagesschau.de/multimedia/video/video%s~player_autoplay-true.html' % video_id, +            display_id, 'Downloading player page') + +        medias = re.findall( +            r'"(http://media.+?)", type:"video/(.+?)", quality:"(.+?)"', +            playerpage) + +        formats = [] +        for url, ext, res in medias: +            f = { +                'format_id': res + '_' + ext, +                'url': url, +                'ext': ext, +            } +            f.update(self._FORMATS.get(res, {})) +            formats.append(f) + +        self._sort_formats(formats) + +        thumbnail = re.findall(r'"(/multimedia/.+?\.jpg)"', playerpage)[-1] + +        return { +            'id': display_id, +            'title': self._og_search_title(webpage).strip(), +            'thumbnail': 'http://www.tagesschau.de' + thumbnail, +            'formats': formats, +            'description': self._og_search_description(webpage).strip(), +        } diff --git a/youtube_dl/extractor/teachertube.py b/youtube_dl/extractor/teachertube.py new file mode 100644 index 000000000..6d52763f9 --- /dev/null +++ b/youtube_dl/extractor/teachertube.py @@ -0,0 +1,93 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    qualities, +    determine_ext, +) + + +class TeacherTubeIE(InfoExtractor): +    IE_NAME = 'teachertube' +    IE_DESC = 'teachertube.com videos' + +    _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(viewVideo\.php\?video_id=|music\.php\?music_id=)(?P<id>\d+)' + +    _TESTS = [{ +        'url': 'http://www.teachertube.com/viewVideo.php?video_id=339997', +        'md5': 'f9434ef992fd65936d72999951ee254c', +        'info_dict': { +            'id': '339997', +            'ext': 'mp4', +            'title': 'Measures of dispersion from a frequency table_x264', +            'description': 'md5:a3e9853487185e9fcd7181a07164650b', +            'thumbnail': 're:http://.*\.jpg', +        }, +    }, { +        'url': 'http://www.teachertube.com/viewVideo.php?video_id=340064', +        'md5': '0d625ec6bc9bf50f70170942ad580676', +        'info_dict': { +            'id': '340064', +            'ext': 'mp4', +            'title': 'How to Make Paper Dolls _ Paper Art Projects', +            'description': 'md5:2ca52b20cd727773d1dc418b3d6bd07b', +            'thumbnail': 're:http://.*\.jpg', +        }, +    }, { +        'url': 'http://www.teachertube.com/music.php?music_id=8805', +        'md5': '01e8352006c65757caf7b961f6050e21', +        'info_dict': { +            'id': '8805', +            'ext': 'mp3', +            'title': 'PER ASPERA AD ASTRA', +            'description': 'RADIJSKA EMISIJA ZRAKOPLOVNE TEHNIČKE ŠKOLE PER ASPERA AD ASTRA', +        }, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) + +        quality = qualities(['mp3', 'flv', 'mp4']) + +        formats = [ +            { +                'url': media_url, +                'quality': quality(determine_ext(media_url)) +            } for media_url in set(zip(*re.findall(r'([\'"])file\1\s*:\s*"([^"]+)"', webpage))[1]) +        ] + +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': self._og_search_title(webpage), +            'thumbnail': self._og_search_thumbnail(webpage), +            'formats': formats, +            'description': self._og_search_description(webpage), +        } + + +class TeacherTubeClassroomIE(InfoExtractor): +    IE_NAME = 'teachertube:classroom' +    IE_DESC = 'teachertube.com online classrooms' + +    _VALID_URL = r'https?://(?:www\.)?teachertube\.com/view_classroom\.php\?user=(?P<user>[0-9a-zA-Z]+)' + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        user_id = mobj.group('user') + +        rss = self._download_xml('http://www.teachertube.com/rssclassroom.php?mode=user&username=%s' % user_id, +                                      user_id, 'Downloading classroom RSS') + +        entries = [] +        for url in rss.findall('.//{http://search.yahoo.com/mrss/}player'): +            entries.append(self.url_result(url.attrib['url'], 'TeacherTube')) + +        return self.playlist_result(entries, user_id) diff --git a/youtube_dl/extractor/teachingchannel.py b/youtube_dl/extractor/teachingchannel.py new file mode 100644 index 000000000..117afa9bf --- /dev/null +++ b/youtube_dl/extractor/teachingchannel.py @@ -0,0 +1,33 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .ooyala import OoyalaIE + + +class TeachingChannelIE(InfoExtractor): +    _VALID_URL = r'https?://www\.teachingchannel\.org/videos/(?P<title>.+)' + +    _TEST = { +        'url': 'https://www.teachingchannel.org/videos/teacher-teaming-evolution', +        'info_dict': { +            'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM', +            'ext': 'mp4', +            'title': 'A History of Teaming', +            'description': 'md5:2a9033db8da81f2edffa4c99888140b3', +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        title = mobj.group('title') +        webpage = self._download_webpage(url, title) +        ooyala_code = self._search_regex( +            r'data-embed-code=\'(.+?)\'', webpage, 'ooyala code') + +        return OoyalaIE._build_url_result(ooyala_code) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index f15780ef5..b6b2dba9c 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals +  import re  import json @@ -18,17 +20,17 @@ class ThePlatformIE(InfoExtractor):      _TEST = {          # from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/ -        u'url': u'http://link.theplatform.com/s/dJ5BDC/e9I_cZgTgIPd/meta.smil?format=smil&Tracking=true&mbr=true', -        u'info_dict': { -            u'id': u'e9I_cZgTgIPd', -            u'ext': u'flv', -            u'title': u'Blackberry\'s big, bold Z30', -            u'description': u'The Z30 is Blackberry\'s biggest, baddest mobile messaging device yet.', -            u'duration': 247, +        'url': 'http://link.theplatform.com/s/dJ5BDC/e9I_cZgTgIPd/meta.smil?format=smil&Tracking=true&mbr=true', +        'info_dict': { +            'id': 'e9I_cZgTgIPd', +            'ext': 'flv', +            'title': 'Blackberry\'s big, bold Z30', +            'description': 'The Z30 is Blackberry\'s biggest, baddest mobile messaging device yet.', +            'duration': 247,          }, -        u'params': { +        'params': {              # rtmp download -            u'skip_download': True, +            'skip_download': True,          },      } @@ -39,7 +41,7 @@ class ThePlatformIE(InfoExtractor):              error_msg = next(                  n.attrib['abstract']                  for n in meta.findall(_x('.//smil:ref')) -                if n.attrib.get('title') == u'Geographic Restriction') +                if n.attrib.get('title') == 'Geographic Restriction')          except StopIteration:              pass          else: @@ -101,8 +103,7 @@ class ThePlatformIE(InfoExtractor):              config_url = url+ '&form=json'              config_url = config_url.replace('swf/', 'config/')              config_url = config_url.replace('onsite/', 'onsite/config/') -            config_json = self._download_webpage(config_url, video_id, u'Downloading config') -            config = json.loads(config_json) +            config = self._download_json(config_url, video_id, 'Downloading config')              smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4&manifest=f4m'          else:              smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?' diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index e4bb3b949..488b10df9 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -11,29 +11,36 @@ from ..utils import (  class UstreamIE(InfoExtractor): -    _VALID_URL = r'https?://www\.ustream\.tv/(?P<type>recorded|embed)/(?P<videoID>\d+)' +    _VALID_URL = r'https?://www\.ustream\.tv/(?P<type>recorded|embed|embed/recorded)/(?P<videoID>\d+)'      IE_NAME = 'ustream'      _TEST = {          'url': 'http://www.ustream.tv/recorded/20274954', -        'file': '20274954.flv',          'md5': '088f151799e8f572f84eb62f17d73e5c',          'info_dict': { -            "uploader": "Young Americans for Liberty", -            "title": "Young Americans for Liberty February 7, 2012 2:28 AM", +            'id': '20274954', +            'ext': 'flv', +            'uploader': 'Young Americans for Liberty', +            'title': 'Young Americans for Liberty February 7, 2012 2:28 AM',          },      }      def _real_extract(self, url):          m = re.match(self._VALID_URL, url) +        video_id = m.group('videoID') + +        # some sites use this embed format (see: http://github.com/rg3/youtube-dl/issues/2990) +        if m.group('type') == 'embed/recorded': +            video_id = m.group('videoID') +            desktop_url = 'http://www.ustream.tv/recorded/' + video_id +            return self.url_result(desktop_url, 'Ustream')          if m.group('type') == 'embed':              video_id = m.group('videoID')              webpage = self._download_webpage(url, video_id) -            desktop_video_id = self._html_search_regex(r'ContentVideoIds=\["([^"]*?)"\]', webpage, 'desktop_video_id') +            desktop_video_id = self._html_search_regex( +                r'ContentVideoIds=\["([^"]*?)"\]', webpage, 'desktop_video_id')              desktop_url = 'http://www.ustream.tv/recorded/' + desktop_video_id              return self.url_result(desktop_url, 'Ustream') -        video_id = m.group('videoID') -          video_url = 'http://tcdn.ustream.tv/video/%s' % video_id          webpage = self._download_webpage(url, video_id) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index ea34a8f16..eada13ce9 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -16,7 +16,7 @@ class VevoIE(InfoExtractor):      (currently used by MTVIE)      """      _VALID_URL = r'''(?x) -        (?:https?://www\.vevo\.com/watch/(?:[^/]+/[^/]+/)?| +        (?:https?://www\.vevo\.com/watch/(?:[^/]+/(?:[^/]+/)?)?|             https?://cache\.vevo\.com/m/html/embed\.html\?video=|             https?://videoplayer\.vevo\.com/embed/embedded\?videoId=|             vevo:) diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 85e99e1b0..7e0044824 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -5,18 +5,21 @@ import re  from .common import InfoExtractor  from ..utils import (      compat_urllib_parse, +    ExtractorError, +    clean_html,  )  class XVideosIE(InfoExtractor):      _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'      _TEST = { -        'url': 'http://www.xvideos.com/video939581/funny_porns_by_s_-1', -        'file': '939581.flv', -        'md5': '1d0c835822f0a71a7bf011855db929d0', +        'url': 'http://www.xvideos.com/video4588838/biker_takes_his_girl', +        'md5': '4b46ae6ea5e6e9086e714d883313c0c9',          'info_dict': { -            "title": "Funny Porns By >>>>S<<<<<< -1", -            "age_limit": 18, +            'id': '4588838', +            'ext': 'flv', +            'title': 'Biker Takes his Girl', +            'age_limit': 18,          }      } @@ -28,6 +31,10 @@ class XVideosIE(InfoExtractor):          self.report_extraction(video_id) +        mobj = re.search(r'<h1 class="inlineError">(.+?)</h1>', webpage) +        if mobj: +            raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), expected=True) +          # Extract video URL          video_url = compat_urllib_parse.unquote(              self._search_regex(r'flv_url=(.+?)&', webpage, 'video URL')) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 393f6ffbe..d84be2562 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -21,7 +21,7 @@ class YahooIE(InfoExtractor):              'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',              'md5': '4962b075c08be8690a922ee026d05e69',              'info_dict': { -                'id': '214727115', +                'id': '2d25e626-2378-391f-ada0-ddaf1417e588',                  'ext': 'mp4',                  'title': 'Julian Smith & Travis Legg Watch Julian Smith',                  'description': 'Julian and Travis watch Julian Smith', @@ -31,7 +31,7 @@ class YahooIE(InfoExtractor):              'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',              'md5': 'd6e6fc6e1313c608f316ddad7b82b306',              'info_dict': { -                'id': '103000935', +                'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9',                  'ext': 'mp4',                  'title': 'Codefellas - The Cougar Lies with Spanish Moss',                  'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?', @@ -58,9 +58,11 @@ class YahooIE(InfoExtractor):              r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE,              default=None)          if items_json is None: -            long_id = self._search_regex( +            CONTENT_ID_REGEXES = [                  r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"', -                webpage, 'content ID') +                r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"' +            ] +            long_id = self._search_regex(CONTENT_ID_REGEXES, webpage, 'content ID')              video_id = long_id          else:              items = json.loads(items_json) @@ -68,9 +70,9 @@ class YahooIE(InfoExtractor):              # The 'meta' field is not always in the video webpage, we request it              # from another page              long_id = info['id'] -        return self._get_info(long_id, video_id) +        return self._get_info(long_id, video_id, webpage) -    def _get_info(self, long_id, video_id): +    def _get_info(self, long_id, video_id, webpage):          query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"'                   ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"'                   ' AND protocol="http"' % long_id) @@ -113,7 +115,7 @@ class YahooIE(InfoExtractor):              'title': meta['title'],              'formats': formats,              'description': clean_html(meta['description']), -            'thumbnail': meta['thumbnail'], +            'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage),          } @@ -137,7 +139,7 @@ class YahooNewsIE(YahooIE):          video_id = mobj.group('id')          webpage = self._download_webpage(url, video_id)          long_id = self._search_regex(r'contentId: \'(.+?)\',', webpage, 'long id') -        return self._get_info(long_id, video_id) +        return self._get_info(long_id, video_id, webpage)  class YahooSearchIE(SearchInfoExtractor): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 981ca62c0..7c50881c4 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -223,6 +223,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, +        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          # Dash webm audio          '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50}, @@ -1140,7 +1141,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)          if mobj is None:              mobj = re.search( -                r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded) on (.*?)</strong>', +                r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',                  video_webpage)          if mobj is not None:              upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) @@ -1414,11 +1415,9 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):          title_span = (search_title('playlist-title') or              search_title('title long-title') or search_title('title'))          title = clean_html(title_span) -        video_re = r'''(?x)data-video-username="(.*?)".*? +        video_re = r'''(?x)data-video-username=".*?".*?                         href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id) -        matches = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL)) -        # Some of the videos may have been deleted, their username field is empty -        ids = [video_id for (username, video_id) in matches if username] +        ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))          url_results = self._ids_to_results(ids)          return self.playlist_result(url_results, playlist_id, title) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 04a04f579..47dde62b9 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.05.16.1' +__version__ = '2014.06.04' | 
