diff options
37 files changed, 745 insertions, 196 deletions
| @@ -70,8 +70,9 @@ which means you can modify it, redistribute it or use it however you like.      --default-search PREFIX          Use this prefix for unqualified URLs. For                                       example "gvsearch2:" downloads two videos                                       from google videos for  youtube-dl "large -                                     apple". By default (with value "auto") -                                     youtube-dl guesses. +                                     apple". Use the value "auto" to let +                                     youtube-dl guess. The default value "error" +                                     just throws an error.      --ignore-config                  Do not read configuration files. When given                                       in the global configuration file /etc                                       /youtube-dl.conf: do not read the user diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 4b56137ce..2bc81f020 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -69,9 +69,6 @@ class TestAllURLsMatching(unittest.TestCase):      def test_youtube_show_matching(self):          self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show']) -    def test_youtube_truncated(self): -        self.assertMatch('http://www.youtube.com/watch?', ['youtube:truncated_url']) -      def test_youtube_search_matching(self):          self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])          self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) diff --git a/test/test_playlists.py b/test/test_playlists.py index 42051fe2a..994b1d4b0 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -28,8 +28,9 @@ from youtube_dl.extractor import (      SoundcloudSetIE,      SoundcloudUserIE,      SoundcloudPlaylistIE, -    TeacherTubeClassroomIE, +    TeacherTubeUserIE,      LivestreamIE, +    LivestreamOriginalIE,      NHLVideocenterIE,      BambuserChannelIE,      BandcampAlbumIE, @@ -155,6 +156,14 @@ class TestPlaylists(unittest.TestCase):          self.assertEqual(result['title'], 'TEDCity2.0 (English)')          self.assertTrue(len(result['entries']) >= 4) +    def test_livestreamoriginal_folder(self): +        dl = FakeYDL() +        ie = LivestreamOriginalIE(dl) +        result = ie.extract('https://www.livestream.com/newplay/folder?dirId=a07bf706-d0e4-4e75-a747-b021d84f2fd3') +        self.assertIsPlaylist(result) +        self.assertEqual(result['id'], 'a07bf706-d0e4-4e75-a747-b021d84f2fd3') +        self.assertTrue(len(result['entries']) >= 28) +      def test_nhl_videocenter(self):          dl = FakeYDL()          ie = NHLVideocenterIE(dl) @@ -370,13 +379,13 @@ class TestPlaylists(unittest.TestCase):              result['title'], 'Brace Yourself - Today\'s Weirdest News')          self.assertTrue(len(result['entries']) >= 10) -    def test_TeacherTubeClassroom(self): +    def test_TeacherTubeUser(self):          dl = FakeYDL() -        ie = TeacherTubeClassroomIE(dl) -        result = ie.extract('http://www.teachertube.com/view_classroom.php?user=rbhagwati2') +        ie = TeacherTubeUserIE(dl) +        result = ie.extract('http://www.teachertube.com/user/profile/rbhagwati2')          self.assertIsPlaylist(result)          self.assertEqual(result['id'], 'rbhagwati2') -        self.assertTrue(len(result['entries']) >= 20) +        self.assertTrue(len(result['entries']) >= 179)  if __name__ == '__main__':      unittest.main() diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 8417c55a6..8d46fe108 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -33,6 +33,12 @@ _TESTS = [          90,          u']\\[@?>=<;:/.-,+*)(\'&%$#"hZYXWVUTSRQPONMLKJIHGFEDCBAzyxwvutsrqponmlkjiagfedcb39876',      ), +    ( +        u'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', +        u'js', +        u'2ACFC7A61CA478CD21425E5A57EBD73DDC78E22A.2094302436B2D377D14A3BBA23022D023B8BC25AA', +        u'A52CB8B320D22032ABB3A41D773D2B6342034902.A22E87CDD37DBE75A5E52412DC874AC16A7CFCA2', +    ),  ] @@ -44,7 +50,7 @@ class TestSignature(unittest.TestCase):              os.mkdir(self.TESTDATA_DIR) -def make_tfunc(url, stype, sig_length, expected_sig): +def make_tfunc(url, stype, sig_input, expected_sig):      basename = url.rpartition('/')[2]      m = re.match(r'.*-([a-zA-Z0-9_-]+)\.[a-z]+$', basename)      assert m, '%r should follow URL format' % basename @@ -66,7 +72,9 @@ def make_tfunc(url, stype, sig_length, expected_sig):              with open(fn, 'rb') as testf:                  swfcode = testf.read()              func = ie._parse_sig_swf(swfcode) -        src_sig = compat_str(string.printable[:sig_length]) +        src_sig = ( +            compat_str(string.printable[:sig_input]) +            if isinstance(sig_input, int) else sig_input)          got_sig = func(src_sig)          self.assertEqual(got_sig, expected_sig) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index dc0ba986a..3dff723b8 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -993,6 +993,8 @@ class YoutubeDL(object):                          fd = get_suitable_downloader(info)(self, self.params)                          for ph in self._progress_hooks:                              fd.add_progress_hook(ph) +                        if self.params.get('verbose'): +                            self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))                          return fd.download(name, info)                      if info_dict.get('requested_formats') is not None:                          downloaded = [] diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 1e01432d2..31ed63fcc 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -59,6 +59,7 @@ __authors__  = (      'Adam Thalhammer',      'Georg Jähnig',      'Ralf Haring', +    'Koki Takahashi',  )  __license__ = 'Public Domain' @@ -269,7 +270,7 @@ def parseOpts(overrideArguments=None):      general.add_option(          '--default-search',          dest='default_search', metavar='PREFIX', -        help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for  youtube-dl "large apple". By default (with value "auto") youtube-dl guesses.') +        help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for  youtube-dl "large apple". Use the value "auto" to let youtube-dl guess. The default value "error" just throws an error.')      general.add_option(          '--ignore-config',          action='store_true', diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1666aa372..7b3f9ae24 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -3,6 +3,7 @@ from .addanime import AddAnimeIE  from .aftonbladet import AftonbladetIE  from .anitube import AnitubeIE  from .aol import AolIE +from .allocine import AllocineIE  from .aparat import AparatIE  from .appletrailers import AppleTrailersIE  from .archiveorg import ArchiveOrgIE @@ -63,6 +64,7 @@ from .dailymotion import (  from .daum import DaumIE  from .dotsub import DotsubIE  from .dreisat import DreiSatIE +from .drtv import DRTVIE  from .defense import DefenseGouvFrIE  from .discovery import DiscoveryIE  from .divxstage import DivxStageIE @@ -147,7 +149,11 @@ from .ku6 import Ku6IE  from .la7 import LA7IE  from .lifenews import LifeNewsIE  from .liveleak import LiveLeakIE -from .livestream import LivestreamIE, LivestreamOriginalIE +from .livestream import ( +    LivestreamIE, +    LivestreamOriginalIE, +    LivestreamShortenerIE, +)  from .lynda import (      LyndaIE,      LyndaCourseIE @@ -165,6 +171,7 @@ from .mpora import MporaIE  from .mofosex import MofosexIE  from .mooshare import MooshareIE  from .morningstar import MorningstarIE +from .motherless import MotherlessIE  from .motorsport import MotorsportIE  from .moviezine import MoviezineIE  from .movshare import MovShareIE @@ -197,6 +204,7 @@ from .normalboots import NormalbootsIE  from .novamov import NovaMovIE  from .nowness import NownessIE  from .nowvideo import NowVideoIE +from .npo import NPOIE  from .nrk import (      NRKIE,      NRKTVIE, @@ -255,6 +263,7 @@ from .soundcloud import (      SoundcloudUserIE,      SoundcloudPlaylistIE  ) +from .soundgasm import SoundgasmIE  from .southparkstudios import (      SouthParkStudiosIE,      SouthparkDeIE, @@ -274,7 +283,7 @@ from .sztvhu import SztvHuIE  from .tagesschau import TagesschauIE  from .teachertube import (      TeacherTubeIE, -    TeacherTubeClassroomIE, +    TeacherTubeUserIE,  )  from .teachingchannel import TeachingChannelIE  from .teamcoco import TeamcocoIE diff --git a/youtube_dl/extractor/allocine.py b/youtube_dl/extractor/allocine.py new file mode 100644 index 000000000..34f0cd49b --- /dev/null +++ b/youtube_dl/extractor/allocine.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..utils import ( +    compat_str, +    qualities, +    determine_ext, +) + + +class AllocineIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?allocine\.fr/(?P<typ>article|video|film)/(fichearticle_gen_carticle=|player_gen_cmedia=|fichefilm_gen_cfilm=)(?P<id>[0-9]+)(?:\.html)?' + +    _TESTS = [{ +        'url': 'http://www.allocine.fr/article/fichearticle_gen_carticle=18635087.html', +        'md5': '0c9fcf59a841f65635fa300ac43d8269', +        'info_dict': { +            'id': '19546517', +            'ext': 'mp4', +            'title': 'Astérix - Le Domaine des Dieux Teaser VF', +            'description': 'md5:4a754271d9c6f16c72629a8a993ee884', +            'thumbnail': 're:http://.*\.jpg', +        }, +    }, { +        'url': 'http://www.allocine.fr/video/player_gen_cmedia=19540403&cfilm=222257.html', +        'md5': 'd0cdce5d2b9522ce279fdfec07ff16e0', +        'info_dict': { +            'id': '19540403', +            'ext': 'mp4', +            'title': 'Planes 2 Bande-annonce VF', +            'description': 'md5:c4b1f7bd682a91de6491ada267ec0f4d', +            'thumbnail': 're:http://.*\.jpg', +        }, +    }, { +        'url': 'http://www.allocine.fr/film/fichefilm_gen_cfilm=181290.html', +        'md5': '101250fb127ef9ca3d73186ff22a47ce', +        'info_dict': { +            'id': '19544709', +            'ext': 'mp4', +            'title': 'Dragons 2 - Bande annonce finale VF', +            'description': 'md5:e74a4dc750894bac300ece46c7036490', +            'thumbnail': 're:http://.*\.jpg', +        }, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        typ = mobj.group('typ') +        display_id = mobj.group('id') + +        webpage = self._download_webpage(url, display_id) + +        if typ == 'film': +            video_id = self._search_regex(r'href="/video/player_gen_cmedia=([0-9]+).+"', webpage, 'video id') +        else: +            player = self._search_regex(r'data-player=\'([^\']+)\'>', webpage, 'data player') + +            player_data = json.loads(player) +            video_id = compat_str(player_data['refMedia']) + +        xml = self._download_xml('http://www.allocine.fr/ws/AcVisiondataV4.ashx?media=%s' % video_id, display_id) + +        video = xml.find('.//AcVisionVideo').attrib +        quality = qualities(['ld', 'md', 'hd']) + +        formats = [] +        for k, v in video.items(): +            if re.match(r'.+_path', k): +                format_id = k.split('_')[0] +                formats.append({ +                    'format_id': format_id, +                    'quality': quality(format_id), +                    'url': v, +                    'ext': determine_ext(v), +                }) + +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': video['videoTitle'], +            'thumbnail': self._og_search_thumbnail(webpage), +            'formats': formats, +            'description': self._og_search_description(webpage), +        } diff --git a/youtube_dl/extractor/anitube.py b/youtube_dl/extractor/anitube.py index 2b019daa9..31f0d417c 100644 --- a/youtube_dl/extractor/anitube.py +++ b/youtube_dl/extractor/anitube.py @@ -1,22 +1,24 @@ +from __future__ import unicode_literals +  import re  from .common import InfoExtractor  class AnitubeIE(InfoExtractor): -    IE_NAME = u'anitube.se' +    IE_NAME = 'anitube.se'      _VALID_URL = r'https?://(?:www\.)?anitube\.se/video/(?P<id>\d+)'      _TEST = { -        u'url': u'http://www.anitube.se/video/36621', -        u'md5': u'59d0eeae28ea0bc8c05e7af429998d43', -        u'file': u'36621.mp4', -        u'info_dict': { -            u'id': u'36621', -            u'ext': u'mp4', -            u'title': u'Recorder to Randoseru 01', +        'url': 'http://www.anitube.se/video/36621', +        'md5': '59d0eeae28ea0bc8c05e7af429998d43', +        'info_dict': { +            'id': '36621', +            'ext': 'mp4', +            'title': 'Recorder to Randoseru 01', +            'duration': 180.19,          }, -        u'skip': u'Blocked in the US', +        'skip': 'Blocked in the US',      }      def _real_extract(self, url): @@ -24,13 +26,15 @@ class AnitubeIE(InfoExtractor):          video_id = mobj.group('id')          webpage = self._download_webpage(url, video_id) -        key = self._html_search_regex(r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)', -                                      webpage, u'key') +        key = self._html_search_regex( +            r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)', webpage, 'key') -        config_xml = self._download_xml('http://www.anitube.se/nuevo/econfig.php?key=%s' % key, -                                                key) +        config_xml = self._download_xml( +            'http://www.anitube.se/nuevo/econfig.php?key=%s' % key, key)          video_title = config_xml.find('title').text +        thumbnail = config_xml.find('image').text +        duration = float(config_xml.find('duration').text)          formats = []          video_url = config_xml.find('file') @@ -49,5 +53,7 @@ class AnitubeIE(InfoExtractor):          return {              'id': video_id,              'title': video_title, +            'thumbnail': thumbnail, +            'duration': duration,              'formats': formats          } diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index b528a9ec5..9591bad8a 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -39,7 +39,10 @@ class ArteTvIE(InfoExtractor):          formats = [{              'forma_id': q.attrib['quality'], -            'url': q.text, +            # The playpath starts at 'mp4:', if we don't manually +            # split the url, rtmpdump will incorrectly parse them +            'url': q.text.split('mp4:', 1)[0], +            'play_path': 'mp4:' + q.text.split('mp4:', 1)[1],              'ext': 'flv',              'quality': 2 if q.attrib['quality'] == 'hd' else 1,          } for q in config.findall('./urls/url')] @@ -111,7 +114,7 @@ class ArteTVPlus7IE(InfoExtractor):          if not formats:              # Some videos are only available in the 'Originalversion'              # they aren't tagged as being in French or German -            if all(f['versionCode'] == 'VO' for f in all_formats): +            if all(f['versionCode'] == 'VO' or f['versionCode'] == 'VA' for f in all_formats):                  formats = all_formats              else:                  raise ExtractorError(u'The formats list is empty') @@ -189,9 +192,10 @@ class ArteTVFutureIE(ArteTVPlus7IE):      _TEST = {          'url': 'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081',          'info_dict': { -            'id': '050940-003', +            'id': '5201',              'ext': 'mp4',              'title': 'Les champignons au secours de la planète', +            'upload_date': '20131101',          },      } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 49e75405e..e4e4feef9 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -459,6 +459,9 @@ class InfoExtractor(object):          if secure: regexes = self._og_regexes('video:secure_url') + regexes          return self._html_search_regex(regexes, html, name, **kargs) +    def _og_search_url(self, html, **kargs): +        return self._og_search_property('url', html, **kargs) +      def _html_search_meta(self, name, html, display_name=None, fatal=False):          if display_name is None:              display_name = name diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 55216201f..5d0bfe454 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -150,7 +150,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):          return {              'id':       video_id,              'formats': formats, -            'uploader': info['owner_screenname'], +            'uploader': info['owner.screenname'],              'upload_date':  video_upload_date,              'title':    self._og_search_title(webpage),              'subtitles':    video_subtitles, diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py new file mode 100644 index 000000000..cdccfd376 --- /dev/null +++ b/youtube_dl/extractor/drtv.py @@ -0,0 +1,91 @@ +from __future__ import unicode_literals + +import re + +from .subtitles import SubtitlesInfoExtractor +from .common import ExtractorError +from ..utils import parse_iso8601 + + +class DRTVIE(SubtitlesInfoExtractor): +    _VALID_URL = r'http://(?:www\.)?dr\.dk/tv/se/[^/]+/(?P<id>[\da-z-]+)' + +    _TEST = { +        'url': 'http://www.dr.dk/tv/se/partiets-mand/partiets-mand-7-8', +        'md5': '4a7e1dd65cdb2643500a3f753c942f25', +        'info_dict': { +            'id': 'partiets-mand-7-8', +            'ext': 'mp4', +            'title': 'Partiets mand (7:8)', +            'description': 'md5:a684b90a8f9336cd4aab94b7647d7862', +            'timestamp': 1403047940, +            'upload_date': '20140617', +            'duration': 1299.040, +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        programcard = self._download_json( +            'http://www.dr.dk/mu/programcard/expanded/%s' % video_id, video_id, 'Downloading video JSON') + +        data = programcard['Data'][0] + +        title = data['Title'] +        description = data['Description'] +        timestamp = parse_iso8601(data['CreatedTime'][:-5]) + +        thumbnail = None +        duration = None + +        restricted_to_denmark = False + +        formats = [] +        subtitles = {} + +        for asset in data['Assets']: +            if asset['Kind'] == 'Image': +                thumbnail = asset['Uri'] +            elif asset['Kind'] == 'VideoResource': +                duration = asset['DurationInMilliseconds'] / 1000.0 +                restricted_to_denmark = asset['RestrictedToDenmark'] +                for link in asset['Links']: +                    target = link['Target'] +                    uri = link['Uri'] +                    formats.append({ +                        'url': uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43' if target == 'HDS' else uri, +                        'format_id': target, +                        'ext': link['FileFormat'], +                        'preference': -1 if target == 'HDS' else -2, +                    }) +                subtitles_list = asset.get('SubtitlesList') +                if isinstance(subtitles_list, list): +                    LANGS = { +                        'Danish': 'dk', +                    } +                    for subs in subtitles_list: +                        lang = subs['Language'] +                        subtitles[LANGS.get(lang, lang)] = subs['Uri'] + +        if not formats and restricted_to_denmark: +            raise ExtractorError( +                'Unfortunately, DR is not allowed to show this program outside Denmark.', expected=True) + +        self._sort_formats(formats) + +        if self._downloader.params.get('listsubtitles', False): +            self._list_available_subtitles(video_id, subtitles) +            return + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'timestamp': timestamp, +            'duration': duration, +            'formats': formats, +            'subtitles': self.extract_subtitles(video_id, subtitles), +        } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 9dd03aba4..f97b59845 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -383,7 +383,7 @@ class GenericIE(InfoExtractor):          if not parsed_url.scheme:              default_search = self._downloader.params.get('default_search')              if default_search is None: -                default_search = 'auto_warning' +                default_search = 'error'              if default_search in ('auto', 'auto_warning'):                  if '/' in url: @@ -397,8 +397,13 @@ class GenericIE(InfoExtractor):                                  expected=True)                          else:                              self._downloader.report_warning( -                                'Falling back to youtube search for  %s . Set --default-search to "auto" to suppress this warning.' % url) +                                'Falling back to youtube search for  %s . Set --default-search "auto" to suppress this warning.' % url)                      return self.url_result('ytsearch:' + url) +            elif default_search == 'error': +                raise ExtractorError( +                    ('%r is not a valid URL. ' +                     'Set --default-search "ytseach" (or run  youtube-dl "ytsearch:%s" ) to search YouTube' +                    ) % (url, url), expected=True)              else:                  assert ':' in default_search                  return self.url_result(default_search + url) @@ -620,6 +625,11 @@ class GenericIE(InfoExtractor):          if mobj is not None:              return self.url_result(mobj.group('url'), 'VK') +        # Look for embedded ivi player +        mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage) +        if mobj is not None: +            return self.url_result(mobj.group('url'), 'Ivi') +          # Look for embedded Huffington Post player          mobj = re.search(              r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage) diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index cc29a7e5d..07d994b44 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -52,8 +52,7 @@ class GooglePlusIE(InfoExtractor):          # Extract title          # Get the first line for title -        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', -            webpage, 'title', default='NA') +        video_title = self._og_search_description(webpage).splitlines()[0]          # Step 2, Simulate clicking the image box to launch video          DOMAIN = 'https://plus.google.com/' diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 528be1524..4027deb70 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -14,7 +14,7 @@ from ..utils import (  class IviIE(InfoExtractor):      IE_DESC = 'ivi.ru'      IE_NAME = 'ivi' -    _VALID_URL = r'https?://(?:www\.)?ivi\.ru/watch(?:/(?P<compilationid>[^/]+))?/(?P<videoid>\d+)' +    _VALID_URL = r'https?://(?:www\.)?ivi\.ru/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P<videoid>\d+)'      _TESTS = [          # Single movie diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 5c71f4f09..2c100d424 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -9,6 +9,7 @@ from ..utils import (      compat_urlparse,      xpath_with_ns,      compat_str, +    orderedSet,  ) @@ -64,7 +65,10 @@ class LivestreamIE(InfoExtractor):  # The original version of Livestream uses a different system  class LivestreamOriginalIE(InfoExtractor):      IE_NAME = 'livestream:original' -    _VALID_URL = r'https?://www\.livestream\.com/(?P<user>[^/]+)/video\?.*?clipId=(?P<id>.*?)(&|$)' +    _VALID_URL = r'''(?x)https?://www\.livestream\.com/ +        (?P<user>[^/]+)/(?P<type>video|folder) +        (?:\?.*?Id=|/)(?P<id>.*?)(&|$) +        '''      _TEST = {          'url': 'http://www.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb',          'info_dict': { @@ -78,10 +82,7 @@ class LivestreamOriginalIE(InfoExtractor):          },      } -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') -        user = mobj.group('user') +    def _extract_video(self, user, video_id):          api_url = 'http://x{0}x.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id={1}'.format(user, video_id)          info = self._download_xml(api_url, video_id) @@ -99,3 +100,44 @@ class LivestreamOriginalIE(InfoExtractor):              'ext': 'flv',              'thumbnail': thumbnail_url,          } + +    def _extract_folder(self, url, folder_id): +        webpage = self._download_webpage(url, folder_id) +        urls = orderedSet(re.findall(r'<a href="(https?://livestre\.am/.*?)"', webpage)) + +        return { +            '_type': 'playlist', +            'id': folder_id, +            'entries': [{ +                '_type': 'url', +                'url': video_url, +            } for video_url in urls], +        } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        id = mobj.group('id') +        user = mobj.group('user') +        url_type = mobj.group('type') +        if url_type == 'folder': +            return self._extract_folder(url, id) +        else: +            return self._extract_video(user, id) + + +# The server doesn't support HEAD request, the generic extractor can't detect +# the redirection +class LivestreamShortenerIE(InfoExtractor): +    IE_NAME = 'livestream:shortener' +    IE_DESC = False  # Do not list +    _VALID_URL = r'https?://livestre\.am/(?P<id>.+)' + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        id = mobj.group('id') +        webpage = self._download_webpage(url, id) + +        return { +            '_type': 'url', +            'url': self._og_search_url(webpage), +        } diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py new file mode 100644 index 000000000..6229b2173 --- /dev/null +++ b/youtube_dl/extractor/motherless.py @@ -0,0 +1,87 @@ +from __future__ import unicode_literals + +import datetime +import re + +from .common import InfoExtractor +from ..utils import ( +    int_or_none, +    unified_strdate, +) + + +class MotherlessIE(InfoExtractor): +    _VALID_URL = r'http://(?:www\.)?motherless\.com/(?P<id>[A-Z0-9]+)' +    _TESTS = [ +        { +            'url': 'http://motherless.com/AC3FFE1', +            'md5': '5527fef81d2e529215dad3c2d744a7d9', +            'info_dict': { +                'id': 'AC3FFE1', +                'ext': 'flv', +                'title': 'Fucked in the ass while playing PS3', +                'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'], +                'upload_date': '20100913', +                'uploader_id': 'famouslyfuckedup', +                'thumbnail': 're:http://.*\.jpg', +                'age_limit': 18, +            } +        }, +        { +            'url': 'http://motherless.com/532291B', +            'md5': 'bc59a6b47d1f958e61fbd38a4d31b131', +            'info_dict': { +                'id': '532291B', +                'ext': 'mp4', +                'title': 'Amazing girl playing the omegle game, PERFECT!', +                'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen', 'game', 'hairy'], +                'upload_date': '20140622', +                'uploader_id': 'Sulivana7x', +                'thumbnail': 're:http://.*\.jpg', +                'age_limit': 18, +            } +        } +    ] + +    def _real_extract(self,url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) + +        title = self._html_search_regex(r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') +         +        video_url = self._html_search_regex(r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video_url') +        age_limit = self._rta_search(webpage) + +        view_count = self._html_search_regex(r'<strong>Views</strong>\s+([^<]+)<', webpage, 'view_count') +  +        upload_date = self._html_search_regex(r'<strong>Uploaded</strong>\s+([^<]+)<', webpage, 'upload_date') +        if 'Ago' in upload_date: +            days = int(re.search(r'([0-9]+)', upload_date).group(1)) +            upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d') +        else: +            upload_date = unified_strdate(upload_date) + +        like_count = self._html_search_regex(r'<strong>Favorited</strong>\s+([^<]+)<', webpage, 'like_count') + +        comment_count = webpage.count('class="media-comment-contents"') +        uploader_id = self._html_search_regex(r'"thumb-member-username">\s+<a href="/m/([^"]+)"', webpage, 'uploader_id') + +        categories = self._html_search_meta('keywords', webpage) +        if categories: +            categories = [cat.strip() for cat in categories.split(',')] + +        return { +            'id': video_id, +            'title': title, +            'upload_date': upload_date, +            'uploader_id': uploader_id, +            'thumbnail': self._og_search_thumbnail(webpage), +            'categories': categories, +            'view_count': int_or_none(view_count.replace(',', '')), +            'like_count': int_or_none(like_count.replace(',', '')), +            'comment_count': comment_count, +            'age_limit': age_limit, +            'url': video_url, +        } diff --git a/youtube_dl/extractor/mpora.py b/youtube_dl/extractor/mpora.py index 39d6feb98..387935d4d 100644 --- a/youtube_dl/extractor/mpora.py +++ b/youtube_dl/extractor/mpora.py @@ -28,7 +28,7 @@ class MporaIE(InfoExtractor):          webpage = self._download_webpage(url, video_id)          data_json = self._search_regex( -            r"new FM\.Player\('[^']+',\s*(\{.*?)\);\n", webpage, 'json') +            r"new FM\.Player\('[^']+',\s*(\{.*?)\).player;", webpage, 'json')          data = json.loads(data_json) diff --git a/youtube_dl/extractor/newstube.py b/youtube_dl/extractor/newstube.py index 2fd5b8f04..551bd4d7a 100644 --- a/youtube_dl/extractor/newstube.py +++ b/youtube_dl/extractor/newstube.py @@ -4,18 +4,19 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from ..utils import ExtractorError  class NewstubeIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?newstube\.ru/media/(?P<id>.+)'      _TEST = { -        'url': 'http://newstube.ru/media/na-korable-progress-prodolzhaetsya-testirovanie-sistemy-kurs', +        'url': 'http://www.newstube.ru/media/telekanal-cnn-peremestil-gorod-slavyansk-v-krym',          'info_dict': { -            'id': 'd156a237-a6e9-4111-a682-039995f721f1', +            'id': '728e0ef2-e187-4012-bac0-5a081fdcb1f6',              'ext': 'flv', -            'title': 'На корабле «Прогресс» продолжается тестирование системы «Курс»', -            'description': 'md5:d0cbe7b4a6f600552617e48548d5dc77', -            'duration': 20.04, +            'title': 'Телеканал CNN переместил город Славянск в Крым', +            'description': 'md5:419a8c9f03442bc0b0a794d689360335', +            'duration': 31.05,          },          'params': {              # rtmp download @@ -40,6 +41,10 @@ class NewstubeIE(InfoExtractor):          def ns(s):              return s.replace('/', '/%(ns)s') % {'ns': '{http://app1.newstube.ru/N2SiteWS/player.asmx}'} +        error_message = player.find(ns('./ErrorMessage')) +        if error_message is not None: +            raise ExtractorError('%s returned error: %s' % (self.IE_NAME, error_message.text), expected=True) +          session_id = player.find(ns('./SessionId')).text          media_info = player.find(ns('./Medias/MediaInfo'))          title = media_info.find(ns('./Name')).text diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 517a72561..c0c139b5d 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -8,10 +8,9 @@ from ..utils import (      compat_urllib_parse,      compat_urllib_request,      compat_urlparse, -    compat_str, - -    ExtractorError,      unified_strdate, +    parse_duration, +    int_or_none,  ) @@ -30,6 +29,7 @@ class NiconicoIE(InfoExtractor):              'uploader_id': '2698420',              'upload_date': '20131123',              'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org', +            'duration': 33,          },          'params': {              'username': 'ydl.niconico@gmail.com', @@ -37,17 +37,20 @@ class NiconicoIE(InfoExtractor):          },      } -    _VALID_URL = r'^https?://(?:www\.|secure\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$' +    _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/((?:[a-z]{2})?[0-9]+)'      _NETRC_MACHINE = 'niconico' +    # Determine whether the downloader uses authentication to download video +    _AUTHENTICATE = False      def _real_initialize(self): -        self._login() +        if self._downloader.params.get('username', None) is not None: +            self._AUTHENTICATE = True + +        if self._AUTHENTICATE: +            self._login()      def _login(self):          (username, password) = self._get_login_info() -        if username is None: -            # Login is required -            raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)          # Log in          login_form_strs = { @@ -79,44 +82,66 @@ class NiconicoIE(InfoExtractor):              'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,              note='Downloading video info page') -        # Get flv info -        flv_info_webpage = self._download_webpage( -            'http://flapi.nicovideo.jp/api/getflv?v=' + video_id, -            video_id, 'Downloading flv info') +        if self._AUTHENTICATE: +            # Get flv info +            flv_info_webpage = self._download_webpage( +                'http://flapi.nicovideo.jp/api/getflv?v=' + video_id, +                video_id, 'Downloading flv info') +        else: +            # Get external player info +            ext_player_info = self._download_webpage( +                'http://ext.nicovideo.jp/thumb_watch/' + video_id, video_id) +            thumb_play_key = self._search_regex( +                r'\'thumbPlayKey\'\s*:\s*\'(.*?)\'', ext_player_info, 'thumbPlayKey') + +            # Get flv info +            flv_info_data = compat_urllib_parse.urlencode({ +                'k': thumb_play_key, +                'v': video_id +            }) +            flv_info_request = compat_urllib_request.Request( +                'http://ext.nicovideo.jp/thumb_watch', flv_info_data, +                {'Content-Type': 'application/x-www-form-urlencoded'}) +            flv_info_webpage = self._download_webpage( +                flv_info_request, video_id, +                note='Downloading flv info', errnote='Unable to download flv info') +          video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0]          # Start extracting information -        video_title = video_info.find('.//title').text -        video_extension = video_info.find('.//movie_type').text -        video_format = video_extension.upper() -        video_thumbnail = video_info.find('.//thumbnail_url').text -        video_description = video_info.find('.//description').text -        video_uploader_id = video_info.find('.//user_id').text -        video_upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0]) -        video_view_count = video_info.find('.//view_counter').text -        video_webpage_url = video_info.find('.//watch_url').text - -        # uploader -        video_uploader = video_uploader_id -        url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id -        try: -            user_info = self._download_xml( -                url, video_id, note='Downloading user information') -            video_uploader = user_info.find('.//nickname').text -        except ExtractorError as err: -            self._downloader.report_warning('Unable to download user info webpage: %s' % compat_str(err)) +        title = video_info.find('.//title').text +        extension = video_info.find('.//movie_type').text +        video_format = extension.upper() +        thumbnail = video_info.find('.//thumbnail_url').text +        description = video_info.find('.//description').text +        upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0]) +        view_count = int_or_none(video_info.find('.//view_counter').text) +        comment_count = int_or_none(video_info.find('.//comment_num').text) +        duration = parse_duration(video_info.find('.//length').text) +        webpage_url = video_info.find('.//watch_url').text + +        if video_info.find('.//ch_id') is not None: +            uploader_id = video_info.find('.//ch_id').text +            uploader = video_info.find('.//ch_name').text +        elif video_info.find('.//user_id') is not None: +            uploader_id = video_info.find('.//user_id').text +            uploader = video_info.find('.//user_nickname').text +        else: +            uploader_id = uploader = None          return {              'id': video_id,              'url': video_real_url, -            'title': video_title, -            'ext': video_extension, +            'title': title, +            'ext': extension,              'format': video_format, -            'thumbnail': video_thumbnail, -            'description': video_description, -            'uploader': video_uploader, -            'upload_date': video_upload_date, -            'uploader_id': video_uploader_id, -            'view_count': video_view_count, -            'webpage_url': video_webpage_url, +            'thumbnail': thumbnail, +            'description': description, +            'uploader': uploader, +            'upload_date': upload_date, +            'uploader_id': uploader_id, +            'view_count': view_count, +            'comment_count': comment_count, +            'duration': duration, +            'webpage_url': webpage_url,          } diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py index c2e7b67c7..33daa0dec 100644 --- a/youtube_dl/extractor/ninegag.py +++ b/youtube_dl/extractor/ninegag.py @@ -47,7 +47,7 @@ class NineGagIE(InfoExtractor):          webpage = self._download_webpage(url, display_id)          post_view = json.loads(self._html_search_regex( -            r'var postView = new app\.PostView\({\s*post:\s*({.+?}),', webpage, 'post view')) +            r'var postView = new app\.PostView\({\s*post:\s*({.+?}),\s*posts:\s*prefetchedCurrentPost', webpage, 'post view'))          youtube_id = post_view['videoExternalId']          title = post_view['title'] diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py new file mode 100644 index 000000000..fbcbe1f40 --- /dev/null +++ b/youtube_dl/extractor/npo.py @@ -0,0 +1,62 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    unified_strdate, +) + + +class NPOIE(InfoExtractor): +    IE_NAME = 'npo.nl' +    _VALID_URL = r'https?://www\.npo\.nl/[^/]+/[^/]+/(?P<id>[^/?]+)' + +    _TEST = { +        'url': 'http://www.npo.nl/nieuwsuur/22-06-2014/VPWON_1220719', +        'md5': '4b3f9c429157ec4775f2c9cb7b911016', +        'info_dict': { +            'id': 'VPWON_1220719', +            'ext': 'mp4', +            'title': 'Nieuwsuur', +            'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.', +            'upload_date': '20140622', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        metadata = self._download_json( +            'http://e.omroep.nl/metadata/aflevering/%s' % video_id, +            video_id, +            # We have to remove the javascript callback +            transform_source=lambda j: re.sub(r'parseMetadata\((.*?)\);\n//epc', r'\1', j) +        ) +        token_page = self._download_webpage( +            'http://ida.omroep.nl/npoplayer/i.js', +            video_id, +            note='Downloading token' +        ) +        token = self._search_regex(r'npoplayer.token = "(.+?)"', token_page, 'token') +        streams_info = self._download_json( +            'http://ida.omroep.nl/odi/?prid=%s&puboptions=h264_std&adaptive=yes&token=%s' % (video_id, token), +            video_id +        ) + +        stream_info = self._download_json( +            streams_info['streams'][0] + '&type=json', +            video_id, +            'Downloading stream info' +        ) + +        return { +            'id': video_id, +            'title': metadata['titel'], +            'ext': 'mp4', +            'url': stream_info['url'], +            'description': metadata['info'], +            'thumbnail': metadata['images'][-1]['url'], +            'upload_date': unified_strdate(metadata['gidsdatum']), +        } diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index cb4305349..ba3dd707f 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -35,7 +35,8 @@ class RaiIE(SubtitlesInfoExtractor):                  'description': '',                  'upload_date': '20140612',                  'duration': 1758, -            } +            }, +            'skip': 'Error 404',          },          {              'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html', diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py new file mode 100644 index 000000000..a4f8ce6c3 --- /dev/null +++ b/youtube_dl/extractor/soundgasm.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class SoundgasmIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<user>[0-9a-zA-Z_\-]+)/(?P<title>[0-9a-zA-Z_\-]+)' +    _TEST = { +        'url': 'http://soundgasm.net/u/ytdl/Piano-sample', +        'md5': '010082a2c802c5275bb00030743e75ad', +        'info_dict': { +            'id': '88abd86ea000cafe98f96321b23cc1206cbcbcc9', +            'ext': 'm4a', +            'title': 'ytdl_Piano-sample', +            'description': 'Royalty Free Sample Music' +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        display_id = mobj.group('title') +        audio_title = mobj.group('user') + '_' + mobj.group('title') +        webpage = self._download_webpage(url, display_id) +        audio_url = self._html_search_regex( +            r'(?s)m4a\:\s"([^"]+)"', webpage, 'audio URL') +        audio_id = re.split('\/|\.', audio_url)[-2] +        description = self._html_search_regex( +            r'(?s)<li>Description:\s(.*?)<\/li>', webpage, 'description', +            fatal=False) + +        return { +            'id': audio_id, +            'display_id': display_id, +            'url': audio_url, +            'title': audio_title, +            'description': description +        } diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index 36331529e..25b9864ad 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -20,13 +20,13 @@ class TagesschauIE(InfoExtractor):              'thumbnail': 're:^http:.*\.jpg$',          },      }, { -        'url': 'http://www.tagesschau.de/multimedia/video/video-196.html', -        'md5': '8aaa8bf3ae1ca2652309718c03019128', +        'url': 'http://www.tagesschau.de/multimedia/video/video-5964.html', +        'md5': '66652566900963a3f962333579eeffcf',          'info_dict': { -            'id': '196', +            'id': '5964',              'ext': 'mp4', -            'title': 'Ukraine-Konflikt: Klitschko in Kiew als Bürgermeister vereidigt', -            'description': 'md5:f22e4af75821d174fa6c977349682691', +            'title': 'Nahost-Konflikt: Israel bombadiert Ziele im Gazastreifen und Westjordanland', +            'description': 'md5:07bfc78c48eec3145ed4805299a1900a',              'thumbnail': 're:http://.*\.jpg',          },      }] diff --git a/youtube_dl/extractor/teachertube.py b/youtube_dl/extractor/teachertube.py index b3cb6bd76..2c2113b14 100644 --- a/youtube_dl/extractor/teachertube.py +++ b/youtube_dl/extractor/teachertube.py @@ -14,7 +14,7 @@ class TeacherTubeIE(InfoExtractor):      IE_NAME = 'teachertube'      IE_DESC = 'teachertube.com videos' -    _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(viewVideo\.php\?video_id=|music\.php\?music_id=)(?P<id>\d+)' +    _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(viewVideo\.php\?video_id=|music\.php\?music_id=|video/(?:[\da-z-]+-)?|audio/)(?P<id>\d+)'      _TESTS = [{          'url': 'http://www.teachertube.com/viewVideo.php?video_id=339997', @@ -45,6 +45,15 @@ class TeacherTubeIE(InfoExtractor):              'title': 'PER ASPERA AD ASTRA',              'description': 'RADIJSKA EMISIJA ZRAKOPLOVNE TEHNI?KE ?KOLE P',          }, +    }, { +        'url': 'http://www.teachertube.com/video/intro-video-schleicher-297790', +        'md5': '9c79fbb2dd7154823996fc28d4a26998', +        'info_dict': { +            'id': '297790', +            'ext': 'mp4', +            'title': 'Intro Video - Schleicher', +            'description': 'Intro Video - Why to flip, how flipping will', +        },      }]      def _real_extract(self, url): @@ -66,6 +75,7 @@ class TeacherTubeIE(InfoExtractor):          media_urls = re.findall(r'data-contenturl="([^"]+)"', webpage)          media_urls.extend(re.findall(r'var\s+filePath\s*=\s*"([^"]+)"', webpage)) +        media_urls.extend(re.findall(r'\'file\'\s*:\s*["\']([^"\']+)["\'],', webpage))          formats = [              { @@ -79,28 +89,36 @@ class TeacherTubeIE(InfoExtractor):          return {              'id': video_id,              'title': title, -            'thumbnail': self._html_search_regex(r'var\s+thumbUrl\s*=\s*"([^"]+)"', webpage, 'thumbnail'), +            'thumbnail': self._html_search_regex(r'\'image\'\s*:\s*["\']([^"\']+)["\']', webpage, 'thumbnail'),              'formats': formats,              'description': description,          } -class TeacherTubeClassroomIE(InfoExtractor): -    IE_NAME = 'teachertube:classroom' -    IE_DESC = 'teachertube.com online classrooms' +class TeacherTubeUserIE(InfoExtractor): +    IE_NAME = 'teachertube:user:collection' +    IE_DESC = 'teachertube.com user and collection videos' + +    _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(user/profile|collection)/(?P<user>[0-9a-zA-Z]+)/?' -    _VALID_URL = r'https?://(?:www\.)?teachertube\.com/view_classroom\.php\?user=(?P<user>[0-9a-zA-Z]+)' +    _MEDIA_RE = r'(?s)"sidebar_thumb_time">[0-9:]+</div>.+?<a href="(https?://(?:www\.)?teachertube\.com/(?:video|audio)/[^"]+)">'      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          user_id = mobj.group('user') -        rss = self._download_xml( -            'http://www.teachertube.com/rssclassroom.php?mode=user&username=%s' % user_id, -            user_id, 'Downloading classroom RSS') +        urls = [] +        webpage = self._download_webpage(url, user_id) +        urls.extend(re.findall(self._MEDIA_RE, webpage)) +         +        pages = re.findall(r'/ajax-user/user-videos/%s\?page=([0-9]+)' % user_id, webpage)[1:-1] +        for p in pages: +            more = 'http://www.teachertube.com/ajax-user/user-videos/%s?page=%s' % (user_id, p) +            webpage = self._download_webpage(more, user_id, 'Downloading page %s/%s' % (p, len(pages) + 1)) +            urls.extend(re.findall(self._MEDIA_RE, webpage))          entries = [] -        for url in rss.findall('.//{http://search.yahoo.com/mrss/}player'): -            entries.append(self.url_result(url.attrib['url'], 'TeacherTube')) +        for url in urls: +            entries.append(self.url_result(url, 'TeacherTube'))          return self.playlist_result(entries, user_id) diff --git a/youtube_dl/extractor/toypics.py b/youtube_dl/extractor/toypics.py index 34008afc6..0f389bd93 100644 --- a/youtube_dl/extractor/toypics.py +++ b/youtube_dl/extractor/toypics.py @@ -1,10 +1,13 @@ +# -*- coding:utf-8 -*- +from __future__ import unicode_literals +  from .common import InfoExtractor  import re  class ToypicsIE(InfoExtractor):      IE_DESC = 'Toypics user profile' -    _VALID_URL = r'http://videos\.toypics\.net/view/(?P<id>[0-9]+)/.*' +    _VALID_URL = r'https?://videos\.toypics\.net/view/(?P<id>[0-9]+)/.*'      _TEST = {          'url': 'http://videos.toypics.net/view/514/chancebulged,-2-1/',          'md5': '16e806ad6d6f58079d210fe30985e08b', @@ -61,7 +64,7 @@ class ToypicsUserIE(InfoExtractor):                  note='Downloading page %d/%d' % (n, page_count))              urls.extend(                  re.findall( -                    r'<p class="video-entry-title">\n\s*<a href="(http://videos.toypics.net/view/[^"]+)">', +                    r'<p class="video-entry-title">\s+<a href="(https?://videos.toypics.net/view/[^"]+)">',                      lpage))          return { diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 544369068..2882c1809 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*-  from __future__ import unicode_literals  import re @@ -10,14 +11,27 @@ from ..utils import (  class TumblrIE(InfoExtractor):      _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)($|/)' -    _TEST = { +    _TESTS = [{          'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', -        'file': '54196191430.mp4',          'md5': '479bb068e5b16462f5176a6828829767',          'info_dict': { -            "title": "tatiana maslany news" +            'id': '54196191430', +            'ext': 'mp4', +            'title': 'tatiana maslany news, Orphan Black || DVD extra - behind the scenes ↳...', +            'description': 'md5:dfac39636969fe6bf1caa2d50405f069', +            'thumbnail': 're:http://.*\.jpg',          } -    } +    }, { +        'url': 'http://5sostrum.tumblr.com/post/90208453769/yall-forgetting-the-greatest-keek-of-them-all', +        'md5': 'bf348ef8c0ef84fbf1cbd6fa6e000359', +        'info_dict': { +            'id': '90208453769', +            'ext': 'mp4', +            'title': '5SOS STRUM ;)', +            'description': 'md5:dba62ac8639482759c8eb10ce474586a', +            'thumbnail': 're:http://.*\.jpg', +        } +    }]      def _real_extract(self, url):          m_url = re.match(self._VALID_URL, url) @@ -48,6 +62,7 @@ class TumblrIE(InfoExtractor):          return [{'id': video_id,                   'url': video_url,                   'title': video_title, +                 'description': self._html_search_meta('description', webpage),                   'thumbnail': video_thumbnail,                   'ext': ext                   }] diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py index fb132aef6..a7953a7e7 100644 --- a/youtube_dl/extractor/veoh.py +++ b/youtube_dl/extractor/veoh.py @@ -49,6 +49,7 @@ class VeohIE(InfoExtractor):                  'description': 'md5:f5a11c51f8fb51d2315bca0937526891',                  'uploader': 'newsy-videos',              }, +            'skip': 'This video has been deleted.',          },      ] diff --git a/youtube_dl/extractor/videott.py b/youtube_dl/extractor/videott.py index b5034b02f..a647807d0 100644 --- a/youtube_dl/extractor/videott.py +++ b/youtube_dl/extractor/videott.py @@ -4,7 +4,10 @@ import re  import base64  from .common import InfoExtractor -from ..utils import unified_strdate +from ..utils import ( +    unified_strdate, +    int_or_none, +)  class VideoTtIE(InfoExtractor): @@ -50,9 +53,9 @@ class VideoTtIE(InfoExtractor):              'thumbnail': settings['config']['thumbnail'],              'upload_date': unified_strdate(video['added']),              'uploader': video['owner'], -            'view_count': int(video['view_count']), -            'comment_count': int(video['comment_count']), -            'like_count': int(video['liked']), -            'dislike_count': int(video['disliked']), +            'view_count': int_or_none(video['view_count']), +            'comment_count': None if video.get('comment_count') == '--' else int_or_none(video['comment_count']), +            'like_count': int_or_none(video['liked']), +            'dislike_count': int_or_none(video['disliked']),              'formats': formats,          }
\ No newline at end of file diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index fb082f364..918bd1098 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -16,7 +16,7 @@ from ..utils import (  class VKIE(InfoExtractor):      IE_NAME = 'vk.com' -    _VALID_URL = r'https?://vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|(?:videos.*?\?.*?z=)?video(?P<videoid>.*?)(?:\?|%2F|$))' +    _VALID_URL = r'https?://(?:m\.)?vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|(?:.+?\?.*?z=)?video(?P<videoid>.*?)(?:\?|%2F|$))'      _NETRC_MACHINE = 'vk'      _TESTS = [ @@ -27,7 +27,7 @@ class VKIE(InfoExtractor):                  'id': '162222515',                  'ext': 'flv',                  'title': 'ProtivoGunz - Хуёвая песня', -                'uploader': 'Noize MC', +                'uploader': 're:Noize MC.*',                  'duration': 195,              },          }, @@ -62,11 +62,47 @@ class VKIE(InfoExtractor):                  'id': '164049491',                  'ext': 'mp4',                  'uploader': 'Триллеры', -                'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]\u00a0', +                'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]',                  'duration': 8352,              },              'skip': 'Requires vk account credentials',          }, +        { +            'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a', +            'md5': 'd82c22e449f036282d1d3f7f4d276869', +            'info_dict': { +                'id': '166094326', +                'ext': 'mp4', +                'uploader': 'Киномания - лучшее из мира кино', +                'title': 'Запах женщины (1992)', +                'duration': 9392, +            }, +            'skip': 'Requires vk account credentials', +        }, +        { +            'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d', +            'md5': '4d7a5ef8cf114dfa09577e57b2993202', +            'info_dict': { +                'id': '168067957', +                'ext': 'mp4', +                'uploader': 'Киномания - лучшее из мира кино', +                'title': ' ', +                'duration': 7291, +            }, +            'skip': 'Requires vk account credentials', +        }, +        { +            'url': 'http://m.vk.com/video-43215063_169084319?list=125c627d1aa1cebb83&from=wall-43215063_2566540', +            'md5': '0c45586baa71b7cb1d0784ee3f4e00a6', +            'note': 'ivi.ru embed', +            'info_dict': { +                'id': '60690', +                'ext': 'mp4', +                'title': 'Книга Илая', +                'duration': 6771, +            }, +            'skip': 'Only works from Russia', +        },      ]      def _login(self): @@ -110,6 +146,16 @@ class VKIE(InfoExtractor):          if m_yt is not None:              self.to_screen('Youtube video detected')              return self.url_result(m_yt.group(1), 'Youtube') + +        m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.*?});', info_page) +        if m_opts: +            m_opts_url = re.search(r"url\s*:\s*'([^']+)", m_opts.group(1)) +            if m_opts_url: +                opts_url = m_opts_url.group(1) +                if opts_url.startswith('//'): +                    opts_url = 'http:' + opts_url +                return self.url_result(opts_url) +          data_json = self._search_regex(r'var vars = ({.*?});', info_page, 'vars')          data = json.loads(data_json) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index feeb44b45..f741ba540 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*-  from __future__ import unicode_literals  import re @@ -54,14 +55,14 @@ class WDRIE(InfoExtractor):              },          },          { -            'url': 'http://www.funkhauseuropa.de/av/audiogrenzenlosleckerbaklava101-audioplayer.html', -            'md5': 'cfff440d4ee64114083ac44676df5d15', +            'url': 'http://www.funkhauseuropa.de/av/audiosuepersongsoulbossanova100-audioplayer.html', +            'md5': '24e83813e832badb0a8d7d1ef9ef0691',              'info_dict': { -                'id': 'mdb-363068', +                'id': 'mdb-463528',                  'ext': 'mp3', -                'title': 'Grenzenlos lecker - Baklava', +                'title': 'Süpersong: Soul Bossa Nova',                  'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a', -                'upload_date': '20140311', +                'upload_date': '20140630',              },          },      ] @@ -127,9 +128,10 @@ class WDRMobileIE(InfoExtractor):          'info_dict': {              'title': '4283021',              'id': '421735', +            'ext': 'mp4',              'age_limit': 0,          }, -        '_skip': 'Will be depublicized shortly' +        'skip': 'Problems with loading data.'      }      def _real_extract(self, url): @@ -139,6 +141,7 @@ class WDRMobileIE(InfoExtractor):              'title': mobj.group('title'),              'age_limit': int(mobj.group('age_limit')),              'url': url, +            'ext': determine_ext(url),              'user_agent': 'mobile',          } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6bdea1c44..6123e1256 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -865,71 +865,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):      def _decrypt_signature(self, s, video_id, player_url, age_gate=False):          """Turn the encrypted s field into a working signature""" -        if player_url is not None: -            if player_url.startswith(u'//'): -                player_url = u'https:' + player_url -            try: -                player_id = (player_url, len(s)) -                if player_id not in self._player_cache: -                    func = self._extract_signature_function( -                        video_id, player_url, len(s) -                    ) -                    self._player_cache[player_id] = func -                func = self._player_cache[player_id] -                if self._downloader.params.get('youtube_print_sig_code'): -                    self._print_sig_code(func, len(s)) -                return func(s) -            except Exception: -                tb = traceback.format_exc() -                self._downloader.report_warning( -                    u'Automatic signature extraction failed: ' + tb) - -            self._downloader.report_warning( -                u'Warning: Falling back to static signature algorithm') - -        return self._static_decrypt_signature( -            s, video_id, player_url, age_gate) - -    def _static_decrypt_signature(self, s, video_id, player_url, age_gate): -        if age_gate: -            # The videos with age protection use another player, so the -            # algorithms can be different. -            if len(s) == 86: -                return s[2:63] + s[82] + s[64:82] + s[63] - -        if len(s) == 93: -            return s[86:29:-1] + s[88] + s[28:5:-1] -        elif len(s) == 92: -            return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83] -        elif len(s) == 91: -            return s[84:27:-1] + s[86] + s[26:5:-1] -        elif len(s) == 90: -            return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81] -        elif len(s) == 89: -            return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1] -        elif len(s) == 88: -            return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28] -        elif len(s) == 87: -            return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:] -        elif len(s) == 86: -            return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1] -        elif len(s) == 85: -            return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84] -        elif len(s) == 84: -            return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1] -        elif len(s) == 83: -            return s[80:63:-1] + s[0] + s[62:0:-1] + s[63] -        elif len(s) == 82: -            return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37] -        elif len(s) == 81: -            return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9] -        elif len(s) == 80: -            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80] -        elif len(s) == 79: -            return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9] +        if player_url is None: +            raise ExtractorError(u'Cannot decrypt signature without player_url') -        else: -            raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) +        if player_url.startswith(u'//'): +            player_url = u'https:' + player_url +        try: +            player_id = (player_url, len(s)) +            if player_id not in self._player_cache: +                func = self._extract_signature_function( +                    video_id, player_url, len(s) +                ) +                self._player_cache[player_id] = func +            func = self._player_cache[player_id] +            if self._downloader.params.get('youtube_print_sig_code'): +                self._print_sig_code(func, len(s)) +            return func(s) +        except Exception as e: +            tb = traceback.format_exc() +            raise ExtractorError( +                u'Automatic signature extraction failed: ' + tb, cause=e)      def _get_available_subtitles(self, video_id, webpage):          try: @@ -1698,14 +1653,14 @@ class YoutubeSearchURLIE(InfoExtractor):          webpage = self._download_webpage(url, query)          result_code = self._search_regex( -            r'(?s)<ol id="search-results"(.*?)</ol>', webpage, u'result HTML') +            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML')          part_codes = re.findall(              r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)          entries = []          for part_code in part_codes:              part_title = self._html_search_regex( -                r'(?s)title="([^"]+)"', part_code, 'item title', fatal=False) +                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)              part_url_snippet = self._html_search_regex(                  r'(?s)href="([^"]+)"', part_code, 'item URL')              part_url = compat_urlparse.urljoin( @@ -1825,10 +1780,21 @@ class YoutubeTruncatedURLIE(InfoExtractor):      IE_NAME = 'youtube:truncated_url'      IE_DESC = False  # Do not list      _VALID_URL = r'''(?x) -        (?:https?://)?[^/]+/watch\?(?:feature=[a-z_]+)?$| +        (?:https?://)?[^/]+/watch\?(?: +            feature=[a-z_]+| +            annotation_id=annotation_[^&]+ +        )?$|          (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$      ''' +    _TESTS = [{ +        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041', +        'only_matching': True, +    }, { +        'url': 'http://www.youtube.com/watch?', +        'only_matching': True, +    }] +      def _real_extract(self, url):          raise ExtractorError(              u'Did you forget to quote the URL? Remember that & is a meta ' diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 449482d3c..3bbb07704 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -59,7 +59,7 @@ class JSInterpreter(object):              if member == 'split("")':                  return list(val)              if member == 'join("")': -                return u''.join(val) +                return ''.join(val)              if member == 'length':                  return len(val)              if member == 'reverse()': @@ -99,7 +99,7 @@ class JSInterpreter(object):      def extract_function(self, funcname):          func_m = re.search( -            (r'(?:function %s|%s\s*=\s*function)' % ( +            (r'(?:function %s|[{;]%s\s*=\s*function)' % (                  re.escape(funcname), re.escape(funcname))) +              r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',              self.code) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b97e62ae9..09312e81a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -816,6 +816,9 @@ def unified_strdate(date_str):          '%d %b %Y',          '%B %d %Y',          '%b %d %Y', +        '%b %dst %Y %I:%M%p', +        '%b %dnd %Y %I:%M%p', +        '%b %dth %Y %I:%M%p',          '%Y-%m-%d',          '%d.%m.%Y',          '%d/%m/%Y', diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 77f6083d5..d6b05892c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.06.24.1' +__version__ = '2014.07.11' | 
