diff options
59 files changed, 1454 insertions, 302 deletions
| @@ -70,8 +70,9 @@ which means you can modify it, redistribute it or use it however you like.      --default-search PREFIX          Use this prefix for unqualified URLs. For                                       example "gvsearch2:" downloads two videos                                       from google videos for  youtube-dl "large -                                     apple". By default (with value "auto") -                                     youtube-dl guesses. +                                     apple". Use the value "auto" to let +                                     youtube-dl guess. The default value "error" +                                     just throws an error.      --ignore-config                  Do not read configuration files. When given                                       in the global configuration file /etc                                       /youtube-dl.conf: do not read the user diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 4b56137ce..2bc81f020 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -69,9 +69,6 @@ class TestAllURLsMatching(unittest.TestCase):      def test_youtube_show_matching(self):          self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show']) -    def test_youtube_truncated(self): -        self.assertMatch('http://www.youtube.com/watch?', ['youtube:truncated_url']) -      def test_youtube_search_matching(self):          self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])          self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) diff --git a/test/test_playlists.py b/test/test_playlists.py index 465b07b9e..3a88cf270 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -28,8 +28,9 @@ from youtube_dl.extractor import (      SoundcloudSetIE,      SoundcloudUserIE,      SoundcloudPlaylistIE, -    TeacherTubeClassroomIE, +    TeacherTubeUserIE,      LivestreamIE, +    LivestreamOriginalIE,      NHLVideocenterIE,      BambuserChannelIE,      BandcampAlbumIE, @@ -40,6 +41,7 @@ from youtube_dl.extractor import (      KhanAcademyIE,      EveryonesMixtapeIE,      RutubeChannelIE, +    RutubePersonIE,      GoogleSearchIE,      GenericIE,      TEDIE, @@ -114,10 +116,10 @@ class TestPlaylists(unittest.TestCase):      def test_ustream_channel(self):          dl = FakeYDL()          ie = UstreamChannelIE(dl) -        result = ie.extract('http://www.ustream.tv/channel/young-americans-for-liberty') +        result = ie.extract('http://www.ustream.tv/channel/channeljapan')          self.assertIsPlaylist(result) -        self.assertEqual(result['id'], '5124905') -        self.assertTrue(len(result['entries']) >= 6) +        self.assertEqual(result['id'], '10874166') +        self.assertTrue(len(result['entries']) >= 54)      def test_soundcloud_set(self):          dl = FakeYDL() @@ -135,6 +137,14 @@ class TestPlaylists(unittest.TestCase):          self.assertEqual(result['id'], '9615865')          self.assertTrue(len(result['entries']) >= 12) +    def test_soundcloud_likes(self): +        dl = FakeYDL() +        ie = SoundcloudUserIE(dl) +        result = ie.extract('https://soundcloud.com/the-concept-band/likes') +        self.assertIsPlaylist(result) +        self.assertEqual(result['id'], '9615865') +        self.assertTrue(len(result['entries']) >= 1) +      def test_soundcloud_playlist(self):          dl = FakeYDL()          ie = SoundcloudPlaylistIE(dl) @@ -154,6 +164,14 @@ class TestPlaylists(unittest.TestCase):          self.assertEqual(result['title'], 'TEDCity2.0 (English)')          self.assertTrue(len(result['entries']) >= 4) +    def test_livestreamoriginal_folder(self): +        dl = FakeYDL() +        ie = LivestreamOriginalIE(dl) +        result = ie.extract('https://www.livestream.com/newplay/folder?dirId=a07bf706-d0e4-4e75-a747-b021d84f2fd3') +        self.assertIsPlaylist(result) +        self.assertEqual(result['id'], 'a07bf706-d0e4-4e75-a747-b021d84f2fd3') +        self.assertTrue(len(result['entries']) >= 28) +      def test_nhl_videocenter(self):          dl = FakeYDL()          ie = NHLVideocenterIE(dl) @@ -256,10 +274,18 @@ class TestPlaylists(unittest.TestCase):      def test_rutube_channel(self):          dl = FakeYDL()          ie = RutubeChannelIE(dl) -        result = ie.extract('http://rutube.ru/tags/video/1409') +        result = ie.extract('http://rutube.ru/tags/video/1800/') +        self.assertIsPlaylist(result) +        self.assertEqual(result['id'], '1800') +        self.assertTrue(len(result['entries']) >= 68) + +    def test_rutube_person(self): +        dl = FakeYDL() +        ie = RutubePersonIE(dl) +        result = ie.extract('http://rutube.ru/video/person/313878/')          self.assertIsPlaylist(result) -        self.assertEqual(result['id'], '1409') -        self.assertTrue(len(result['entries']) >= 34) +        self.assertEqual(result['id'], '313878') +        self.assertTrue(len(result['entries']) >= 37)      def test_multiple_brightcove_videos(self):          # https://github.com/rg3/youtube-dl/issues/2283 @@ -361,13 +387,13 @@ class TestPlaylists(unittest.TestCase):              result['title'], 'Brace Yourself - Today\'s Weirdest News')          self.assertTrue(len(result['entries']) >= 10) -    def test_TeacherTubeClassroom(self): +    def test_TeacherTubeUser(self):          dl = FakeYDL() -        ie = TeacherTubeClassroomIE(dl) -        result = ie.extract('http://www.teachertube.com/view_classroom.php?user=rbhagwati2') +        ie = TeacherTubeUserIE(dl) +        result = ie.extract('http://www.teachertube.com/user/profile/rbhagwati2')          self.assertIsPlaylist(result)          self.assertEqual(result['id'], 'rbhagwati2') -        self.assertTrue(len(result['entries']) >= 20) +        self.assertTrue(len(result['entries']) >= 179)  if __name__ == '__main__':      unittest.main() diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 8417c55a6..8d46fe108 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -33,6 +33,12 @@ _TESTS = [          90,          u']\\[@?>=<;:/.-,+*)(\'&%$#"hZYXWVUTSRQPONMLKJIHGFEDCBAzyxwvutsrqponmlkjiagfedcb39876',      ), +    ( +        u'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', +        u'js', +        u'2ACFC7A61CA478CD21425E5A57EBD73DDC78E22A.2094302436B2D377D14A3BBA23022D023B8BC25AA', +        u'A52CB8B320D22032ABB3A41D773D2B6342034902.A22E87CDD37DBE75A5E52412DC874AC16A7CFCA2', +    ),  ] @@ -44,7 +50,7 @@ class TestSignature(unittest.TestCase):              os.mkdir(self.TESTDATA_DIR) -def make_tfunc(url, stype, sig_length, expected_sig): +def make_tfunc(url, stype, sig_input, expected_sig):      basename = url.rpartition('/')[2]      m = re.match(r'.*-([a-zA-Z0-9_-]+)\.[a-z]+$', basename)      assert m, '%r should follow URL format' % basename @@ -66,7 +72,9 @@ def make_tfunc(url, stype, sig_length, expected_sig):              with open(fn, 'rb') as testf:                  swfcode = testf.read()              func = ie._parse_sig_swf(swfcode) -        src_sig = compat_str(string.printable[:sig_length]) +        src_sig = ( +            compat_str(string.printable[:sig_input]) +            if isinstance(sig_input, int) else sig_input)          got_sig = func(src_sig)          self.assertEqual(got_sig, expected_sig) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index dc0ba986a..3dff723b8 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -993,6 +993,8 @@ class YoutubeDL(object):                          fd = get_suitable_downloader(info)(self, self.params)                          for ph in self._progress_hooks:                              fd.add_progress_hook(ph) +                        if self.params.get('verbose'): +                            self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))                          return fd.download(name, info)                      if info_dict.get('requested_formats') is not None:                          downloaded = [] diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 1e01432d2..31ed63fcc 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -59,6 +59,7 @@ __authors__  = (      'Adam Thalhammer',      'Georg Jähnig',      'Ralf Haring', +    'Koki Takahashi',  )  __license__ = 'Public Domain' @@ -269,7 +270,7 @@ def parseOpts(overrideArguments=None):      general.add_option(          '--default-search',          dest='default_search', metavar='PREFIX', -        help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for  youtube-dl "large apple". By default (with value "auto") youtube-dl guesses.') +        help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for  youtube-dl "large apple". Use the value "auto" to let youtube-dl guess. The default value "error" just throws an error.')      general.add_option(          '--ignore-config',          action='store_true', diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 4b7900b4f..44e1708ed 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -3,6 +3,7 @@ from .addanime import AddAnimeIE  from .aftonbladet import AftonbladetIE  from .anitube import AnitubeIE  from .aol import AolIE +from .allocine import AllocineIE  from .aparat import AparatIE  from .appletrailers import AppleTrailersIE  from .archiveorg import ArchiveOrgIE @@ -63,6 +64,7 @@ from .dailymotion import (  from .daum import DaumIE  from .dotsub import DotsubIE  from .dreisat import DreiSatIE +from .drtv import DRTVIE  from .defense import DefenseGouvFrIE  from .discovery import DiscoveryIE  from .divxstage import DivxStageIE @@ -109,6 +111,8 @@ from .gdcvault import GDCVaultIE  from .generic import GenericIE  from .googleplus import GooglePlusIE  from .googlesearch import GoogleSearchIE +from .gorillavid import GorillaVidIE +from .goshgay import GoshgayIE  from .hark import HarkIE  from .helsinki import HelsinkiIE  from .hentaistigma import HentaiStigmaIE @@ -146,7 +150,11 @@ from .ku6 import Ku6IE  from .la7 import LA7IE  from .lifenews import LifeNewsIE  from .liveleak import LiveLeakIE -from .livestream import LivestreamIE, LivestreamOriginalIE +from .livestream import ( +    LivestreamIE, +    LivestreamOriginalIE, +    LivestreamShortenerIE, +)  from .lynda import (      LyndaIE,      LyndaCourseIE @@ -164,11 +172,13 @@ from .mpora import MporaIE  from .mofosex import MofosexIE  from .mooshare import MooshareIE  from .morningstar import MorningstarIE +from .motherless import MotherlessIE  from .motorsport import MotorsportIE  from .moviezine import MoviezineIE  from .movshare import MovShareIE  from .mtv import (      MTVIE, +    MTVServicesEmbeddedIE,      MTVIggyIE,  )  from .musicplayon import MusicPlayOnIE @@ -195,6 +205,7 @@ from .normalboots import NormalbootsIE  from .novamov import NovaMovIE  from .nowness import NownessIE  from .nowvideo import NowVideoIE +from .npo import NPOIE  from .nrk import (      NRKIE,      NRKTVIE, @@ -216,6 +227,7 @@ from .pornotube import PornotubeIE  from .prosiebensat1 import ProSiebenSat1IE  from .pyvideo import PyvideoIE  from .radiofrance import RadioFranceIE +from .rai import RaiIE  from .rbmaradio import RBMARadioIE  from .redtube import RedTubeIE  from .ringtv import RingTVIE @@ -252,6 +264,7 @@ from .soundcloud import (      SoundcloudUserIE,      SoundcloudPlaylistIE  ) +from .soundgasm import SoundgasmIE  from .southparkstudios import (      SouthParkStudiosIE,      SouthparkDeIE, @@ -271,7 +284,7 @@ from .sztvhu import SztvHuIE  from .tagesschau import TagesschauIE  from .teachertube import (      TeacherTubeIE, -    TeacherTubeClassroomIE, +    TeacherTubeUserIE,  )  from .teachingchannel import TeachingChannelIE  from .teamcoco import TeamcocoIE @@ -331,6 +344,7 @@ from .vine import (  )  from .viki import VikiIE  from .vk import VKIE +from .vodlocker import VodlockerIE  from .vube import VubeIE  from .vuclip import VuClipIE  from .vulture import VultureIE @@ -345,6 +359,7 @@ from .weibo import WeiboIE  from .wimp import WimpIE  from .wistia import WistiaIE  from .worldstarhiphop import WorldStarHipHopIE +from .wrzuta import WrzutaIE  from .xbef import XBefIE  from .xhamster import XHamsterIE  from .xnxx import XNXXIE diff --git a/youtube_dl/extractor/allocine.py b/youtube_dl/extractor/allocine.py new file mode 100644 index 000000000..34f0cd49b --- /dev/null +++ b/youtube_dl/extractor/allocine.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..utils import ( +    compat_str, +    qualities, +    determine_ext, +) + + +class AllocineIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?allocine\.fr/(?P<typ>article|video|film)/(fichearticle_gen_carticle=|player_gen_cmedia=|fichefilm_gen_cfilm=)(?P<id>[0-9]+)(?:\.html)?' + +    _TESTS = [{ +        'url': 'http://www.allocine.fr/article/fichearticle_gen_carticle=18635087.html', +        'md5': '0c9fcf59a841f65635fa300ac43d8269', +        'info_dict': { +            'id': '19546517', +            'ext': 'mp4', +            'title': 'Astérix - Le Domaine des Dieux Teaser VF', +            'description': 'md5:4a754271d9c6f16c72629a8a993ee884', +            'thumbnail': 're:http://.*\.jpg', +        }, +    }, { +        'url': 'http://www.allocine.fr/video/player_gen_cmedia=19540403&cfilm=222257.html', +        'md5': 'd0cdce5d2b9522ce279fdfec07ff16e0', +        'info_dict': { +            'id': '19540403', +            'ext': 'mp4', +            'title': 'Planes 2 Bande-annonce VF', +            'description': 'md5:c4b1f7bd682a91de6491ada267ec0f4d', +            'thumbnail': 're:http://.*\.jpg', +        }, +    }, { +        'url': 'http://www.allocine.fr/film/fichefilm_gen_cfilm=181290.html', +        'md5': '101250fb127ef9ca3d73186ff22a47ce', +        'info_dict': { +            'id': '19544709', +            'ext': 'mp4', +            'title': 'Dragons 2 - Bande annonce finale VF', +            'description': 'md5:e74a4dc750894bac300ece46c7036490', +            'thumbnail': 're:http://.*\.jpg', +        }, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        typ = mobj.group('typ') +        display_id = mobj.group('id') + +        webpage = self._download_webpage(url, display_id) + +        if typ == 'film': +            video_id = self._search_regex(r'href="/video/player_gen_cmedia=([0-9]+).+"', webpage, 'video id') +        else: +            player = self._search_regex(r'data-player=\'([^\']+)\'>', webpage, 'data player') + +            player_data = json.loads(player) +            video_id = compat_str(player_data['refMedia']) + +        xml = self._download_xml('http://www.allocine.fr/ws/AcVisiondataV4.ashx?media=%s' % video_id, display_id) + +        video = xml.find('.//AcVisionVideo').attrib +        quality = qualities(['ld', 'md', 'hd']) + +        formats = [] +        for k, v in video.items(): +            if re.match(r'.+_path', k): +                format_id = k.split('_')[0] +                formats.append({ +                    'format_id': format_id, +                    'quality': quality(format_id), +                    'url': v, +                    'ext': determine_ext(v), +                }) + +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': video['videoTitle'], +            'thumbnail': self._og_search_thumbnail(webpage), +            'formats': formats, +            'description': self._og_search_description(webpage), +        } diff --git a/youtube_dl/extractor/anitube.py b/youtube_dl/extractor/anitube.py index 2b019daa9..31f0d417c 100644 --- a/youtube_dl/extractor/anitube.py +++ b/youtube_dl/extractor/anitube.py @@ -1,22 +1,24 @@ +from __future__ import unicode_literals +  import re  from .common import InfoExtractor  class AnitubeIE(InfoExtractor): -    IE_NAME = u'anitube.se' +    IE_NAME = 'anitube.se'      _VALID_URL = r'https?://(?:www\.)?anitube\.se/video/(?P<id>\d+)'      _TEST = { -        u'url': u'http://www.anitube.se/video/36621', -        u'md5': u'59d0eeae28ea0bc8c05e7af429998d43', -        u'file': u'36621.mp4', -        u'info_dict': { -            u'id': u'36621', -            u'ext': u'mp4', -            u'title': u'Recorder to Randoseru 01', +        'url': 'http://www.anitube.se/video/36621', +        'md5': '59d0eeae28ea0bc8c05e7af429998d43', +        'info_dict': { +            'id': '36621', +            'ext': 'mp4', +            'title': 'Recorder to Randoseru 01', +            'duration': 180.19,          }, -        u'skip': u'Blocked in the US', +        'skip': 'Blocked in the US',      }      def _real_extract(self, url): @@ -24,13 +26,15 @@ class AnitubeIE(InfoExtractor):          video_id = mobj.group('id')          webpage = self._download_webpage(url, video_id) -        key = self._html_search_regex(r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)', -                                      webpage, u'key') +        key = self._html_search_regex( +            r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)', webpage, 'key') -        config_xml = self._download_xml('http://www.anitube.se/nuevo/econfig.php?key=%s' % key, -                                                key) +        config_xml = self._download_xml( +            'http://www.anitube.se/nuevo/econfig.php?key=%s' % key, key)          video_title = config_xml.find('title').text +        thumbnail = config_xml.find('image').text +        duration = float(config_xml.find('duration').text)          formats = []          video_url = config_xml.find('file') @@ -49,5 +53,7 @@ class AnitubeIE(InfoExtractor):          return {              'id': video_id,              'title': video_title, +            'thumbnail': thumbnail, +            'duration': duration,              'formats': formats          } diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index c6d22c029..b36a4d46a 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -56,7 +56,18 @@ class ARDIE(InfoExtractor):                  raise ExtractorError('This video is only available after 20:00')          formats = [] +          for s in streams: +            if type(s['_stream']) == list: +                for index, url in enumerate(s['_stream'][::-1]): +                    quality = s['_quality'] + index +                    formats.append({ +                        'quality': quality, +                        'url': url, +                        'format_id': '%s-%s' % (determine_ext(url), quality) +                        }) +                continue +              format = {                  'quality': s['_quality'],                  'url': s['_stream'], diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index b528a9ec5..9591bad8a 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -39,7 +39,10 @@ class ArteTvIE(InfoExtractor):          formats = [{              'forma_id': q.attrib['quality'], -            'url': q.text, +            # The playpath starts at 'mp4:', if we don't manually +            # split the url, rtmpdump will incorrectly parse them +            'url': q.text.split('mp4:', 1)[0], +            'play_path': 'mp4:' + q.text.split('mp4:', 1)[1],              'ext': 'flv',              'quality': 2 if q.attrib['quality'] == 'hd' else 1,          } for q in config.findall('./urls/url')] @@ -111,7 +114,7 @@ class ArteTVPlus7IE(InfoExtractor):          if not formats:              # Some videos are only available in the 'Originalversion'              # they aren't tagged as being in French or German -            if all(f['versionCode'] == 'VO' for f in all_formats): +            if all(f['versionCode'] == 'VO' or f['versionCode'] == 'VA' for f in all_formats):                  formats = all_formats              else:                  raise ExtractorError(u'The formats list is empty') @@ -189,9 +192,10 @@ class ArteTVFutureIE(ArteTVPlus7IE):      _TEST = {          'url': 'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081',          'info_dict': { -            'id': '050940-003', +            'id': '5201',              'ext': 'mp4',              'title': 'Les champignons au secours de la planète', +            'upload_date': '20131101',          },      } diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 45067b944..0d5889f5d 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -13,7 +13,7 @@ from ..utils import (  class BiliBiliIE(InfoExtractor): -    _VALID_URL = r'http://www\.bilibili\.tv/video/av(?P<id>[0-9]+)/' +    _VALID_URL = r'http://www\.bilibili\.(?:tv|com)/video/av(?P<id>[0-9]+)/'      _TEST = {          'url': 'http://www.bilibili.tv/video/av1074402/', @@ -56,7 +56,7 @@ class BiliBiliIE(InfoExtractor):              'thumbnailUrl', video_code, 'thumbnail', fatal=False)          player_params = compat_parse_qs(self._html_search_regex( -            r'<iframe .*?class="player" src="https://secure.bilibili.tv/secure,([^"]+)"', +            r'<iframe .*?class="player" src="https://secure\.bilibili\.(?:tv|com)/secure,([^"]+)"',              webpage, 'player params'))          if 'cid' in player_params: diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index d4da08991..acfc4ad73 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -15,7 +15,7 @@ from ..utils import (  class BlipTVIE(SubtitlesInfoExtractor): -    _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P<id>\d+)|((?:play/|api\.swf#)(?P<lookup_id>[\da-zA-Z]+)))' +    _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P<id>\d+)|((?:play/|api\.swf#)(?P<lookup_id>[\da-zA-Z+]+)))'      _TESTS = [          { diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index b5b56ff00..993360714 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -17,15 +17,13 @@ class BRIE(InfoExtractor):      _TESTS = [          { -            'url': 'http://www.br.de/mediathek/video/anselm-gruen-114.html', -            'md5': 'c4f83cf0f023ba5875aba0bf46860df2', +            'url': 'http://www.br.de/mediathek/video/sendungen/heimatsound/heimatsound-festival-2014-trailer-100.html', +            'md5': '93556dd2bcb2948d9259f8670c516d59',              'info_dict': { -                'id': '2c8d81c5-6fb7-4a74-88d4-e768e5856532', +                'id': '25e279aa-1ffd-40fd-9955-5325bd48a53a',                  'ext': 'mp4', -                'title': 'Feiern und Verzichten', -                'description': 'Anselm Grün: Feiern und Verzichten', -                'uploader': 'BR/Birgit Baier', -                'upload_date': '20140301', +                'title': 'Am 1. und 2. August in Oberammergau', +                'description': 'md5:dfd224e5aa6819bc1fcbb7826a932021',              }          },          { diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 3c02c297a..419951b62 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -15,6 +15,7 @@ from ..utils import (      compat_urllib_request,      compat_parse_qs, +    determine_ext,      ExtractorError,      unsmuggle_url,      unescapeHTML, @@ -29,10 +30,11 @@ class BrightcoveIE(InfoExtractor):          {              # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/              'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001', -            'file': '2371591881001.mp4',              'md5': '5423e113865d26e40624dce2e4b45d95',              'note': 'Test Brightcove downloads and detection in GenericIE',              'info_dict': { +                'id': '2371591881001', +                'ext': 'mp4',                  'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',                  'uploader': '8TV',                  'description': 'md5:a950cc4285c43e44d763d036710cd9cd', @@ -41,8 +43,9 @@ class BrightcoveIE(InfoExtractor):          {              # From http://medianetwork.oracle.com/video/player/1785452137001              'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001', -            'file': '1785452137001.flv',              'info_dict': { +                'id': '1785452137001', +                'ext': 'flv',                  'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges',                  'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.',                  'uploader': 'Oracle', @@ -70,7 +73,20 @@ class BrightcoveIE(InfoExtractor):                  'description': 'md5:363109c02998fee92ec02211bd8000df',                  'uploader': 'National Ballet of Canada',              }, -        } +        }, +        { +            # test flv videos served by akamaihd.net +            # From http://www.redbull.com/en/bike/stories/1331655643987/replay-uci-dh-world-cup-2014-from-fort-william +            'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3ABC2996102916001&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D', +            # The md5 checksum changes on each download +            'info_dict': { +                'id': '2996102916001', +                'ext': 'flv', +                'title': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', +                'uploader': 'Red Bull TV', +                'description': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', +            }, +        },      ]      @classmethod @@ -187,7 +203,7 @@ class BrightcoveIE(InfoExtractor):          webpage = self._download_webpage(req, video_id)          self.report_extraction(video_id) -        info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json') +        info = self._search_regex(r'var experienceJSON = ({.*});', webpage, 'json')          info = json.loads(info)['data']          video_info = info['programmedContent']['videoPlayer']['mediaDTO']          video_info['_youtubedl_adServerURL'] = info.get('adServerURL') @@ -219,12 +235,26 @@ class BrightcoveIE(InfoExtractor):          renditions = video_info.get('renditions')          if renditions: -            renditions = sorted(renditions, key=lambda r: r['size']) -            info['formats'] = [{ -                'url': rend['defaultURL'], -                'height': rend.get('frameHeight'), -                'width': rend.get('frameWidth'), -            } for rend in renditions] +            formats = [] +            for rend in renditions: +                url = rend['defaultURL'] +                if rend['remote']: +                    # This type of renditions are served through akamaihd.net, +                    # but they don't use f4m manifests +                    url = url.replace('control/', '') + '?&v=3.3.0&fp=13&r=FEEFJ&g=RTSJIMBMPFPB' +                    ext = 'flv' +                else: +                    ext = determine_ext(url) +                size = rend.get('size') +                formats.append({ +                    'url': url, +                    'ext': ext, +                    'height': rend.get('frameHeight'), +                    'width': rend.get('frameWidth'), +                    'filesize': size if size != 0 else None, +                }) +            self._sort_formats(formats) +            info['formats'] = formats          elif video_info.get('FLVFullLengthURL') is not None:              info.update({                  'url': video_info['FLVFullLengthURL'], diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index ba4d73ab8..8af0abade 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -130,7 +130,7 @@ class ComedyCentralShowsIE(InfoExtractor):                  raise ExtractorError('Invalid redirected URL: ' + url)              if mobj.group('episode') == '':                  raise ExtractorError('Redirected URL is still not specific: ' + url) -            epTitle = mobj.group('episode').rpartition('/')[-1] +            epTitle = (mobj.group('episode') or mobj.group('videotitle')).rpartition('/')[-1]          mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)          if len(mMovieParams) == 0: diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 49e75405e..f1ed30704 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1,11 +1,12 @@  import base64  import hashlib  import json +import netrc  import os  import re  import socket  import sys -import netrc +import time  import xml.etree.ElementTree  from ..utils import ( @@ -459,6 +460,9 @@ class InfoExtractor(object):          if secure: regexes = self._og_regexes('video:secure_url') + regexes          return self._html_search_regex(regexes, html, name, **kargs) +    def _og_search_url(self, html, **kargs): +        return self._og_search_property('url', html, **kargs) +      def _html_search_meta(self, name, html, display_name=None, fatal=False):          if display_name is None:              display_name = name @@ -572,6 +576,13 @@ class InfoExtractor(object):          else:              return url +    def _sleep(self, timeout, video_id, msg_template=None): +        if msg_template is None: +            msg_template = u'%(video_id)s: Waiting for %(timeout)s seconds' +        msg = msg_template % {'video_id': video_id, 'timeout': timeout} +        self.to_screen(msg) +        time.sleep(timeout) +  class SearchInfoExtractor(InfoExtractor):      """ @@ -615,4 +626,3 @@ class SearchInfoExtractor(InfoExtractor):      @property      def SEARCH_KEY(self):          return self._SEARCH_KEY - diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 55216201f..5d0bfe454 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -150,7 +150,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):          return {              'id':       video_id,              'formats': formats, -            'uploader': info['owner_screenname'], +            'uploader': info['owner.screenname'],              'upload_date':  video_upload_date,              'title':    self._og_search_title(webpage),              'subtitles':    video_subtitles, diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index 2ae6ecc12..554df6735 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -7,9 +7,9 @@ from .common import InfoExtractor  class DiscoveryIE(InfoExtractor): -    _VALID_URL = r'http://dsc\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9\-]*)(.htm)?' +    _VALID_URL = r'http://www\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9\-]*)(.htm)?'      _TEST = { -        'url': 'http://dsc.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm', +        'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm',          'md5': 'e12614f9ee303a6ccef415cb0793eba2',          'info_dict': {              'id': '614784', diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py new file mode 100644 index 000000000..cdccfd376 --- /dev/null +++ b/youtube_dl/extractor/drtv.py @@ -0,0 +1,91 @@ +from __future__ import unicode_literals + +import re + +from .subtitles import SubtitlesInfoExtractor +from .common import ExtractorError +from ..utils import parse_iso8601 + + +class DRTVIE(SubtitlesInfoExtractor): +    _VALID_URL = r'http://(?:www\.)?dr\.dk/tv/se/[^/]+/(?P<id>[\da-z-]+)' + +    _TEST = { +        'url': 'http://www.dr.dk/tv/se/partiets-mand/partiets-mand-7-8', +        'md5': '4a7e1dd65cdb2643500a3f753c942f25', +        'info_dict': { +            'id': 'partiets-mand-7-8', +            'ext': 'mp4', +            'title': 'Partiets mand (7:8)', +            'description': 'md5:a684b90a8f9336cd4aab94b7647d7862', +            'timestamp': 1403047940, +            'upload_date': '20140617', +            'duration': 1299.040, +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        programcard = self._download_json( +            'http://www.dr.dk/mu/programcard/expanded/%s' % video_id, video_id, 'Downloading video JSON') + +        data = programcard['Data'][0] + +        title = data['Title'] +        description = data['Description'] +        timestamp = parse_iso8601(data['CreatedTime'][:-5]) + +        thumbnail = None +        duration = None + +        restricted_to_denmark = False + +        formats = [] +        subtitles = {} + +        for asset in data['Assets']: +            if asset['Kind'] == 'Image': +                thumbnail = asset['Uri'] +            elif asset['Kind'] == 'VideoResource': +                duration = asset['DurationInMilliseconds'] / 1000.0 +                restricted_to_denmark = asset['RestrictedToDenmark'] +                for link in asset['Links']: +                    target = link['Target'] +                    uri = link['Uri'] +                    formats.append({ +                        'url': uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43' if target == 'HDS' else uri, +                        'format_id': target, +                        'ext': link['FileFormat'], +                        'preference': -1 if target == 'HDS' else -2, +                    }) +                subtitles_list = asset.get('SubtitlesList') +                if isinstance(subtitles_list, list): +                    LANGS = { +                        'Danish': 'dk', +                    } +                    for subs in subtitles_list: +                        lang = subs['Language'] +                        subtitles[LANGS.get(lang, lang)] = subs['Uri'] + +        if not formats and restricted_to_denmark: +            raise ExtractorError( +                'Unfortunately, DR is not allowed to show this program outside Denmark.', expected=True) + +        self._sort_formats(formats) + +        if self._downloader.params.get('listsubtitles', False): +            self._list_available_subtitles(video_id, subtitles) +            return + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'timestamp': timestamp, +            'duration': duration, +            'formats': formats, +            'subtitles': self.extract_subtitles(video_id, subtitles), +        } diff --git a/youtube_dl/extractor/firstpost.py b/youtube_dl/extractor/firstpost.py index eccd8dde9..0993af1c9 100644 --- a/youtube_dl/extractor/firstpost.py +++ b/youtube_dl/extractor/firstpost.py @@ -15,6 +15,7 @@ class FirstpostIE(InfoExtractor):              'id': '1025403',              'ext': 'mp4',              'title': 'India to launch indigenous aircraft carrier INS Vikrant today', +            'description': 'md5:feef3041cb09724e0bdc02843348f5f4',          }      } @@ -22,13 +23,16 @@ class FirstpostIE(InfoExtractor):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id') +        page = self._download_webpage(url, video_id) +        title = self._html_search_meta('twitter:title', page, 'title') +        description = self._html_search_meta('twitter:description', page, 'title') +          data = self._download_xml(              'http://www.firstpost.com/getvideoxml-%s.xml' % video_id, video_id,              'Downloading video XML')          item = data.find('./playlist/item')          thumbnail = item.find('./image').text -        title = item.find('./title').text          formats = [              { @@ -42,6 +46,7 @@ class FirstpostIE(InfoExtractor):          return {              'id': video_id,              'title': title, +            'description': description,              'thumbnail': thumbnail,              'formats': formats,          } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 3105b47ab..f97b59845 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -278,6 +278,17 @@ class GenericIE(InfoExtractor):                  'skip_download': True,              }          }, +        # MTVSercices embed +        { +            'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too', +            'md5': '35727f82f58c76d996fc188f9755b0d5', +            'info_dict': { +                'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9', +                'ext': 'mp4', +                'title': 'Review', +                'description': 'Mario\'s life in the fast lane has never looked so good.', +            }, +        },      ]      def report_download_webpage(self, video_id): @@ -372,7 +383,7 @@ class GenericIE(InfoExtractor):          if not parsed_url.scheme:              default_search = self._downloader.params.get('default_search')              if default_search is None: -                default_search = 'auto_warning' +                default_search = 'error'              if default_search in ('auto', 'auto_warning'):                  if '/' in url: @@ -386,8 +397,13 @@ class GenericIE(InfoExtractor):                                  expected=True)                          else:                              self._downloader.report_warning( -                                'Falling back to youtube search for  %s . Set --default-search to "auto" to suppress this warning.' % url) +                                'Falling back to youtube search for  %s . Set --default-search "auto" to suppress this warning.' % url)                      return self.url_result('ytsearch:' + url) +            elif default_search == 'error': +                raise ExtractorError( +                    ('%r is not a valid URL. ' +                     'Set --default-search "ytseach" (or run  youtube-dl "ytsearch:%s" ) to search YouTube' +                    ) % (url, url), expected=True)              else:                  assert ':' in default_search                  return self.url_result(default_search + url) @@ -609,6 +625,11 @@ class GenericIE(InfoExtractor):          if mobj is not None:              return self.url_result(mobj.group('url'), 'VK') +        # Look for embedded ivi player +        mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage) +        if mobj is not None: +            return self.url_result(mobj.group('url'), 'Ivi') +          # Look for embedded Huffington Post player          mobj = re.search(              r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage) @@ -676,6 +697,14 @@ class GenericIE(InfoExtractor):              url = unescapeHTML(mobj.group('url'))              return self.url_result(url, ie='Vulture') +        # Look for embedded mtvservices player +        mobj = re.search( +            r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"', +            webpage) +        if mobj is not None: +            url = unescapeHTML(mobj.group('url')) +            return self.url_result(url, ie='MTVServicesEmbedded') +          # Start with something easy: JW Player in SWFObject          found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)          if not found: diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index cc29a7e5d..07d994b44 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -52,8 +52,7 @@ class GooglePlusIE(InfoExtractor):          # Extract title          # Get the first line for title -        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', -            webpage, 'title', default='NA') +        video_title = self._og_search_description(webpage).splitlines()[0]          # Step 2, Simulate clicking the image box to launch video          DOMAIN = 'https://plus.google.com/' diff --git a/youtube_dl/extractor/gorillavid.py b/youtube_dl/extractor/gorillavid.py new file mode 100644 index 000000000..50ef54cce --- /dev/null +++ b/youtube_dl/extractor/gorillavid.py @@ -0,0 +1,87 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    determine_ext, +    compat_urllib_parse, +    compat_urllib_request, +) + + +class GorillaVidIE(InfoExtractor): +    IE_DESC = 'GorillaVid.in and daclips.in' +    _VALID_URL = r'''(?x) +        https?://(?:www\.)? +            (?:daclips\.in|gorillavid\.in)/ +        (?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)? +    ''' + +    _TESTS = [{ +        'url': 'http://gorillavid.in/06y9juieqpmi', +        'md5': '5ae4a3580620380619678ee4875893ba', +        'info_dict': { +            'id': '06y9juieqpmi', +            'ext': 'flv', +            'title': 'Rebecca Black My Moment Official Music Video Reaction', +            'thumbnail': 're:http://.*\.jpg', +        }, +    }, { +        'url': 'http://gorillavid.in/embed-z08zf8le23c6-960x480.html', +        'md5': 'c9e293ca74d46cad638e199c3f3fe604', +        'info_dict': { +            'id': 'z08zf8le23c6', +            'ext': 'mp4', +            'title': 'Say something nice', +            'thumbnail': 're:http://.*\.jpg', +        }, +    }, { +        'url': 'http://daclips.in/3rso4kdn6f9m', +        'info_dict': { +            'id': '3rso4kdn6f9m', +            'ext': 'mp4', +            'title': 'Micro Pig piglets ready on 16th July 2009', +            'thumbnail': 're:http://.*\.jpg', +        }, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) + +        fields = dict(re.findall(r'''(?x)<input\s+ +            type="hidden"\s+ +            name="([^"]+)"\s+ +            (?:id="[^"]+"\s+)? +            value="([^"]*)" +            ''', webpage)) +         +        if fields['op'] == 'download1': +            post = compat_urllib_parse.urlencode(fields) + +            req = compat_urllib_request.Request(url, post) +            req.add_header('Content-type', 'application/x-www-form-urlencoded') + +            webpage = self._download_webpage(req, video_id, 'Downloading video page') + +        title = self._search_regex(r'style="z-index: [0-9]+;">([0-9a-zA-Z ]+)(?:-.+)?</span>', webpage, 'title') +        thumbnail = self._search_regex(r'image:\'(http[^\']+)\',', webpage, 'thumbnail') +        url = self._search_regex(r'file: \'(http[^\']+)\',', webpage, 'file url') + +        formats = [{ +            'format_id': 'sd', +            'url': url, +            'ext': determine_ext(url), +            'quality': 1, +        }] + +        return { +            'id': video_id, +            'title': title, +            'thumbnail': thumbnail, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/goshgay.py b/youtube_dl/extractor/goshgay.py new file mode 100644 index 000000000..7bca21ad0 --- /dev/null +++ b/youtube_dl/extractor/goshgay.py @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    compat_urlparse, +    str_to_int, +    ExtractorError, +) +import json + + +class GoshgayIE(InfoExtractor): +    _VALID_URL = r'^(?:https?://)www.goshgay.com/video(?P<id>\d+?)($|/)' +    _TEST = { +        'url': 'http://www.goshgay.com/video4116282', +        'md5': '268b9f3c3229105c57859e166dd72b03', +        'info_dict': { +            'id': '4116282', +            'ext': 'flv', +            'title': 'md5:089833a4790b5e103285a07337f245bf', +            'thumbnail': 're:http://.*\.jpg', +            'age_limit': 18, +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) +        title = self._search_regex(r'class="video-title"><h1>(.+?)<', webpage, 'title') + +        player_config = self._search_regex( +            r'(?s)jwplayer\("player"\)\.setup\(({.+?})\)', webpage, 'config settings') +        player_vars = json.loads(player_config.replace("'", '"')) +        width = str_to_int(player_vars.get('width')) +        height = str_to_int(player_vars.get('height')) +        config_uri = player_vars.get('config') + +        if config_uri is None: +            raise ExtractorError('Missing config URI') +        node = self._download_xml(config_uri, video_id, 'Downloading player config XML', +                                  errnote='Unable to download XML') +        if node is None: +            raise ExtractorError('Missing config XML') +        if node.tag != 'config': +            raise ExtractorError('Missing config attribute') +        fns = node.findall('file') +        imgs = node.findall('image') +        if len(fns) != 1: +            raise ExtractorError('Missing media URI') +        video_url = fns[0].text +        if len(imgs) < 1: +            thumbnail = None +        else: +            thumbnail = imgs[0].text + +        url_comp = compat_urlparse.urlparse(url) +        ref = "%s://%s%s" % (url_comp[0], url_comp[1], url_comp[2]) + +        return { +            'id': video_id, +            'url': video_url, +            'title': title, +            'width': width, +            'height': height, +            'thumbnail': thumbnail, +            'http_referer': ref, +            'age_limit': 18, +        } diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 528be1524..4027deb70 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -14,7 +14,7 @@ from ..utils import (  class IviIE(InfoExtractor):      IE_DESC = 'ivi.ru'      IE_NAME = 'ivi' -    _VALID_URL = r'https?://(?:www\.)?ivi\.ru/watch(?:/(?P<compilationid>[^/]+))?/(?P<videoid>\d+)' +    _VALID_URL = r'https?://(?:www\.)?ivi\.ru/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P<videoid>\d+)'      _TESTS = [          # Single movie diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py index 7a431a274..8d9491f23 100644 --- a/youtube_dl/extractor/lifenews.py +++ b/youtube_dl/extractor/lifenews.py @@ -24,7 +24,7 @@ class LifeNewsIE(InfoExtractor):              'ext': 'mp4',              'title': 'МВД разыскивает мужчин, оставивших в IKEA сумку с автоматом',              'description': 'Камеры наблюдения гипермаркета зафиксировали троих мужчин, спрятавших оружейный арсенал в камере хранения.', -            'thumbnail': 'http://lifenews.ru/static/posts/2014/1/126342/.video.jpg', +            'thumbnail': 're:http://.*\.jpg',              'upload_date': '20140130',          }      } diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 1dcd1fb2d..2c100d424 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals +  import re  import json @@ -6,31 +8,35 @@ from ..utils import (      compat_urllib_parse_urlparse,      compat_urlparse,      xpath_with_ns, +    compat_str, +    orderedSet,  )  class LivestreamIE(InfoExtractor): -    IE_NAME = u'livestream' +    IE_NAME = 'livestream'      _VALID_URL = r'http://new\.livestream\.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>\d+))?/?$'      _TEST = { -        u'url': u'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370', -        u'file': u'4719370.mp4', -        u'md5': u'0d2186e3187d185a04b3cdd02b828836', -        u'info_dict': { -            u'title': u'Live from Webster Hall NYC', -            u'upload_date': u'20121012', +        'url': 'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370', +        'md5': '53274c76ba7754fb0e8d072716f2292b', +        'info_dict': { +            'id': '4719370', +            'ext': 'mp4', +            'title': 'Live from Webster Hall NYC', +            'upload_date': '20121012',          }      }      def _extract_video_info(self, video_data):          video_url = video_data.get('progressive_url_hd') or video_data.get('progressive_url') -        return {'id': video_data['id'], -                'url': video_url, -                'ext': 'mp4', -                'title': video_data['caption'], -                'thumbnail': video_data['thumbnail_url'], -                'upload_date': video_data['updated_at'].replace('-','')[:8], -                } +        return { +            'id': compat_str(video_data['id']), +            'url': video_url, +            'ext': 'mp4', +            'title': video_data['caption'], +            'thumbnail': video_data['thumbnail_url'], +            'upload_date': video_data['updated_at'].replace('-', '')[:8], +        }      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) @@ -40,43 +46,43 @@ class LivestreamIE(InfoExtractor):          if video_id is None:              # This is an event page: -            config_json = self._search_regex(r'window.config = ({.*?});', -                webpage, u'window config') +            config_json = self._search_regex( +                r'window.config = ({.*?});', webpage, 'window config')              info = json.loads(config_json)['event']              videos = [self._extract_video_info(video_data['data']) -                for video_data in info['feed']['data'] if video_data['type'] == u'video'] +                for video_data in info['feed']['data'] if video_data['type'] == 'video']              return self.playlist_result(videos, info['id'], info['full_name'])          else: -            og_video = self._og_search_video_url(webpage, name=u'player url') +            og_video = self._og_search_video_url(webpage, 'player url')              query_str = compat_urllib_parse_urlparse(og_video).query              query = compat_urlparse.parse_qs(query_str)              api_url = query['play_url'][0].replace('.smil', '') -            info = json.loads(self._download_webpage(api_url, video_id, -                                                     u'Downloading video info')) +            info = json.loads(self._download_webpage( +                api_url, video_id, 'Downloading video info'))              return self._extract_video_info(info)  # The original version of Livestream uses a different system  class LivestreamOriginalIE(InfoExtractor): -    IE_NAME = u'livestream:original' -    _VALID_URL = r'https?://www\.livestream\.com/(?P<user>[^/]+)/video\?.*?clipId=(?P<id>.*?)(&|$)' +    IE_NAME = 'livestream:original' +    _VALID_URL = r'''(?x)https?://www\.livestream\.com/ +        (?P<user>[^/]+)/(?P<type>video|folder) +        (?:\?.*?Id=|/)(?P<id>.*?)(&|$) +        '''      _TEST = { -        u'url': u'http://www.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb', -        u'info_dict': { -            u'id': u'pla_8aa4a3f1-ba15-46a4-893b-902210e138fb', -            u'ext': u'flv', -            u'title': u'Spark 1 (BitCoin) with Cameron Winklevoss & Tyler Winklevoss of Winklevoss Capital', +        'url': 'http://www.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb', +        'info_dict': { +            'id': 'pla_8aa4a3f1-ba15-46a4-893b-902210e138fb', +            'ext': 'flv', +            'title': 'Spark 1 (BitCoin) with Cameron Winklevoss & Tyler Winklevoss of Winklevoss Capital',          }, -        u'params': { +        'params': {              # rtmp -            u'skip_download': True, +            'skip_download': True,          },      } -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') -        user = mobj.group('user') +    def _extract_video(self, user, video_id):          api_url = 'http://x{0}x.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id={1}'.format(user, video_id)          info = self._download_xml(api_url, video_id) @@ -84,7 +90,7 @@ class LivestreamOriginalIE(InfoExtractor):          ns = {'media': 'http://search.yahoo.com/mrss'}          thumbnail_url = item.find(xpath_with_ns('media:thumbnail', ns)).attrib['url']          # Remove the extension and number from the path (like 1.jpg) -        path = self._search_regex(r'(user-files/.+)_.*?\.jpg$', thumbnail_url, u'path') +        path = self._search_regex(r'(user-files/.+)_.*?\.jpg$', thumbnail_url, 'path')          return {              'id': video_id, @@ -94,3 +100,44 @@ class LivestreamOriginalIE(InfoExtractor):              'ext': 'flv',              'thumbnail': thumbnail_url,          } + +    def _extract_folder(self, url, folder_id): +        webpage = self._download_webpage(url, folder_id) +        urls = orderedSet(re.findall(r'<a href="(https?://livestre\.am/.*?)"', webpage)) + +        return { +            '_type': 'playlist', +            'id': folder_id, +            'entries': [{ +                '_type': 'url', +                'url': video_url, +            } for video_url in urls], +        } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        id = mobj.group('id') +        user = mobj.group('user') +        url_type = mobj.group('type') +        if url_type == 'folder': +            return self._extract_folder(url, id) +        else: +            return self._extract_video(user, id) + + +# The server doesn't support HEAD request, the generic extractor can't detect +# the redirection +class LivestreamShortenerIE(InfoExtractor): +    IE_NAME = 'livestream:shortener' +    IE_DESC = False  # Do not list +    _VALID_URL = r'https?://livestre\.am/(?P<id>.+)' + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        id = mobj.group('id') +        webpage = self._download_webpage(url, id) + +        return { +            '_type': 'url', +            'url': self._og_search_url(webpage), +        } diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py new file mode 100644 index 000000000..6229b2173 --- /dev/null +++ b/youtube_dl/extractor/motherless.py @@ -0,0 +1,87 @@ +from __future__ import unicode_literals + +import datetime +import re + +from .common import InfoExtractor +from ..utils import ( +    int_or_none, +    unified_strdate, +) + + +class MotherlessIE(InfoExtractor): +    _VALID_URL = r'http://(?:www\.)?motherless\.com/(?P<id>[A-Z0-9]+)' +    _TESTS = [ +        { +            'url': 'http://motherless.com/AC3FFE1', +            'md5': '5527fef81d2e529215dad3c2d744a7d9', +            'info_dict': { +                'id': 'AC3FFE1', +                'ext': 'flv', +                'title': 'Fucked in the ass while playing PS3', +                'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'], +                'upload_date': '20100913', +                'uploader_id': 'famouslyfuckedup', +                'thumbnail': 're:http://.*\.jpg', +                'age_limit': 18, +            } +        }, +        { +            'url': 'http://motherless.com/532291B', +            'md5': 'bc59a6b47d1f958e61fbd38a4d31b131', +            'info_dict': { +                'id': '532291B', +                'ext': 'mp4', +                'title': 'Amazing girl playing the omegle game, PERFECT!', +                'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen', 'game', 'hairy'], +                'upload_date': '20140622', +                'uploader_id': 'Sulivana7x', +                'thumbnail': 're:http://.*\.jpg', +                'age_limit': 18, +            } +        } +    ] + +    def _real_extract(self,url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) + +        title = self._html_search_regex(r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') +         +        video_url = self._html_search_regex(r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video_url') +        age_limit = self._rta_search(webpage) + +        view_count = self._html_search_regex(r'<strong>Views</strong>\s+([^<]+)<', webpage, 'view_count') +  +        upload_date = self._html_search_regex(r'<strong>Uploaded</strong>\s+([^<]+)<', webpage, 'upload_date') +        if 'Ago' in upload_date: +            days = int(re.search(r'([0-9]+)', upload_date).group(1)) +            upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d') +        else: +            upload_date = unified_strdate(upload_date) + +        like_count = self._html_search_regex(r'<strong>Favorited</strong>\s+([^<]+)<', webpage, 'like_count') + +        comment_count = webpage.count('class="media-comment-contents"') +        uploader_id = self._html_search_regex(r'"thumb-member-username">\s+<a href="/m/([^"]+)"', webpage, 'uploader_id') + +        categories = self._html_search_meta('keywords', webpage) +        if categories: +            categories = [cat.strip() for cat in categories.split(',')] + +        return { +            'id': video_id, +            'title': title, +            'upload_date': upload_date, +            'uploader_id': uploader_id, +            'thumbnail': self._og_search_thumbnail(webpage), +            'categories': categories, +            'view_count': int_or_none(view_count.replace(',', '')), +            'like_count': int_or_none(like_count.replace(',', '')), +            'comment_count': comment_count, +            'age_limit': age_limit, +            'url': video_url, +        } diff --git a/youtube_dl/extractor/mpora.py b/youtube_dl/extractor/mpora.py index 39d6feb98..387935d4d 100644 --- a/youtube_dl/extractor/mpora.py +++ b/youtube_dl/extractor/mpora.py @@ -28,7 +28,7 @@ class MporaIE(InfoExtractor):          webpage = self._download_webpage(url, video_id)          data_json = self._search_regex( -            r"new FM\.Player\('[^']+',\s*(\{.*?)\);\n", webpage, 'json') +            r"new FM\.Player\('[^']+',\s*(\{.*?)\).player;", webpage, 'json')          data = json.loads(data_json) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index e5ca41b40..af9490ccc 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -22,6 +22,7 @@ def _media_xml_tag(tag):  class MTVServicesInfoExtractor(InfoExtractor):      _MOBILE_TEMPLATE = None +      @staticmethod      def _id_from_uri(uri):          return uri.split(':')[-1] @@ -35,6 +36,9 @@ class MTVServicesInfoExtractor(InfoExtractor):          base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'          return base + m.group('finalid') +    def _get_feed_url(self, uri): +        return self._FEED_URL +      def _get_thumbnail_url(self, uri, itemdoc):          search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))          thumb_node = itemdoc.find(search_path) @@ -136,10 +140,10 @@ class MTVServicesInfoExtractor(InfoExtractor):      def _get_videos_info(self, uri):          video_id = self._id_from_uri(uri) +        feed_url = self._get_feed_url(uri)          data = compat_urllib_parse.urlencode({'uri': uri}) -          idoc = self._download_xml( -            self._FEED_URL + '?' + data, video_id, +            feed_url + '?' + data, video_id,              'Downloading info', transform_source=fix_xml_ampersands)          return [self._get_video_info(item) for item in idoc.findall('.//item')] @@ -160,6 +164,37 @@ class MTVServicesInfoExtractor(InfoExtractor):          return self._get_videos_info(mgid) +class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): +    IE_NAME = 'mtvservices:embedded' +    _VALID_URL = r'https?://media\.mtvnservices\.com/embed/(?P<mgid>.+?)(\?|/|$)' + +    _TEST = { +        # From http://www.thewrap.com/peter-dinklage-sums-up-game-of-thrones-in-45-seconds-video/ +        'url': 'http://media.mtvnservices.com/embed/mgid:uma:video:mtv.com:1043906/cp~vid%3D1043906%26uri%3Dmgid%3Auma%3Avideo%3Amtv.com%3A1043906', +        'md5': 'cb349b21a7897164cede95bd7bf3fbb9', +        'info_dict': { +            'id': '1043906', +            'ext': 'mp4', +            'title': 'Peter Dinklage Sums Up \'Game Of Thrones\' In 45 Seconds', +            'description': '"Sexy sexy sexy, stabby stabby stabby, beautiful language," says Peter Dinklage as he tries summarizing "Game of Thrones" in under a minute.', +        }, +    } + +    def _get_feed_url(self, uri): +        video_id = self._id_from_uri(uri) +        site_id = uri.replace(video_id, '') +        config_url = 'http://media.mtvnservices.com/pmt/e1/players/{0}/config.xml'.format(site_id) +        config_doc = self._download_xml(config_url, video_id) +        feed_node = config_doc.find('.//feed') +        feed_url = feed_node.text.strip().split('?')[0] +        return feed_url + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        mgid = mobj.group('mgid') +        return self._get_videos_info(mgid) + +  class MTVIE(MTVServicesInfoExtractor):      _VALID_URL = r'''(?x)^https?://          (?:(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$| diff --git a/youtube_dl/extractor/newstube.py b/youtube_dl/extractor/newstube.py index 2fd5b8f04..551bd4d7a 100644 --- a/youtube_dl/extractor/newstube.py +++ b/youtube_dl/extractor/newstube.py @@ -4,18 +4,19 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from ..utils import ExtractorError  class NewstubeIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?newstube\.ru/media/(?P<id>.+)'      _TEST = { -        'url': 'http://newstube.ru/media/na-korable-progress-prodolzhaetsya-testirovanie-sistemy-kurs', +        'url': 'http://www.newstube.ru/media/telekanal-cnn-peremestil-gorod-slavyansk-v-krym',          'info_dict': { -            'id': 'd156a237-a6e9-4111-a682-039995f721f1', +            'id': '728e0ef2-e187-4012-bac0-5a081fdcb1f6',              'ext': 'flv', -            'title': 'На корабле «Прогресс» продолжается тестирование системы «Курс»', -            'description': 'md5:d0cbe7b4a6f600552617e48548d5dc77', -            'duration': 20.04, +            'title': 'Телеканал CNN переместил город Славянск в Крым', +            'description': 'md5:419a8c9f03442bc0b0a794d689360335', +            'duration': 31.05,          },          'params': {              # rtmp download @@ -40,6 +41,10 @@ class NewstubeIE(InfoExtractor):          def ns(s):              return s.replace('/', '/%(ns)s') % {'ns': '{http://app1.newstube.ru/N2SiteWS/player.asmx}'} +        error_message = player.find(ns('./ErrorMessage')) +        if error_message is not None: +            raise ExtractorError('%s returned error: %s' % (self.IE_NAME, error_message.text), expected=True) +          session_id = player.find(ns('./SessionId')).text          media_info = player.find(ns('./Medias/MediaInfo'))          title = media_info.find(ns('./Name')).text diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 517a72561..c0c139b5d 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -8,10 +8,9 @@ from ..utils import (      compat_urllib_parse,      compat_urllib_request,      compat_urlparse, -    compat_str, - -    ExtractorError,      unified_strdate, +    parse_duration, +    int_or_none,  ) @@ -30,6 +29,7 @@ class NiconicoIE(InfoExtractor):              'uploader_id': '2698420',              'upload_date': '20131123',              'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org', +            'duration': 33,          },          'params': {              'username': 'ydl.niconico@gmail.com', @@ -37,17 +37,20 @@ class NiconicoIE(InfoExtractor):          },      } -    _VALID_URL = r'^https?://(?:www\.|secure\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$' +    _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/((?:[a-z]{2})?[0-9]+)'      _NETRC_MACHINE = 'niconico' +    # Determine whether the downloader uses authentication to download video +    _AUTHENTICATE = False      def _real_initialize(self): -        self._login() +        if self._downloader.params.get('username', None) is not None: +            self._AUTHENTICATE = True + +        if self._AUTHENTICATE: +            self._login()      def _login(self):          (username, password) = self._get_login_info() -        if username is None: -            # Login is required -            raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)          # Log in          login_form_strs = { @@ -79,44 +82,66 @@ class NiconicoIE(InfoExtractor):              'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,              note='Downloading video info page') -        # Get flv info -        flv_info_webpage = self._download_webpage( -            'http://flapi.nicovideo.jp/api/getflv?v=' + video_id, -            video_id, 'Downloading flv info') +        if self._AUTHENTICATE: +            # Get flv info +            flv_info_webpage = self._download_webpage( +                'http://flapi.nicovideo.jp/api/getflv?v=' + video_id, +                video_id, 'Downloading flv info') +        else: +            # Get external player info +            ext_player_info = self._download_webpage( +                'http://ext.nicovideo.jp/thumb_watch/' + video_id, video_id) +            thumb_play_key = self._search_regex( +                r'\'thumbPlayKey\'\s*:\s*\'(.*?)\'', ext_player_info, 'thumbPlayKey') + +            # Get flv info +            flv_info_data = compat_urllib_parse.urlencode({ +                'k': thumb_play_key, +                'v': video_id +            }) +            flv_info_request = compat_urllib_request.Request( +                'http://ext.nicovideo.jp/thumb_watch', flv_info_data, +                {'Content-Type': 'application/x-www-form-urlencoded'}) +            flv_info_webpage = self._download_webpage( +                flv_info_request, video_id, +                note='Downloading flv info', errnote='Unable to download flv info') +          video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0]          # Start extracting information -        video_title = video_info.find('.//title').text -        video_extension = video_info.find('.//movie_type').text -        video_format = video_extension.upper() -        video_thumbnail = video_info.find('.//thumbnail_url').text -        video_description = video_info.find('.//description').text -        video_uploader_id = video_info.find('.//user_id').text -        video_upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0]) -        video_view_count = video_info.find('.//view_counter').text -        video_webpage_url = video_info.find('.//watch_url').text - -        # uploader -        video_uploader = video_uploader_id -        url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id -        try: -            user_info = self._download_xml( -                url, video_id, note='Downloading user information') -            video_uploader = user_info.find('.//nickname').text -        except ExtractorError as err: -            self._downloader.report_warning('Unable to download user info webpage: %s' % compat_str(err)) +        title = video_info.find('.//title').text +        extension = video_info.find('.//movie_type').text +        video_format = extension.upper() +        thumbnail = video_info.find('.//thumbnail_url').text +        description = video_info.find('.//description').text +        upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0]) +        view_count = int_or_none(video_info.find('.//view_counter').text) +        comment_count = int_or_none(video_info.find('.//comment_num').text) +        duration = parse_duration(video_info.find('.//length').text) +        webpage_url = video_info.find('.//watch_url').text + +        if video_info.find('.//ch_id') is not None: +            uploader_id = video_info.find('.//ch_id').text +            uploader = video_info.find('.//ch_name').text +        elif video_info.find('.//user_id') is not None: +            uploader_id = video_info.find('.//user_id').text +            uploader = video_info.find('.//user_nickname').text +        else: +            uploader_id = uploader = None          return {              'id': video_id,              'url': video_real_url, -            'title': video_title, -            'ext': video_extension, +            'title': title, +            'ext': extension,              'format': video_format, -            'thumbnail': video_thumbnail, -            'description': video_description, -            'uploader': video_uploader, -            'upload_date': video_upload_date, -            'uploader_id': video_uploader_id, -            'view_count': video_view_count, -            'webpage_url': video_webpage_url, +            'thumbnail': thumbnail, +            'description': description, +            'uploader': uploader, +            'upload_date': upload_date, +            'uploader_id': uploader_id, +            'view_count': view_count, +            'comment_count': comment_count, +            'duration': duration, +            'webpage_url': webpage_url,          } diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py index c2e7b67c7..33daa0dec 100644 --- a/youtube_dl/extractor/ninegag.py +++ b/youtube_dl/extractor/ninegag.py @@ -47,7 +47,7 @@ class NineGagIE(InfoExtractor):          webpage = self._download_webpage(url, display_id)          post_view = json.loads(self._html_search_regex( -            r'var postView = new app\.PostView\({\s*post:\s*({.+?}),', webpage, 'post view')) +            r'var postView = new app\.PostView\({\s*post:\s*({.+?}),\s*posts:\s*prefetchedCurrentPost', webpage, 'post view'))          youtube_id = post_view['videoExternalId']          title = post_view['title'] diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index d451cd1bf..da203538d 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -35,7 +35,7 @@ class NocoIE(InfoExtractor):          video_id = mobj.group('id')          medias = self._download_json( -            'http://api.noco.tv/1.0/video/medias/%s' % video_id, video_id, 'Downloading video JSON') +            'https://api.noco.tv/1.0/video/medias/%s' % video_id, video_id, 'Downloading video JSON')          formats = [] @@ -43,7 +43,7 @@ class NocoIE(InfoExtractor):              format_id = fmt['quality_key']              file = self._download_json( -                'http://api.noco.tv/1.0/video/file/%s/fr/%s' % (format_id.lower(), video_id), +                'https://api.noco.tv/1.0/video/file/%s/fr/%s' % (format_id.lower(), video_id),                  video_id, 'Downloading %s video JSON' % format_id)              file_url = file['file'] @@ -71,7 +71,7 @@ class NocoIE(InfoExtractor):          self._sort_formats(formats)          show = self._download_json( -            'http://api.noco.tv/1.0/shows/show/%s' % video_id, video_id, 'Downloading show JSON')[0] +            'https://api.noco.tv/1.0/shows/show/%s' % video_id, video_id, 'Downloading show JSON')[0]          upload_date = unified_strdate(show['indexed'])          uploader = show['partner_name'] diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py new file mode 100644 index 000000000..fbcbe1f40 --- /dev/null +++ b/youtube_dl/extractor/npo.py @@ -0,0 +1,62 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    unified_strdate, +) + + +class NPOIE(InfoExtractor): +    IE_NAME = 'npo.nl' +    _VALID_URL = r'https?://www\.npo\.nl/[^/]+/[^/]+/(?P<id>[^/?]+)' + +    _TEST = { +        'url': 'http://www.npo.nl/nieuwsuur/22-06-2014/VPWON_1220719', +        'md5': '4b3f9c429157ec4775f2c9cb7b911016', +        'info_dict': { +            'id': 'VPWON_1220719', +            'ext': 'mp4', +            'title': 'Nieuwsuur', +            'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.', +            'upload_date': '20140622', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        metadata = self._download_json( +            'http://e.omroep.nl/metadata/aflevering/%s' % video_id, +            video_id, +            # We have to remove the javascript callback +            transform_source=lambda j: re.sub(r'parseMetadata\((.*?)\);\n//epc', r'\1', j) +        ) +        token_page = self._download_webpage( +            'http://ida.omroep.nl/npoplayer/i.js', +            video_id, +            note='Downloading token' +        ) +        token = self._search_regex(r'npoplayer.token = "(.+?)"', token_page, 'token') +        streams_info = self._download_json( +            'http://ida.omroep.nl/odi/?prid=%s&puboptions=h264_std&adaptive=yes&token=%s' % (video_id, token), +            video_id +        ) + +        stream_info = self._download_json( +            streams_info['streams'][0] + '&type=json', +            video_id, +            'Downloading stream info' +        ) + +        return { +            'id': video_id, +            'title': metadata['titel'], +            'ext': 'mp4', +            'url': stream_info['url'], +            'description': metadata['info'], +            'thumbnail': metadata['images'][-1]['url'], +            'upload_date': unified_strdate(metadata['gidsdatum']), +        } diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 1f066cf05..96f0ae1eb 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -72,14 +72,14 @@ class NRKIE(InfoExtractor):  class NRKTVIE(InfoExtractor): -    _VALID_URL = r'http://tv\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-z]{4}\d{8})' +    _VALID_URL = r'http://tv\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})'      _TESTS = [          { -            'url': 'http://tv.nrk.no/serie/20-spoersmaal-tv/muhh48000314/23-05-2014', +            'url': 'http://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',              'md5': '7b96112fbae1faf09a6f9ae1aff6cb84',              'info_dict': { -                'id': 'muhh48000314', +                'id': 'MUHH48000314',                  'ext': 'flv',                  'title': '20 spørsmål',                  'description': 'md5:bdea103bc35494c143c6a9acdd84887a', @@ -141,4 +141,4 @@ class NRKTVIE(InfoExtractor):              'upload_date': upload_date,              'duration': duration,              'formats': formats, -        }
\ No newline at end of file +        } diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index e4c4ad714..da64a1a7b 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -158,19 +158,19 @@ class ProSiebenSat1IE(InfoExtractor):      _CLIPID_REGEXES = [          r'"clip_id"\s*:\s+"(\d+)"',          r'clipid: "(\d+)"', -        r'clipId=(\d+)', +        r'clip[iI]d=(\d+)',      ]      _TITLE_REGEXES = [          r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>',          r'<header class="clearfix">\s*<h3>(.+?)</h3>',          r'<!-- start video -->\s*<h1>(.+?)</h1>', -        r'<div class="ep-femvideos-pi4-video-txt">\s*<h2>(.+?)</h2>', +        r'<h1 class="att-name">\s*(.+?)</h1>',      ]      _DESCRIPTION_REGEXES = [          r'<p itemprop="description">\s*(.+?)</p>',          r'<div class="videoDecription">\s*<p><strong>Beschreibung</strong>: (.+?)</p>',          r'<div class="g-plusone" data-size="medium"></div>\s*</div>\s*</header>\s*(.+?)\s*<footer>', -        r'<p>(.+?)</p>\s*<div class="ep-femvideos-pi4-video-footer">', +        r'<p class="att-description">\s*(.+?)\s*</p>',      ]      _UPLOAD_DATE_REGEXES = [          r'<meta property="og:published_time" content="(.+?)">', diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py new file mode 100644 index 000000000..ba3dd707f --- /dev/null +++ b/youtube_dl/extractor/rai.py @@ -0,0 +1,122 @@ +from __future__ import unicode_literals + +import re + +from .subtitles import SubtitlesInfoExtractor +from ..utils import ( +    parse_duration, +    unified_strdate, +    compat_urllib_parse, +) + + +class RaiIE(SubtitlesInfoExtractor): +    _VALID_URL = r'(?P<url>http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html)' +    _TESTS = [ +        { +            'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html', +            'md5': 'c064c0b2d09c278fb293116ef5d0a32d', +            'info_dict': { +                'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', +                'ext': 'mp4', +                'title': 'Report del 07/04/2014', +                'description': 'md5:f27c544694cacb46a078db84ec35d2d9', +                'upload_date': '20140407', +                'duration': 6160, +            } +        }, +        { +            'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', +            'md5': '8bb9c151924ce241b74dd52ef29ceafa', +            'info_dict': { +                'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', +                'ext': 'mp4', +                'title': 'TG PRIMO TEMPO', +                'description': '', +                'upload_date': '20140612', +                'duration': 1758, +            }, +            'skip': 'Error 404', +        }, +        { +            'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html', +            'md5': '35cf7c229f22eeef43e48b5cf923bef0', +            'info_dict': { +                'id': '7aafdea9-0e5d-49d5-88a6-7e65da67ae13', +                'ext': 'mp4', +                'title': 'State of the Net, Antonella La Carpia: regole virali', +                'description': 'md5:b0ba04a324126903e3da7763272ae63c', +                'upload_date': '20140613', +            }, +            'skip': 'Error 404', +        }, +        { +            'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-b4a49761-e0cc-4b14-8736-2729f6f73132-tg2.html', +            'md5': '35694f062977fe6619943f08ed935730', +            'info_dict': { +                'id': 'b4a49761-e0cc-4b14-8736-2729f6f73132', +                'ext': 'mp4', +                'title': 'Alluvione in Sardegna e dissesto idrogeologico', +                'description': 'Edizione delle ore 20:30 ', +            } +        }, +    ] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        media = self._download_json('%s?json' % mobj.group('url'), video_id, 'Downloading video JSON') + +        title = media.get('name') +        description = media.get('desc') +        thumbnail = media.get('image_300') or media.get('image_medium') or media.get('image') +        duration = parse_duration(media.get('length')) +        uploader = media.get('author') +        upload_date = unified_strdate(media.get('date')) + +        formats = [] + +        for format_id in ['wmv', 'm3u8', 'mediaUri', 'h264']: +            media_url = media.get(format_id) +            if not media_url: +                continue +            formats.append({ +                'url': media_url, +                'format_id': format_id, +                'ext': 'mp4', +            }) + +        if self._downloader.params.get('listsubtitles', False): +            page = self._download_webpage(url, video_id) +            self._list_available_subtitles(video_id, page) +            return + +        subtitles = {} +        if self._have_to_download_any_subtitles: +            page = self._download_webpage(url, video_id) +            subtitles = self.extract_subtitles(video_id, page) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'uploader': uploader, +            'upload_date': upload_date, +            'duration': duration, +            'formats': formats, +            'subtitles': subtitles, +        } + +    def _get_available_subtitles(self, video_id, webpage): +        subtitles = {} +        m = re.search(r'<meta name="closedcaption" content="(?P<captions>[^"]+)"', webpage) +        if m: +            captions = m.group('captions') +            STL_EXT = '.stl' +            SRT_EXT = '.srt' +            if captions.endswith(STL_EXT): +                captions = captions[:-len(STL_EXT)] + SRT_EXT +            subtitles['it'] = 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions) +        return subtitles
\ No newline at end of file diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 7aa100fb2..14ec9452d 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -255,7 +255,7 @@ class SoundcloudSetIE(SoundcloudIE):  class SoundcloudUserIE(SoundcloudIE): -    _VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)(/?(tracks/)?)?(\?.*)?$' +    _VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$'      IE_NAME = 'soundcloud:user'      # it's in tests/test_playlists.py @@ -264,24 +264,31 @@ class SoundcloudUserIE(SoundcloudIE):      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          uploader = mobj.group('user') +        resource = mobj.group('rsrc') +        if resource is None: +            resource = 'tracks' +        elif resource == 'likes': +            resource = 'favorites'          url = 'http://soundcloud.com/%s/' % uploader          resolv_url = self._resolv_url(url)          user = self._download_json(              resolv_url, uploader, 'Downloading user info') -        base_url = 'http://api.soundcloud.com/users/%s/tracks.json?' % uploader +        base_url = 'http://api.soundcloud.com/users/%s/%s.json?' % (uploader, resource)          entries = []          for i in itertools.count():              data = compat_urllib_parse.urlencode({                  'offset': i * 50, +                'limit': 50,                  'client_id': self._CLIENT_ID,              })              new_entries = self._download_json(                  base_url + data, uploader, 'Downloading track page %s' % (i + 1)) -            entries.extend(self._extract_info_dict(e, quiet=True) for e in new_entries) -            if len(new_entries) < 50: +            if len(new_entries) == 0: +                self.to_screen('%s: End page received' % uploader)                  break +            entries.extend(self._extract_info_dict(e, quiet=True) for e in new_entries)          return {              '_type': 'playlist', diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py new file mode 100644 index 000000000..a4f8ce6c3 --- /dev/null +++ b/youtube_dl/extractor/soundgasm.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class SoundgasmIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<user>[0-9a-zA-Z_\-]+)/(?P<title>[0-9a-zA-Z_\-]+)' +    _TEST = { +        'url': 'http://soundgasm.net/u/ytdl/Piano-sample', +        'md5': '010082a2c802c5275bb00030743e75ad', +        'info_dict': { +            'id': '88abd86ea000cafe98f96321b23cc1206cbcbcc9', +            'ext': 'm4a', +            'title': 'ytdl_Piano-sample', +            'description': 'Royalty Free Sample Music' +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        display_id = mobj.group('title') +        audio_title = mobj.group('user') + '_' + mobj.group('title') +        webpage = self._download_webpage(url, display_id) +        audio_url = self._html_search_regex( +            r'(?s)m4a\:\s"([^"]+)"', webpage, 'audio URL') +        audio_id = re.split('\/|\.', audio_url)[-2] +        description = self._html_search_regex( +            r'(?s)<li>Description:\s(.*?)<\/li>', webpage, 'description', +            fatal=False) + +        return { +            'id': audio_id, +            'display_id': display_id, +            'url': audio_url, +            'title': audio_title, +            'description': description +        } diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 9156d7faf..340a38440 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -1,3 +1,4 @@ +# encoding: utf-8  from __future__ import unicode_literals  import re @@ -9,18 +10,33 @@ class SpiegelIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'      _TESTS = [{          'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', -        'file': '1259285.mp4',          'md5': '2c2754212136f35fb4b19767d242f66e',          'info_dict': { +            'id': '1259285', +            'ext': 'mp4',              'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv', +            'description': 'md5:8029d8310232196eb235d27575a8b9f4', +            'duration': 49,          }, -    }, -    { +    }, {          'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', -        'file': '1309159.mp4',          'md5': 'f2cdf638d7aa47654e251e1aee360af1',          'info_dict': { +            'id': '1309159', +            'ext': 'mp4',              'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers', +            'description': 'md5:c2322b65e58f385a820c10fa03b2d088', +            'duration': 983, +        }, +    }, { +        'url': 'http://www.spiegel.de/video/johann-westhauser-videobotschaft-des-hoehlenforschers-video-1502367.html', +        'md5': '54f58ba0e752e3c07bc2a26222dd0acf', +        'info_dict': { +            'id': '1502367', +            'ext': 'mp4', +            'title': 'Videobotschaft: Höhlenforscher Westhauser dankt seinen Rettern', +            'description': 'md5:c6f1ec11413ebd1088b6813943e5fc91', +            'duration': 42,          },      }] @@ -30,18 +46,20 @@ class SpiegelIE(InfoExtractor):          webpage = self._download_webpage(url, video_id) -        video_title = self._html_search_regex( +        title = self._html_search_regex(              r'<div class="module-title">(.*?)</div>', webpage, 'title') +        description = self._html_search_meta('description', webpage, 'description') + +        base_url = self._search_regex( +            r'var\s+server\s*=\s*"([^"]+)\"', webpage, 'server URL') -        xml_url = 'http://video2.spiegel.de/flash/' + video_id + '.xml' -        idoc = self._download_xml( -            xml_url, video_id, -            note='Downloading XML', errnote='Failed to download XML') +        xml_url = base_url + video_id + '.xml' +        idoc = self._download_xml(xml_url, video_id)          formats = [              {                  'format_id': n.tag.rpartition('type')[2], -                'url': 'http://video2.spiegel.de/flash/' + n.find('./filename').text, +                'url': base_url + n.find('./filename').text,                  'width': int(n.find('./width').text),                  'height': int(n.find('./height').text),                  'abr': int(n.find('./audiobitrate').text), @@ -59,7 +77,8 @@ class SpiegelIE(InfoExtractor):          return {              'id': video_id, -            'title': video_title, +            'title': title, +            'description': description,              'duration': duration,              'formats': formats,          } diff --git a/youtube_dl/extractor/steam.py b/youtube_dl/extractor/steam.py index 1d8d57224..af689e2c2 100644 --- a/youtube_dl/extractor/steam.py +++ b/youtube_dl/extractor/steam.py @@ -53,7 +53,7 @@ class SteamIE(InfoExtractor):              'ext': 'mp4',              'upload_date': '20140329',              'title': 'FRONTIERS - Final Greenlight Trailer', -            'description': "The final trailer for the Steam Greenlight launch. Hooray, progress! Here's the official Greenlight page: http://steamcommunity.com/sharedfiles/filedetails/?id=242472205", +            'description': 'md5:6df4fe8dd494ae811869672b0767e025',              'uploader': 'AAD Productions',              'uploader_id': 'AtomicAgeDogGames',          } diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index 36331529e..25b9864ad 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -20,13 +20,13 @@ class TagesschauIE(InfoExtractor):              'thumbnail': 're:^http:.*\.jpg$',          },      }, { -        'url': 'http://www.tagesschau.de/multimedia/video/video-196.html', -        'md5': '8aaa8bf3ae1ca2652309718c03019128', +        'url': 'http://www.tagesschau.de/multimedia/video/video-5964.html', +        'md5': '66652566900963a3f962333579eeffcf',          'info_dict': { -            'id': '196', +            'id': '5964',              'ext': 'mp4', -            'title': 'Ukraine-Konflikt: Klitschko in Kiew als Bürgermeister vereidigt', -            'description': 'md5:f22e4af75821d174fa6c977349682691', +            'title': 'Nahost-Konflikt: Israel bombadiert Ziele im Gazastreifen und Westjordanland', +            'description': 'md5:07bfc78c48eec3145ed4805299a1900a',              'thumbnail': 're:http://.*\.jpg',          },      }] diff --git a/youtube_dl/extractor/teachertube.py b/youtube_dl/extractor/teachertube.py index 4d9666c6b..2c2113b14 100644 --- a/youtube_dl/extractor/teachertube.py +++ b/youtube_dl/extractor/teachertube.py @@ -14,7 +14,7 @@ class TeacherTubeIE(InfoExtractor):      IE_NAME = 'teachertube'      IE_DESC = 'teachertube.com videos' -    _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(viewVideo\.php\?video_id=|music\.php\?music_id=)(?P<id>\d+)' +    _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(viewVideo\.php\?video_id=|music\.php\?music_id=|video/(?:[\da-z-]+-)?|audio/)(?P<id>\d+)'      _TESTS = [{          'url': 'http://www.teachertube.com/viewVideo.php?video_id=339997', @@ -22,8 +22,8 @@ class TeacherTubeIE(InfoExtractor):          'info_dict': {              'id': '339997',              'ext': 'mp4', -            'title': 'Measures of dispersion from a frequency table_x264', -            'description': 'md5:a3e9853487185e9fcd7181a07164650b', +            'title': 'Measures of dispersion from a frequency table', +            'description': 'Measures of dispersion from a frequency table',              'thumbnail': 're:http://.*\.jpg',          },      }, { @@ -33,7 +33,7 @@ class TeacherTubeIE(InfoExtractor):              'id': '340064',              'ext': 'mp4',              'title': 'How to Make Paper Dolls _ Paper Art Projects', -            'description': 'md5:2ca52b20cd727773d1dc418b3d6bd07b', +            'description': 'Learn how to make paper dolls in this simple',              'thumbnail': 're:http://.*\.jpg',          },      }, { @@ -43,7 +43,16 @@ class TeacherTubeIE(InfoExtractor):              'id': '8805',              'ext': 'mp3',              'title': 'PER ASPERA AD ASTRA', -            'description': 'RADIJSKA EMISIJA ZRAKOPLOVNE TEHNIČKE ŠKOLE PER ASPERA AD ASTRA', +            'description': 'RADIJSKA EMISIJA ZRAKOPLOVNE TEHNI?KE ?KOLE P', +        }, +    }, { +        'url': 'http://www.teachertube.com/video/intro-video-schleicher-297790', +        'md5': '9c79fbb2dd7154823996fc28d4a26998', +        'info_dict': { +            'id': '297790', +            'ext': 'mp4', +            'title': 'Intro Video - Schleicher', +            'description': 'Intro Video - Why to flip, how flipping will',          },      }] @@ -53,9 +62,20 @@ class TeacherTubeIE(InfoExtractor):          webpage = self._download_webpage(url, video_id) +        title = self._html_search_meta('title', webpage, 'title') +        TITLE_SUFFIX = ' - TeacherTube' +        if title.endswith(TITLE_SUFFIX): +            title = title[:-len(TITLE_SUFFIX)].strip() + +        description = self._html_search_meta('description', webpage, 'description') +        if description: +            description = description.strip() +          quality = qualities(['mp3', 'flv', 'mp4']) -        _, media_urls = zip(*re.findall(r'([\'"])file\1\s*:\s*"([^"]+)"', webpage)) +        media_urls = re.findall(r'data-contenturl="([^"]+)"', webpage) +        media_urls.extend(re.findall(r'var\s+filePath\s*=\s*"([^"]+)"', webpage)) +        media_urls.extend(re.findall(r'\'file\'\s*:\s*["\']([^"\']+)["\'],', webpage))          formats = [              { @@ -68,28 +88,37 @@ class TeacherTubeIE(InfoExtractor):          return {              'id': video_id, -            'title': self._og_search_title(webpage), -            'thumbnail': self._og_search_thumbnail(webpage), +            'title': title, +            'thumbnail': self._html_search_regex(r'\'image\'\s*:\s*["\']([^"\']+)["\']', webpage, 'thumbnail'),              'formats': formats, -            'description': self._og_search_description(webpage), +            'description': description,          } -class TeacherTubeClassroomIE(InfoExtractor): -    IE_NAME = 'teachertube:classroom' -    IE_DESC = 'teachertube.com online classrooms' +class TeacherTubeUserIE(InfoExtractor): +    IE_NAME = 'teachertube:user:collection' +    IE_DESC = 'teachertube.com user and collection videos' + +    _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(user/profile|collection)/(?P<user>[0-9a-zA-Z]+)/?' -    _VALID_URL = r'https?://(?:www\.)?teachertube\.com/view_classroom\.php\?user=(?P<user>[0-9a-zA-Z]+)' +    _MEDIA_RE = r'(?s)"sidebar_thumb_time">[0-9:]+</div>.+?<a href="(https?://(?:www\.)?teachertube\.com/(?:video|audio)/[^"]+)">'      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          user_id = mobj.group('user') -        rss = self._download_xml('http://www.teachertube.com/rssclassroom.php?mode=user&username=%s' % user_id, -                                      user_id, 'Downloading classroom RSS') +        urls = [] +        webpage = self._download_webpage(url, user_id) +        urls.extend(re.findall(self._MEDIA_RE, webpage)) +         +        pages = re.findall(r'/ajax-user/user-videos/%s\?page=([0-9]+)' % user_id, webpage)[1:-1] +        for p in pages: +            more = 'http://www.teachertube.com/ajax-user/user-videos/%s?page=%s' % (user_id, p) +            webpage = self._download_webpage(more, user_id, 'Downloading page %s/%s' % (p, len(pages) + 1)) +            urls.extend(re.findall(self._MEDIA_RE, webpage))          entries = [] -        for url in rss.findall('.//{http://search.yahoo.com/mrss/}player'): -            entries.append(self.url_result(url.attrib['url'], 'TeacherTube')) +        for url in urls: +            entries.append(self.url_result(url, 'TeacherTube'))          return self.playlist_result(entries, user_id) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index d260c91c2..bce32a873 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -27,7 +27,7 @@ class TEDIE(SubtitlesInfoExtractor):          '''      _TESTS = [{          'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html', -        'md5': '4ea1dada91e4174b53dac2bb8ace429d', +        'md5': 'fc94ac279feebbce69f21c0c6ee82810',          'info_dict': {              'id': '102',              'ext': 'mp4', diff --git a/youtube_dl/extractor/toypics.py b/youtube_dl/extractor/toypics.py index 34008afc6..0f389bd93 100644 --- a/youtube_dl/extractor/toypics.py +++ b/youtube_dl/extractor/toypics.py @@ -1,10 +1,13 @@ +# -*- coding:utf-8 -*- +from __future__ import unicode_literals +  from .common import InfoExtractor  import re  class ToypicsIE(InfoExtractor):      IE_DESC = 'Toypics user profile' -    _VALID_URL = r'http://videos\.toypics\.net/view/(?P<id>[0-9]+)/.*' +    _VALID_URL = r'https?://videos\.toypics\.net/view/(?P<id>[0-9]+)/.*'      _TEST = {          'url': 'http://videos.toypics.net/view/514/chancebulged,-2-1/',          'md5': '16e806ad6d6f58079d210fe30985e08b', @@ -61,7 +64,7 @@ class ToypicsUserIE(InfoExtractor):                  note='Downloading page %d/%d' % (n, page_count))              urls.extend(                  re.findall( -                    r'<p class="video-entry-title">\n\s*<a href="(http://videos.toypics.net/view/[^"]+)">', +                    r'<p class="video-entry-title">\s+<a href="(https?://videos.toypics.net/view/[^"]+)">',                      lpage))          return { diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 544369068..2882c1809 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*-  from __future__ import unicode_literals  import re @@ -10,14 +11,27 @@ from ..utils import (  class TumblrIE(InfoExtractor):      _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)($|/)' -    _TEST = { +    _TESTS = [{          'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', -        'file': '54196191430.mp4',          'md5': '479bb068e5b16462f5176a6828829767',          'info_dict': { -            "title": "tatiana maslany news" +            'id': '54196191430', +            'ext': 'mp4', +            'title': 'tatiana maslany news, Orphan Black || DVD extra - behind the scenes ↳...', +            'description': 'md5:dfac39636969fe6bf1caa2d50405f069', +            'thumbnail': 're:http://.*\.jpg',          } -    } +    }, { +        'url': 'http://5sostrum.tumblr.com/post/90208453769/yall-forgetting-the-greatest-keek-of-them-all', +        'md5': 'bf348ef8c0ef84fbf1cbd6fa6e000359', +        'info_dict': { +            'id': '90208453769', +            'ext': 'mp4', +            'title': '5SOS STRUM ;)', +            'description': 'md5:dba62ac8639482759c8eb10ce474586a', +            'thumbnail': 're:http://.*\.jpg', +        } +    }]      def _real_extract(self, url):          m_url = re.match(self._VALID_URL, url) @@ -48,6 +62,7 @@ class TumblrIE(InfoExtractor):          return [{'id': video_id,                   'url': video_url,                   'title': video_title, +                 'description': self._html_search_meta('description', webpage),                   'thumbnail': video_thumbnail,                   'ext': ext                   }] diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py index fb132aef6..a7953a7e7 100644 --- a/youtube_dl/extractor/veoh.py +++ b/youtube_dl/extractor/veoh.py @@ -49,6 +49,7 @@ class VeohIE(InfoExtractor):                  'description': 'md5:f5a11c51f8fb51d2315bca0937526891',                  'uploader': 'newsy-videos',              }, +            'skip': 'This video has been deleted.',          },      ] diff --git a/youtube_dl/extractor/videott.py b/youtube_dl/extractor/videott.py index b5034b02f..a647807d0 100644 --- a/youtube_dl/extractor/videott.py +++ b/youtube_dl/extractor/videott.py @@ -4,7 +4,10 @@ import re  import base64  from .common import InfoExtractor -from ..utils import unified_strdate +from ..utils import ( +    unified_strdate, +    int_or_none, +)  class VideoTtIE(InfoExtractor): @@ -50,9 +53,9 @@ class VideoTtIE(InfoExtractor):              'thumbnail': settings['config']['thumbnail'],              'upload_date': unified_strdate(video['added']),              'uploader': video['owner'], -            'view_count': int(video['view_count']), -            'comment_count': int(video['comment_count']), -            'like_count': int(video['liked']), -            'dislike_count': int(video['disliked']), +            'view_count': int_or_none(video['view_count']), +            'comment_count': None if video.get('comment_count') == '--' else int_or_none(video['comment_count']), +            'like_count': int_or_none(video['liked']), +            'dislike_count': int_or_none(video['disliked']),              'formats': formats,          }
\ No newline at end of file diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index fb082f364..918bd1098 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -16,7 +16,7 @@ from ..utils import (  class VKIE(InfoExtractor):      IE_NAME = 'vk.com' -    _VALID_URL = r'https?://vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|(?:videos.*?\?.*?z=)?video(?P<videoid>.*?)(?:\?|%2F|$))' +    _VALID_URL = r'https?://(?:m\.)?vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|(?:.+?\?.*?z=)?video(?P<videoid>.*?)(?:\?|%2F|$))'      _NETRC_MACHINE = 'vk'      _TESTS = [ @@ -27,7 +27,7 @@ class VKIE(InfoExtractor):                  'id': '162222515',                  'ext': 'flv',                  'title': 'ProtivoGunz - Хуёвая песня', -                'uploader': 'Noize MC', +                'uploader': 're:Noize MC.*',                  'duration': 195,              },          }, @@ -62,11 +62,47 @@ class VKIE(InfoExtractor):                  'id': '164049491',                  'ext': 'mp4',                  'uploader': 'Триллеры', -                'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]\u00a0', +                'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]',                  'duration': 8352,              },              'skip': 'Requires vk account credentials',          }, +        { +            'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a', +            'md5': 'd82c22e449f036282d1d3f7f4d276869', +            'info_dict': { +                'id': '166094326', +                'ext': 'mp4', +                'uploader': 'Киномания - лучшее из мира кино', +                'title': 'Запах женщины (1992)', +                'duration': 9392, +            }, +            'skip': 'Requires vk account credentials', +        }, +        { +            'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d', +            'md5': '4d7a5ef8cf114dfa09577e57b2993202', +            'info_dict': { +                'id': '168067957', +                'ext': 'mp4', +                'uploader': 'Киномания - лучшее из мира кино', +                'title': ' ', +                'duration': 7291, +            }, +            'skip': 'Requires vk account credentials', +        }, +        { +            'url': 'http://m.vk.com/video-43215063_169084319?list=125c627d1aa1cebb83&from=wall-43215063_2566540', +            'md5': '0c45586baa71b7cb1d0784ee3f4e00a6', +            'note': 'ivi.ru embed', +            'info_dict': { +                'id': '60690', +                'ext': 'mp4', +                'title': 'Книга Илая', +                'duration': 6771, +            }, +            'skip': 'Only works from Russia', +        },      ]      def _login(self): @@ -110,6 +146,16 @@ class VKIE(InfoExtractor):          if m_yt is not None:              self.to_screen('Youtube video detected')              return self.url_result(m_yt.group(1), 'Youtube') + +        m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.*?});', info_page) +        if m_opts: +            m_opts_url = re.search(r"url\s*:\s*'([^']+)", m_opts.group(1)) +            if m_opts_url: +                opts_url = m_opts_url.group(1) +                if opts_url.startswith('//'): +                    opts_url = 'http:' + opts_url +                return self.url_result(opts_url) +          data_json = self._search_regex(r'var vars = ({.*?});', info_page, 'vars')          data = json.loads(data_json) diff --git a/youtube_dl/extractor/vodlocker.py b/youtube_dl/extractor/vodlocker.py new file mode 100644 index 000000000..dfc570930 --- /dev/null +++ b/youtube_dl/extractor/vodlocker.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import re +import time +from .common import InfoExtractor +from ..utils import ( +    determine_ext, +    compat_urllib_parse, +    compat_urllib_request, +) + + +class VodlockerIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?vodlocker.com/(?P<id>[0-9a-zA-Z]+)(?:\..*?)?' + +    _TESTS = [{ +        'url': 'http://vodlocker.com/e8wvyzz4sl42', +        'md5': 'ce0c2d18fa0735f1bd91b69b0e54aacf', +        'info_dict': { +            'id': 'e8wvyzz4sl42', +            'ext': 'mp4', +            'title': 'Germany vs Brazil', +            'thumbnail': 're:http://.*\.jpg', +        }, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        webpage = self._download_webpage(url, video_id) + +        fields = dict(re.findall(r'''(?x)<input\s+ +            type="hidden"\s+ +            name="([^"]+)"\s+ +            (?:id="[^"]+"\s+)? +            value="([^"]*)" +            ''', webpage)) + +        if fields['op'] == 'download1': +            self._sleep(3, video_id)  # they do detect when requests happen too fast! +            post = compat_urllib_parse.urlencode(fields) +            req = compat_urllib_request.Request(url, post) +            req.add_header('Content-type', 'application/x-www-form-urlencoded') +            webpage = self._download_webpage( +                req, video_id, 'Downloading video page') + +        title = self._search_regex( +            r'id="file_title".*?>\s*(.*?)\s*<span', webpage, 'title') +        thumbnail = self._search_regex( +            r'image:\s*"(http[^\"]+)",', webpage, 'thumbnail') +        url = self._search_regex( +            r'file:\s*"(http[^\"]+)",', webpage, 'file url') + +        formats = [{ +            'format_id': 'sd', +            'url': url, +        }] + +        return { +            'id': video_id, +            'title': title, +            'thumbnail': thumbnail, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index feeb44b45..f741ba540 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*-  from __future__ import unicode_literals  import re @@ -54,14 +55,14 @@ class WDRIE(InfoExtractor):              },          },          { -            'url': 'http://www.funkhauseuropa.de/av/audiogrenzenlosleckerbaklava101-audioplayer.html', -            'md5': 'cfff440d4ee64114083ac44676df5d15', +            'url': 'http://www.funkhauseuropa.de/av/audiosuepersongsoulbossanova100-audioplayer.html', +            'md5': '24e83813e832badb0a8d7d1ef9ef0691',              'info_dict': { -                'id': 'mdb-363068', +                'id': 'mdb-463528',                  'ext': 'mp3', -                'title': 'Grenzenlos lecker - Baklava', +                'title': 'Süpersong: Soul Bossa Nova',                  'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a', -                'upload_date': '20140311', +                'upload_date': '20140630',              },          },      ] @@ -127,9 +128,10 @@ class WDRMobileIE(InfoExtractor):          'info_dict': {              'title': '4283021',              'id': '421735', +            'ext': 'mp4',              'age_limit': 0,          }, -        '_skip': 'Will be depublicized shortly' +        'skip': 'Problems with loading data.'      }      def _real_extract(self, url): @@ -139,6 +141,7 @@ class WDRMobileIE(InfoExtractor):              'title': mobj.group('title'),              'age_limit': int(mobj.group('age_limit')),              'url': url, +            'ext': determine_ext(url),              'user_agent': 'mobile',          } diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index bc31c2e64..e6bfa9e14 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals +  import json  import re @@ -5,14 +7,16 @@ from .common import InfoExtractor  class WistiaIE(InfoExtractor): -    _VALID_URL = r'^https?://(?:fast\.)?wistia\.net/embed/iframe/(?P<id>[a-z0-9]+)' +    _VALID_URL = r'https?://(?:fast\.)?wistia\.net/embed/iframe/(?P<id>[a-z0-9]+)'      _TEST = { -        u"url": u"http://fast.wistia.net/embed/iframe/sh7fpupwlt", -        u"file": u"sh7fpupwlt.mov", -        u"md5": u"cafeb56ec0c53c18c97405eecb3133df", -        u"info_dict": { -            u"title": u"cfh_resourceful_zdkh_final_1" +        'url': 'http://fast.wistia.net/embed/iframe/sh7fpupwlt', +        'md5': 'cafeb56ec0c53c18c97405eecb3133df', +        'info_dict': { +            'id': 'sh7fpupwlt', +            'ext': 'mov', +            'title': 'Being Resourceful', +            'duration': 117,          },      } @@ -22,7 +26,7 @@ class WistiaIE(InfoExtractor):          webpage = self._download_webpage(url, video_id)          data_json = self._html_search_regex( -            r'Wistia.iframeInit\((.*?), {}\);', webpage, u'video data') +            r'Wistia\.iframeInit\((.*?), {}\);', webpage, 'video data')          data = json.loads(data_json) @@ -54,4 +58,5 @@ class WistiaIE(InfoExtractor):              'title': data['name'],              'formats': formats,              'thumbnails': thumbnails, +            'duration': data.get('duration'),          } diff --git a/youtube_dl/extractor/wrzuta.py b/youtube_dl/extractor/wrzuta.py new file mode 100644 index 000000000..34dd6d952 --- /dev/null +++ b/youtube_dl/extractor/wrzuta.py @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    int_or_none, +    qualities, +) + + +class WrzutaIE(InfoExtractor): +    IE_NAME = 'wrzuta.pl' + +    _VALID_URL = r'https?://(?P<uploader>[0-9a-zA-Z]+)\.wrzuta\.pl/(?P<typ>film|audio)/(?P<id>[0-9a-zA-Z]+)' + +    _TESTS = [{ +        'url': 'http://laboratoriumdextera.wrzuta.pl/film/aq4hIZWrkBu/nike_football_the_last_game', +        'md5': '9e67e05bed7c03b82488d87233a9efe7', +        'info_dict': { +            'id': 'aq4hIZWrkBu', +            'ext': 'mp4', +            'title': 'Nike Football: The Last Game', +            'duration': 307, +            'uploader_id': 'laboratoriumdextera', +            'description': 'md5:7fb5ef3c21c5893375fda51d9b15d9cd', +        }, +    }, { +        'url': 'http://w729.wrzuta.pl/audio/9oXJqdcndqv/david_guetta_amp_showtek_ft._vassy_-_bad', +        'md5': '1e546a18e1c22ac6e9adce17b8961ff5', +        'info_dict': { +            'id': '9oXJqdcndqv', +            'ext': 'ogg', +            'title': 'David Guetta & Showtek ft. Vassy - Bad', +            'duration': 270, +            'uploader_id': 'w729', +            'description': 'md5:4628f01c666bbaaecefa83476cfa794a', +        }, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        typ = mobj.group('typ') +        uploader = mobj.group('uploader') + +        webpage = self._download_webpage(url, video_id) + +        quality = qualities(['SD', 'MQ', 'HQ', 'HD']) + +        audio_table = {'flv': 'mp3', 'webm': 'ogg'} + +        embedpage = self._download_json('http://www.wrzuta.pl/npp/embed/%s/%s' % (uploader, video_id), video_id) + +        formats = [] +        for media in embedpage['url']: +            if typ == 'audio': +                ext = audio_table[media['type'].split('@')[0]] +            else: +                ext = media['type'].split('@')[0] + +            formats.append({ +                'format_id': '%s_%s' % (ext, media['quality'].lower()), +                'url': media['url'], +                'ext': ext, +                'quality': quality(media['quality']), +            }) + +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': self._og_search_title(webpage), +            'thumbnail': self._og_search_thumbnail(webpage), +            'formats': formats, +            'duration': int_or_none(embedpage['duration']), +            'uploader_id': uploader, +            'description': self._og_search_description(webpage), +            'age_limit': embedpage.get('minimalAge', 0), +        } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 7c50881c4..6123e1256 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -224,6 +224,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, +        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          # Dash webm audio          '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50}, @@ -440,7 +441,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):      def _parse_sig_js(self, jscode):          funcname = self._search_regex( -            r'signature=([a-zA-Z]+)', jscode, +            r'signature=([$a-zA-Z]+)', jscode,               u'Initial JS player signature function name')          jsi = JSInterpreter(jscode) @@ -864,71 +865,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):      def _decrypt_signature(self, s, video_id, player_url, age_gate=False):          """Turn the encrypted s field into a working signature""" -        if player_url is not None: -            if player_url.startswith(u'//'): -                player_url = u'https:' + player_url -            try: -                player_id = (player_url, len(s)) -                if player_id not in self._player_cache: -                    func = self._extract_signature_function( -                        video_id, player_url, len(s) -                    ) -                    self._player_cache[player_id] = func -                func = self._player_cache[player_id] -                if self._downloader.params.get('youtube_print_sig_code'): -                    self._print_sig_code(func, len(s)) -                return func(s) -            except Exception: -                tb = traceback.format_exc() -                self._downloader.report_warning( -                    u'Automatic signature extraction failed: ' + tb) - -            self._downloader.report_warning( -                u'Warning: Falling back to static signature algorithm') - -        return self._static_decrypt_signature( -            s, video_id, player_url, age_gate) - -    def _static_decrypt_signature(self, s, video_id, player_url, age_gate): -        if age_gate: -            # The videos with age protection use another player, so the -            # algorithms can be different. -            if len(s) == 86: -                return s[2:63] + s[82] + s[64:82] + s[63] - -        if len(s) == 93: -            return s[86:29:-1] + s[88] + s[28:5:-1] -        elif len(s) == 92: -            return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83] -        elif len(s) == 91: -            return s[84:27:-1] + s[86] + s[26:5:-1] -        elif len(s) == 90: -            return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81] -        elif len(s) == 89: -            return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1] -        elif len(s) == 88: -            return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28] -        elif len(s) == 87: -            return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:] -        elif len(s) == 86: -            return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1] -        elif len(s) == 85: -            return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84] -        elif len(s) == 84: -            return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1] -        elif len(s) == 83: -            return s[80:63:-1] + s[0] + s[62:0:-1] + s[63] -        elif len(s) == 82: -            return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37] -        elif len(s) == 81: -            return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9] -        elif len(s) == 80: -            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80] -        elif len(s) == 79: -            return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9] +        if player_url is None: +            raise ExtractorError(u'Cannot decrypt signature without player_url') -        else: -            raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) +        if player_url.startswith(u'//'): +            player_url = u'https:' + player_url +        try: +            player_id = (player_url, len(s)) +            if player_id not in self._player_cache: +                func = self._extract_signature_function( +                    video_id, player_url, len(s) +                ) +                self._player_cache[player_id] = func +            func = self._player_cache[player_id] +            if self._downloader.params.get('youtube_print_sig_code'): +                self._print_sig_code(func, len(s)) +            return func(s) +        except Exception as e: +            tb = traceback.format_exc() +            raise ExtractorError( +                u'Automatic signature extraction failed: ' + tb, cause=e)      def _get_available_subtitles(self, video_id, webpage):          try: @@ -1386,13 +1342,13 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):                          |  p/                          )                          ( -                            (?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,} +                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}                              # Top tracks, they can also include dots                               |(?:MC)[\w\.]*                          )                          .*                       | -                        ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,}) +                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})                       )"""      _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'      _MORE_PAGES_INDICATOR = r'data-link-type="next"' @@ -1697,14 +1653,14 @@ class YoutubeSearchURLIE(InfoExtractor):          webpage = self._download_webpage(url, query)          result_code = self._search_regex( -            r'(?s)<ol id="search-results"(.*?)</ol>', webpage, u'result HTML') +            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML')          part_codes = re.findall(              r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)          entries = []          for part_code in part_codes:              part_title = self._html_search_regex( -                r'(?s)title="([^"]+)"', part_code, 'item title', fatal=False) +                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)              part_url_snippet = self._html_search_regex(                  r'(?s)href="([^"]+)"', part_code, 'item URL')              part_url = compat_urlparse.urljoin( @@ -1824,10 +1780,21 @@ class YoutubeTruncatedURLIE(InfoExtractor):      IE_NAME = 'youtube:truncated_url'      IE_DESC = False  # Do not list      _VALID_URL = r'''(?x) -        (?:https?://)?[^/]+/watch\?(?:feature=[a-z_]+)?$| +        (?:https?://)?[^/]+/watch\?(?: +            feature=[a-z_]+| +            annotation_id=annotation_[^&]+ +        )?$|          (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$      ''' +    _TESTS = [{ +        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041', +        'only_matching': True, +    }, { +        'url': 'http://www.youtube.com/watch?', +        'only_matching': True, +    }] +      def _real_extract(self, url):          raise ExtractorError(              u'Did you forget to quote the URL? Remember that & is a meta ' diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 449482d3c..3bbb07704 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -59,7 +59,7 @@ class JSInterpreter(object):              if member == 'split("")':                  return list(val)              if member == 'join("")': -                return u''.join(val) +                return ''.join(val)              if member == 'length':                  return len(val)              if member == 'reverse()': @@ -99,7 +99,7 @@ class JSInterpreter(object):      def extract_function(self, funcname):          func_m = re.search( -            (r'(?:function %s|%s\s*=\s*function)' % ( +            (r'(?:function %s|[{;]%s\s*=\s*function)' % (                  re.escape(funcname), re.escape(funcname))) +              r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',              self.code) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b97e62ae9..09312e81a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -816,6 +816,9 @@ def unified_strdate(date_str):          '%d %b %Y',          '%B %d %Y',          '%b %d %Y', +        '%b %dst %Y %I:%M%p', +        '%b %dnd %Y %I:%M%p', +        '%b %dth %Y %I:%M%p',          '%Y-%m-%d',          '%d.%m.%Y',          '%d/%m/%Y', diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0c9dd6895..d6b05892c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.06.09' +__version__ = '2014.07.11' | 
