diff options
91 files changed, 2314 insertions, 1075 deletions
| @@ -124,3 +124,6 @@ Mohammad Teimori Pabandi  Roman Le Négrate  Matthias Küch  Julian Richen +Ping O. +Mister Hat +Peter Ding @@ -17,12 +17,12 @@ youtube-dl - download videos from youtube.com or other video platforms  To install it right away for all UNIX users (Linux, OS X, etc.), type:      sudo curl https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl -    sudo chmod a+x /usr/local/bin/youtube-dl +    sudo chmod a+rx /usr/local/bin/youtube-dl  If you do not have curl, you can alternatively use a recent wget:      sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl -    sudo chmod a+x /usr/local/bin/youtube-dl +    sudo chmod a+rx /usr/local/bin/youtube-dl  Windows users can [download a .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in their home directory or any other location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29). @@ -168,7 +168,7 @@ which means you can modify it, redistribute it or use it however you like.      --no-progress                    Do not print progress bar      --console-title                  Display progress in console titlebar      -v, --verbose                    Print various debugging information -    --dump-pages                     Print downloaded pages to debug problems (very verbose) +    --dump-pages                     Print downloaded pages encoded using base64 to debug problems (very verbose)      --write-pages                    Write downloaded intermediary pages to files in the current directory to debug problems      --print-traffic                  Display sent and read HTTP traffic      -C, --call-home                  Contact the youtube-dl server for debugging @@ -220,7 +220,7 @@ which means you can modify it, redistribute it or use it however you like.      --embed-thumbnail                Embed thumbnail in the audio as cover art      --add-metadata                   Write metadata to the video file      --metadata-from-title FORMAT     Parse additional metadata like song title / artist from the video title. The format syntax is the same as --output, the parsed -                                     parameters replace existing values. Additional templates: %(album), %(artist). Example: --metadata-from-title "%(artist)s - +                                     parameters replace existing values. Additional templates: %(album)s, %(artist)s. Example: --metadata-from-title "%(artist)s -                                       %(title)s" matches a title like "Coldplay - Paradise"      --xattrs                         Write metadata to the video file's xattrs (using dublin core and xdg standards)      --fixup POLICY                   Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn(the default; diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 43fbe8b1d..d147b53fe 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -10,6 +10,7 @@   - **56.com**   - **5min**   - **8tracks** + - **91porn**   - **9gag**   - **abc.net.au**   - **Abc7News** @@ -26,8 +27,7 @@   - **anitube.se**   - **AnySex**   - **Aparat** - - **AppleDailyAnimationNews** - - **AppleDailyRealtimeNews** + - **AppleDaily**   - **AppleTrailers**   - **archive.org**: archive.org videos   - **ARD** @@ -142,6 +142,7 @@   - **Eporner**   - **EroProfile**   - **Escapist** + - **ESPN** (Currently broken)   - **EveryonesMixtape**   - **exfm**: ex.fm   - **ExpoTV** @@ -151,7 +152,6 @@   - **fc2**   - **fernsehkritik.tv**   - **fernsehkritik.tv:postecke** - - **Firedrive**   - **Firstpost**   - **Flickr**   - **Folketinget**: Folketinget (ft.dk; Danish parliament) @@ -229,6 +229,7 @@   - **KanalPlay**: Kanal 5/9/11 Play   - **Kankan**   - **Karaoketv** + - **KarriereVideos**   - **keek**   - **KeezMovies**   - **KhanAcademy** @@ -319,8 +320,10 @@   - **Noco**   - **Normalboots**   - **NosVideo** + - **Nova**: TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz   - **novamov**: NovaMov   - **Nowness** + - **NowTV**   - **nowvideo**: NowVideo   - **npo.nl**   - **npo.nl:live** @@ -338,6 +341,7 @@   - **OktoberfestTV**   - **on.aol.com**   - **Ooyala** + - **OoyalaExternal**   - **OpenFilm**   - **orf:fm4**: radio FM4   - **orf:iptv**: iptv.ORF.at @@ -391,7 +395,6 @@   - **Rte**   - **rtl.nl**: rtl.nl and rtlxl.nl   - **RTL2** - - **RTLnow**   - **RTP**   - **RTS**: RTS.ch   - **rtve.es:alacarta**: RTVE a la carta @@ -429,8 +432,9 @@   - **smotri:community**: Smotri.com community videos   - **smotri:user**: Smotri.com user videos   - **Snotr** - - **Sockshare**   - **Sohu** + - **soompi** + - **soompi:show**   - **soundcloud**   - **soundcloud:playlist**   - **soundcloud:set** @@ -451,6 +455,7 @@   - **Spike**   - **Sport5**   - **SportBox** + - **SportBoxEmbed**   - **SportDeutschland**   - **Srf**   - **SRMediathek**: Saarländischer Rundfunk @@ -504,12 +509,15 @@   - **Trilulilu**   - **TruTube**   - **Tube8** + - **TubiTv**   - **Tudou**   - **Tumblr**   - **TuneIn**   - **Turbo**   - **Tutv**   - **tv.dfb.de** + - **TV2** + - **TV2Article**   - **TV4**: tv4.se and tv4play.se   - **tvigle**: Интернет-телевидение Tvigle.ru   - **tvp.pl** @@ -559,6 +567,7 @@   - **vier:videos**   - **Viewster**   - **viki** + - **viki:channel**   - **vimeo**   - **vimeo:album**   - **vimeo:channel** diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 82b827536..a13c09ef4 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -12,6 +12,7 @@ import copy  from test.helper import FakeYDL, assertRegexpMatches  from youtube_dl import YoutubeDL +from youtube_dl.compat import compat_str  from youtube_dl.extractor import YoutubeIE  from youtube_dl.postprocessor.common import PostProcessor  from youtube_dl.utils import match_filter_func @@ -507,6 +508,51 @@ class TestYoutubeDL(unittest.TestCase):          res = get_videos(f)          self.assertEqual(res, ['1']) +    def test_playlist_items_selection(self): +        entries = [{ +            'id': compat_str(i), +            'title': compat_str(i), +            'url': TEST_URL, +        } for i in range(1, 5)] +        playlist = { +            '_type': 'playlist', +            'id': 'test', +            'entries': entries, +            'extractor': 'test:playlist', +            'extractor_key': 'test:playlist', +            'webpage_url': 'http://example.com', +        } + +        def get_ids(params): +            ydl = YDL(params) +            # make a copy because the dictionary can be modified +            ydl.process_ie_result(playlist.copy()) +            return [int(v['id']) for v in ydl.downloaded_info_dicts] + +        result = get_ids({}) +        self.assertEqual(result, [1, 2, 3, 4]) + +        result = get_ids({'playlistend': 10}) +        self.assertEqual(result, [1, 2, 3, 4]) + +        result = get_ids({'playlistend': 2}) +        self.assertEqual(result, [1, 2]) + +        result = get_ids({'playliststart': 10}) +        self.assertEqual(result, []) + +        result = get_ids({'playliststart': 2}) +        self.assertEqual(result, [2, 3, 4]) + +        result = get_ids({'playlist_items': '2-4'}) +        self.assertEqual(result, [2, 3, 4]) + +        result = get_ids({'playlist_items': '2,4'}) +        self.assertEqual(result, [2, 4]) + +        result = get_ids({'playlist_items': '10'}) +        self.assertEqual(result, []) +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_aes.py b/test/test_aes.py index 4dc7de7b5..315a3f5ae 100644 --- a/test/test_aes.py +++ b/test/test_aes.py @@ -39,7 +39,7 @@ class TestAES(unittest.TestCase):          encrypted = base64.b64encode(              intlist_to_bytes(self.iv[:8]) +              b'\x17\x15\x93\xab\x8d\x80V\xcdV\xe0\t\xcdo\xc2\xa5\xd8ksM\r\xe27N\xae' -        ) +        ).decode('utf-8')          decrypted = (aes_decrypt_text(encrypted, password, 16))          self.assertEqual(decrypted, self.secret_msg) @@ -47,7 +47,7 @@ class TestAES(unittest.TestCase):          encrypted = base64.b64encode(              intlist_to_bytes(self.iv[:8]) +              b'\x0b\xe6\xa4\xd9z\x0e\xb8\xb9\xd0\xd4i_\x85\x1d\x99\x98_\xe5\x80\xe7.\xbf\xa5\x83' -        ) +        ).decode('utf-8')          decrypted = (aes_decrypt_text(encrypted, password, 32))          self.assertEqual(decrypted, self.secret_msg) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 891ee620b..c4e3adb67 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -266,7 +266,7 @@ class TestNRKSubtitles(BaseTestSubtitles):          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles()          self.assertEqual(set(subtitles.keys()), set(['no'])) -        self.assertEqual(md5(subtitles['no']), '1d221e6458c95c5494dcd38e6a1f129a') +        self.assertEqual(md5(subtitles['no']), '544fa917d3197fcbee64634559221cc2')  class TestRaiSubtitles(BaseTestSubtitles): diff --git a/test/test_utils.py b/test/test_utils.py index b40107037..e13e11b59 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -621,6 +621,21 @@ Line  '''          self.assertEqual(dfxp2srt(dfxp_data), srt_data) +        dfxp_data_no_default_namespace = '''<?xml version="1.0" encoding="UTF-8"?> +            <tt xml:lang="en" xmlns:tts="http://www.w3.org/ns/ttml#parameter"> +            <body> +                <div xml:lang="en"> +                    <p begin="0" end="1">The first line</p> +                </div> +            </body> +            </tt>''' +        srt_data = '''1 +00:00:00,000 --> 00:00:01,000 +The first line + +''' +        self.assertEqual(dfxp2srt(dfxp_data_no_default_namespace), srt_data) +  if __name__ == '__main__':      unittest.main() @@ -4,6 +4,8 @@ envlist = py26,py27,py33,py34  deps =     nose     coverage +# We need a valid $HOME for test_compat_expanduser +passenv = HOME  defaultargs = test --exclude test_download.py --exclude test_age_restriction.py      --exclude test_subtitles.py --exclude test_write_annotations.py      --exclude test_youtube_lists.py diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 691f3e09f..aa6ec9d9a 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -49,6 +49,7 @@ from .utils import (      ExtractorError,      format_bytes,      formatSeconds, +    HEADRequest,      locked_file,      make_HTTPS_handler,      MaxDownloadsReached, @@ -759,7 +760,9 @@ class YoutubeDL(object):              if isinstance(ie_entries, list):                  n_all_entries = len(ie_entries)                  if playlistitems: -                    entries = [ie_entries[i - 1] for i in playlistitems] +                    entries = [ +                        ie_entries[i - 1] for i in playlistitems +                        if -n_all_entries <= i - 1 < n_all_entries]                  else:                      entries = ie_entries[playliststart:playlistend]                  n_entries = len(entries) @@ -921,8 +924,9 @@ class YoutubeDL(object):                  if f.get('vcodec') != 'none' and f.get('acodec') != 'none']              if audiovideo_formats:                  return audiovideo_formats[format_idx] -            # for audio only urls, select the best/worst audio format -            elif all(f.get('acodec') != 'none' for f in available_formats): +            # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format +            elif (all(f.get('acodec') != 'none' for f in available_formats) or +                  all(f.get('vcodec') != 'none' for f in available_formats)):                  return available_formats[format_idx]          elif format_spec == 'bestaudio':              audio_formats = [ @@ -1045,6 +1049,8 @@ class YoutubeDL(object):          if not formats:              raise ExtractorError('No video formats found!') +        formats_dict = {} +          # We check that all the formats have the format and format_id fields          for i, format in enumerate(formats):              if 'url' not in format: @@ -1052,6 +1058,18 @@ class YoutubeDL(object):              if format.get('format_id') is None:                  format['format_id'] = compat_str(i) +            format_id = format['format_id'] +            if format_id not in formats_dict: +                formats_dict[format_id] = [] +            formats_dict[format_id].append(format) + +        # Make sure all formats have unique format_id +        for format_id, ambiguous_formats in formats_dict.items(): +            if len(ambiguous_formats) > 1: +                for i, format in enumerate(ambiguous_formats): +                    format['format_id'] = '%s-%d' % (format_id, i) + +        for i, format in enumerate(formats):              if format.get('format') is None:                  format['format'] = '{id} - {res}{note}'.format(                      id=format['format_id'], @@ -1366,7 +1384,7 @@ class YoutubeDL(object):                          postprocessors = []                          self.report_warning('You have requested multiple '                                              'formats but ffmpeg or avconv are not installed.' -                                            ' The formats won\'t be merged') +                                            ' The formats won\'t be merged.')                      else:                          postprocessors = [merger] @@ -1393,8 +1411,8 @@ class YoutubeDL(object):                      requested_formats = info_dict['requested_formats']                      if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):                          info_dict['ext'] = 'mkv' -                        self.report_warning('You have requested formats incompatible for merge. ' -                                            'The formats will be merged into mkv') +                        self.report_warning( +                            'Requested formats are incompatible for merge and will be merged into mkv.')                      # Ensure filename always has a correct extension for successful merge                      filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])                      if os.path.exists(encodeFilename(filename)): @@ -1525,6 +1543,7 @@ class YoutubeDL(object):              pps_chain.extend(ie_info['__postprocessors'])          pps_chain.extend(self._pps)          for pp in pps_chain: +            files_to_delete = []              try:                  files_to_delete, info = pp.run(info)              except PostProcessingError as e: @@ -1703,7 +1722,8 @@ class YoutubeDL(object):              if req_is_string:                  req = url_escaped              else: -                req = compat_urllib_request.Request( +                req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request +                req = req_type(                      url_escaped, data=req.data, headers=req.headers,                      origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py index 07224d508..7817adcfd 100644 --- a/youtube_dl/aes.py +++ b/youtube_dl/aes.py @@ -152,7 +152,7 @@ def aes_decrypt_text(data, password, key_size_bytes):      """      NONCE_LENGTH_BYTES = 8 -    data = bytes_to_intlist(base64.b64decode(data)) +    data = bytes_to_intlist(base64.b64decode(data.encode('utf-8')))      password = bytes_to_intlist(password.encode('utf-8'))      key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password)) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 860023d14..631381eea 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -141,6 +141,7 @@ from .engadget import EngadgetIE  from .eporner import EpornerIE  from .eroprofile import EroProfileIE  from .escapist import EscapistIE +from .espn import ESPNIE  from .everyonesmixtape import EveryonesMixtapeIE  from .exfm import ExfmIE  from .expotv import ExpoTVIE @@ -148,7 +149,6 @@ from .extremetube import ExtremeTubeIE  from .facebook import FacebookIE  from .faz import FazIE  from .fc2 import FC2IE -from .firedrive import FiredriveIE  from .firstpost import FirstpostIE  from .firsttv import FirstTVIE  from .fivemin import FiveMinIE @@ -243,6 +243,7 @@ from .kaltura import KalturaIE  from .kanalplay import KanalPlayIE  from .kankan import KankanIE  from .karaoketv import KaraoketvIE +from .karrierevideos import KarriereVideosIE  from .keezmovies import KeezMoviesIE  from .khanacademy import KhanAcademyIE  from .kickstarter import KickStarterIE @@ -337,8 +338,7 @@ from .newstube import NewstubeIE  from .nextmedia import (      NextMediaIE,      NextMediaActionNewsIE, -    AppleDailyRealtimeNewsIE, -    AppleDailyAnimationNewsIE +    AppleDailyIE,  )  from .nfb import NFBIE  from .nfl import NFLIE @@ -352,8 +352,10 @@ from .ninegag import NineGagIE  from .noco import NocoIE  from .normalboots import NormalbootsIE  from .nosvideo import NosVideoIE +from .nova import NovaIE  from .novamov import NovaMovIE  from .nowness import NownessIE +from .nowtv import NowTVIE  from .nowvideo import NowVideoIE  from .npo import (      NPOIE, @@ -376,7 +378,10 @@ from .nytimes import (  from .nuvid import NuvidIE  from .odnoklassniki import OdnoklassnikiIE  from .oktoberfesttv import OktoberfestTVIE -from .ooyala import OoyalaIE +from .ooyala import ( +    OoyalaIE, +    OoyalaExternalIE, +)  from .openfilm import OpenFilmIE  from .orf import (      ORFTVthekIE, @@ -397,6 +402,7 @@ from .playfm import PlayFMIE  from .playvid import PlayvidIE  from .playwire import PlaywireIE  from .podomatic import PodomaticIE +from .porn91 import Porn91IE  from .pornhd import PornHdIE  from .pornhub import (      PornHubIE, @@ -434,7 +440,6 @@ from .roxwel import RoxwelIE  from .rtbf import RTBFIE  from .rte import RteIE  from .rtlnl import RtlNlIE -from .rtlnow import RTLnowIE  from .rtl2 import RTL2IE  from .rtp import RTPIE  from .rts import RTSIE @@ -477,8 +482,11 @@ from .smotri import (      SmotriBroadcastIE,  )  from .snotr import SnotrIE -from .sockshare import SockshareIE  from .sohu import SohuIE +from .soompi import ( +    SoompiIE, +    SoompiShowIE, +)  from .soundcloud import (      SoundcloudIE,      SoundcloudSetIE, @@ -503,7 +511,10 @@ from .spiegel import SpiegelIE, SpiegelArticleIE  from .spiegeltv import SpiegeltvIE  from .spike import SpikeIE  from .sport5 import Sport5IE -from .sportbox import SportBoxIE +from .sportbox import ( +    SportBoxIE, +    SportBoxEmbedIE, +)  from .sportdeutschland import SportDeutschlandIE  from .srf import SrfIE  from .srmediathek import SRMediathekIE @@ -561,11 +572,16 @@ from .traileraddict import TrailerAddictIE  from .trilulilu import TriluliluIE  from .trutube import TruTubeIE  from .tube8 import Tube8IE +from .tubitv import TubiTvIE  from .tudou import TudouIE  from .tumblr import TumblrIE  from .tunein import TuneInIE  from .turbo import TurboIE  from .tutv import TutvIE +from .tv2 import ( +    TV2IE, +    TV2ArticleIE, +)  from .tv4 import TV4IE  from .tvigle import TvigleIE  from .tvp import TvpIE, TvpSeriesIE @@ -637,7 +653,10 @@ from .vine import (      VineIE,      VineUserIE,  ) -from .viki import VikiIE +from .viki import ( +    VikiIE, +    VikiChannelIE, +)  from .vk import (      VKIE,      VKUserVideosIE, diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py index a117502bc..e0518cf26 100644 --- a/youtube_dl/extractor/aftonbladet.py +++ b/youtube_dl/extractor/aftonbladet.py @@ -6,11 +6,11 @@ from ..utils import int_or_none  class AftonbladetIE(InfoExtractor): -    _VALID_URL = r'http://tv\.aftonbladet\.se/webbtv.+?(?P<id>article[0-9]+)\.ab(?:$|[?#])' +    _VALID_URL = r'http://tv\.aftonbladet\.se/abtv/articles/(?P<id>[0-9]+)'      _TEST = { -        'url': 'http://tv.aftonbladet.se/webbtv/nyheter/vetenskap/rymden/article36015.ab', +        'url': 'http://tv.aftonbladet.se/abtv/articles/36015',          'info_dict': { -            'id': 'article36015', +            'id': '36015',              'ext': 'mp4',              'title': 'Vulkanutbrott i rymden - nu släpper NASA bilderna',              'description': 'Jupiters måne mest aktiv av alla himlakroppar', @@ -25,8 +25,9 @@ class AftonbladetIE(InfoExtractor):          # find internal video meta data          meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json' -        internal_meta_id = self._html_search_regex( -            r'data-aptomaId="([\w\d]+)"', webpage, 'internal_meta_id') +        player_config = self._parse_json(self._html_search_regex( +            r'data-player-config="([^"]+)"', webpage, 'player config'), video_id) +        internal_meta_id = player_config['videoId']          internal_meta_url = meta_url % internal_meta_id          internal_meta_json = self._download_json(              internal_meta_url, video_id, 'Downloading video meta data') diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 8273bd6c9..76de24477 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -7,7 +7,6 @@ from .common import InfoExtractor  from ..utils import (      find_xpath_attr,      unified_strdate, -    get_element_by_id,      get_element_by_attribute,      int_or_none,      qualities, @@ -195,7 +194,9 @@ class ArteTVFutureIE(ArteTVPlus7IE):      def _real_extract(self, url):          anchor_id, lang = self._extract_url_info(url)          webpage = self._download_webpage(url, anchor_id) -        row = get_element_by_id(anchor_id, webpage) +        row = self._search_regex( +            r'(?s)id="%s"[^>]*>.+?(<div[^>]*arte_vp_url[^>]*>)' % anchor_id, +            webpage, 'row')          return self._extract_from_webpage(row, anchor_id, lang) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 7ca835e31..2103ed73a 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals  import re  import itertools +import json +import xml.etree.ElementTree as ET  from .common import InfoExtractor  from ..utils import ( @@ -67,11 +69,19 @@ class BiliBiliIE(InfoExtractor):          entries = [] -        lq_doc = self._download_xml( +        lq_page = self._download_webpage(              'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid,              video_id,              note='Downloading LQ video info'          ) +        try: +            err_info = json.loads(lq_page) +            raise ExtractorError( +                'BiliBili said: ' + err_info['error_text'], expected=True) +        except ValueError: +            pass + +        lq_doc = ET.fromstring(lq_page)          lq_durls = lq_doc.findall('./durl')          hq_doc = self._download_xml( @@ -80,9 +90,11 @@ class BiliBiliIE(InfoExtractor):              note='Downloading HQ video info',              fatal=False,          ) -        hq_durls = hq_doc.findall('./durl') if hq_doc is not False else itertools.repeat(None) - -        assert len(lq_durls) == len(hq_durls) +        if hq_doc is not False: +            hq_durls = hq_doc.findall('./durl') +            assert len(lq_durls) == len(hq_durls) +        else: +            hq_durls = itertools.repeat(None)          i = 1          for lq_durl, hq_durl in zip(lq_durls, hq_durls): diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 7e47960ab..52e61d85b 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -32,7 +32,7 @@ class CBSNewsIE(InfoExtractor):                  'id': 'fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack',                  'ext': 'flv',                  'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack', -                'thumbnail': 'http://cbsnews2.cbsistatic.com/hub/i/r/2014/04/04/0c9fbc66-576b-41ca-8069-02d122060dd2/thumbnail/140x90/6dad7a502f88875ceac38202984b6d58/en-0404-werner-replace-640x360.jpg', +                'thumbnail': 're:^https?://.*\.jpg$',                  'duration': 205,              },              'params': { diff --git a/youtube_dl/extractor/ccc.py b/youtube_dl/extractor/ccc.py index 2a5d4be18..6924eac70 100644 --- a/youtube_dl/extractor/ccc.py +++ b/youtube_dl/extractor/ccc.py @@ -16,7 +16,7 @@ class CCCIE(InfoExtractor):      _TEST = {          'url': 'http://media.ccc.de/browse/congress/2013/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor.html#video', -        'md5': '205a365d0d57c0b1e43a12c9ffe8f9be', +        'md5': '3a1eda8f3a29515d27f5adb967d7e740',          'info_dict': {              'id': '20131228183',              'ext': 'mp4', @@ -51,7 +51,7 @@ class CCCIE(InfoExtractor):          matches = re.finditer(r'''(?xs)              <(?:span|div)\s+class='label\s+filetype'>(?P<format>.*?)</(?:span|div)>\s* -            <a\s+href='(?P<http_url>[^']+)'>\s* +            <a\s+download\s+href='(?P<http_url>[^']+)'>\s*              (?:                  .*?                  <a\s+href='(?P<torrent_url>[^']+\.torrent)' diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py index c922f6959..0206d96db 100644 --- a/youtube_dl/extractor/chilloutzone.py +++ b/youtube_dl/extractor/chilloutzone.py @@ -57,7 +57,7 @@ class ChilloutzoneIE(InfoExtractor):          base64_video_info = self._html_search_regex(              r'var cozVidData = "(.+?)";', webpage, 'video data') -        decoded_video_info = base64.b64decode(base64_video_info).decode("utf-8") +        decoded_video_info = base64.b64decode(base64_video_info.encode('utf-8')).decode('utf-8')          video_info_dict = json.loads(decoded_video_info)          # get video information from dict diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index cf0a7551b..c949a4814 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -60,6 +60,17 @@ class CinemassacreIE(InfoExtractor):                  'uploader_id': 'Cinemassacre',                  'title': 'AVGN: McKids',              } +        }, +        { +            'url': 'http://cinemassacre.com/2015/05/25/mario-kart-64-nintendo-64-james-mike-mondays/', +            'md5': '1376908e49572389e7b06251a53cdd08', +            'info_dict': { +                'id': 'Cinemassacre-555779690c440', +                'ext': 'mp4', +                'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!', +                'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays', +                'upload_date': '20150525', +            }          }      ] @@ -72,7 +83,7 @@ class CinemassacreIE(InfoExtractor):          playerdata_url = self._search_regex(              [ -                r'src="(http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"', +                r'src="(http://(?:player2\.screenwavemedia\.com|player\.screenwavemedia\.com/play)/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',                  r'<iframe[^>]+src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"',              ],              webpage, 'player data URL', default=None) diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 5efc5f4fe..3b1bd4033 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -12,7 +12,7 @@ from ..utils import (  class CNNIE(InfoExtractor):      _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/ -        (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z]{3,5})(?:-ap)?|(?=&)))''' +        (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))'''      _TESTS = [{          'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 65bb77086..cecf917ff 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -786,8 +786,8 @@ class InfoExtractor(object):              return True          except ExtractorError as e:              if isinstance(e.cause, compat_HTTPError): -                self.report_warning( -                    '%s URL is invalid, skipping' % item, video_id) +                self.to_screen( +                    '%s: %s URL is invalid, skipping' % (video_id, item))                  return False              raise diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 1c77df47e..41f0c736d 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -76,8 +76,8 @@ class CrunchyrollIE(InfoExtractor):          self._login()      def _decrypt_subtitles(self, data, iv, id): -        data = bytes_to_intlist(data) -        iv = bytes_to_intlist(iv) +        data = bytes_to_intlist(base64.b64decode(data.encode('utf-8'))) +        iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8')))          id = int(id)          def obfuscate_key_aux(count, modulo, start): @@ -179,6 +179,16 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text          return output +    def _extract_subtitles(self, subtitle): +        sub_root = xml.etree.ElementTree.fromstring(subtitle) +        return [{ +            'ext': 'srt', +            'data': self._convert_subtitles_to_srt(sub_root), +        }, { +            'ext': 'ass', +            'data': self._convert_subtitles_to_ass(sub_root), +        }] +      def _get_subtitles(self, video_id, webpage):          subtitles = {}          for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage): @@ -190,25 +200,11 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text              data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False)              if not id or not iv or not data:                  continue -            id = int(id) -            iv = base64.b64decode(iv) -            data = base64.b64decode(data) -              subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8')              lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)              if not lang_code:                  continue -            sub_root = xml.etree.ElementTree.fromstring(subtitle) -            subtitles[lang_code] = [ -                { -                    'ext': 'srt', -                    'data': self._convert_subtitles_to_srt(sub_root), -                }, -                { -                    'ext': 'ass', -                    'data': self._convert_subtitles_to_ass(sub_root), -                }, -            ] +            subtitles[lang_code] = self._extract_subtitles(subtitle)          return subtitles      def _real_extract(self, url): diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index db10b8d00..70aa4333c 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -225,7 +225,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):  class DailymotionUserIE(DailymotionPlaylistIE):      IE_NAME = 'dailymotion:user' -    _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:old/)?user/(?P<user>[^/]+)' +    _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:(?:old/)?user/)?(?P<user>[^/]+)$'      _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'      _TESTS = [{          'url': 'https://www.dailymotion.com/user/nqtv', @@ -239,7 +239,8 @@ class DailymotionUserIE(DailymotionPlaylistIE):      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          user = mobj.group('user') -        webpage = self._download_webpage(url, user) +        webpage = self._download_webpage( +            'https://www.dailymotion.com/user/%s' % user, user)          full_user = unescapeHTML(self._html_search_regex(              r'<a class="nav-image" title="([^"]+)" href="/%s">' % re.escape(user),              webpage, 'user')) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index f25ab319e..baa24c6d1 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -1,8 +1,11 @@  # coding: utf-8  from __future__ import unicode_literals -from .common import InfoExtractor, ExtractorError -from ..utils import parse_iso8601 +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    parse_iso8601, +)  class DRTVIE(InfoExtractor): @@ -60,19 +63,31 @@ class DRTVIE(InfoExtractor):                  restricted_to_denmark = asset['RestrictedToDenmark']                  spoken_subtitles = asset['Target'] == 'SpokenSubtitles'                  for link in asset['Links']: -                    target = link['Target']                      uri = link['Uri'] +                    target = link['Target']                      format_id = target -                    preference = -1 if target == 'HDS' else -2 +                    preference = None                      if spoken_subtitles: -                        preference -= 2 +                        preference = -1                          format_id += '-spoken-subtitles' -                    formats.append({ -                        'url': uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43' if target == 'HDS' else uri, -                        'format_id': format_id, -                        'ext': link['FileFormat'], -                        'preference': preference, -                    }) +                    if target == 'HDS': +                        formats.extend(self._extract_f4m_formats( +                            uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43', +                            video_id, preference, f4m_id=format_id)) +                    elif target == 'HLS': +                        formats.extend(self._extract_m3u8_formats( +                            uri, video_id, 'mp4', preference=preference, +                            m3u8_id=format_id)) +                    else: +                        bitrate = link.get('Bitrate') +                        if bitrate: +                            format_id += '-%s' % bitrate +                        formats.append({ +                            'url': uri, +                            'format_id': format_id, +                            'tbr': bitrate, +                            'ext': link.get('FileFormat'), +                        })                  subtitles_list = asset.get('SubtitlesList')                  if isinstance(subtitles_list, list):                      LANGS = { diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py index 70f8efe27..4827022e0 100644 --- a/youtube_dl/extractor/empflix.py +++ b/youtube_dl/extractor/empflix.py @@ -4,22 +4,28 @@ from .tnaflix import TNAFlixIE  class EMPFlixIE(TNAFlixIE): -    _VALID_URL = r'^https?://www\.empflix\.com/videos/(?P<display_id>[0-9a-zA-Z-]+)-(?P<id>[0-9]+)\.html' +    _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P<display_id>.+?)-(?P<id>[0-9]+)\.html'      _TITLE_REGEX = r'name="title" value="(?P<title>[^"]*)"'      _DESCRIPTION_REGEX = r'name="description" value="([^"]*)"'      _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"' -    _TEST = { -        'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', -        'md5': 'b1bc15b6412d33902d6e5952035fcabc', -        'info_dict': { -            'id': '33051', -            'display_id': 'Amateur-Finger-Fuck', -            'ext': 'mp4', -            'title': 'Amateur Finger Fuck', -            'description': 'Amateur solo finger fucking.', -            'thumbnail': 're:https?://.*\.jpg$', -            'age_limit': 18, +    _TESTS = [ +        { +            'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', +            'md5': 'b1bc15b6412d33902d6e5952035fcabc', +            'info_dict': { +                'id': '33051', +                'display_id': 'Amateur-Finger-Fuck', +                'ext': 'mp4', +                'title': 'Amateur Finger Fuck', +                'description': 'Amateur solo finger fucking.', +                'thumbnail': 're:https?://.*\.jpg$', +                'age_limit': 18, +            } +        }, +        { +            'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html', +            'only_matching': True,          } -    } +    ] diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py new file mode 100644 index 000000000..e6f8f0337 --- /dev/null +++ b/youtube_dl/extractor/espn.py @@ -0,0 +1,55 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class ESPNIE(InfoExtractor): +    _VALID_URL = r'https?://espn\.go\.com/(?:[^/]+/)*(?P<id>[^/]+)' +    _WORKING = False +    _TESTS = [{ +        'url': 'http://espn.go.com/video/clip?id=10365079', +        'info_dict': { +            'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG', +            'ext': 'mp4', +            'title': 'dm_140128_30for30Shorts___JudgingJewellv2', +            'description': '', +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }, +    }, { +        'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079', +        'only_matching': True, +    }, { +        'url': 'http://espn.go.com/nba/recap?gameId=400793786', +        'only_matching': True, +    }, { +        'url': 'http://espn.go.com/blog/golden-state-warriors/post/_/id/593/how-warriors-rapidly-regained-a-winning-edge', +        'only_matching': True, +    }, { +        'url': 'http://espn.go.com/sports/endurance/story/_/id/12893522/dzhokhar-tsarnaev-sentenced-role-boston-marathon-bombings', +        'only_matching': True, +    }, { +        'url': 'http://espn.go.com/nba/playoffs/2015/story/_/id/12887571/john-wall-washington-wizards-no-swelling-left-hand-wrist-game-5-return', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        video_id = self._search_regex( +            r'class="video-play-button"[^>]+data-id="(\d+)', +            webpage, 'video id') + +        player = self._download_webpage( +            'https://espn.go.com/video/iframe/twitter/?id=%s' % video_id, video_id) + +        pcode = self._search_regex( +            r'["\']pcode=([^"\']+)["\']', player, 'pcode') + +        return self.url_result( +            'ooyalaexternal:espn:%s:%s' % (video_id, pcode), +            'OoyalaExternal') diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 937b28fcc..82dc27bc6 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -50,7 +50,10 @@ class FacebookIE(InfoExtractor):              'id': '274175099429670',              'ext': 'mp4',              'title': 'Facebook video #274175099429670', -        } +        }, +        'expected_warnings': [ +            'title' +        ]      }, {          'url': 'https://www.facebook.com/video.php?v=10204634152394104',          'only_matching': True, @@ -149,12 +152,12 @@ class FacebookIE(InfoExtractor):              raise ExtractorError('Cannot find video formats')          video_title = self._html_search_regex( -            r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title', -            fatal=False) +            r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 'title', +            default=None)          if not video_title:              video_title = self._html_search_regex(                  r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>', -                webpage, 'alternative title', default=None) +                webpage, 'alternative title', fatal=False)              video_title = limit_length(video_title, 80)          if not video_title:              video_title = 'Facebook video #%s' % video_id diff --git a/youtube_dl/extractor/firedrive.py b/youtube_dl/extractor/firedrive.py deleted file mode 100644 index 3191116d9..000000000 --- a/youtube_dl/extractor/firedrive.py +++ /dev/null @@ -1,80 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( -    compat_urllib_parse, -    compat_urllib_request, -) -from ..utils import ( -    ExtractorError, -) - - -class FiredriveIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?firedrive\.com/' + \ -                 '(?:file|embed)/(?P<id>[0-9a-zA-Z]+)' -    _FILE_DELETED_REGEX = r'<div class="removed_file_image">' - -    _TESTS = [{ -        'url': 'https://www.firedrive.com/file/FEB892FA160EBD01', -        'md5': 'd5d4252f80ebeab4dc2d5ceaed1b7970', -        'info_dict': { -            'id': 'FEB892FA160EBD01', -            'ext': 'flv', -            'title': 'bbb_theora_486kbit.flv', -            'thumbnail': 're:^http://.*\.jpg$', -        }, -    }] - -    def _real_extract(self, url): -        video_id = self._match_id(url) -        url = 'http://firedrive.com/file/%s' % video_id -        webpage = self._download_webpage(url, video_id) - -        if re.search(self._FILE_DELETED_REGEX, webpage) is not None: -            raise ExtractorError('Video %s does not exist' % video_id, -                                 expected=True) - -        fields = dict(re.findall(r'''(?x)<input\s+ -            type="hidden"\s+ -            name="([^"]+)"\s+ -            value="([^"]*)" -            ''', webpage)) - -        post = compat_urllib_parse.urlencode(fields) -        req = compat_urllib_request.Request(url, post) -        req.add_header('Content-type', 'application/x-www-form-urlencoded') - -        # Apparently, this header is required for confirmation to work. -        req.add_header('Host', 'www.firedrive.com') - -        webpage = self._download_webpage(req, video_id, -                                         'Downloading video page') - -        title = self._search_regex(r'class="external_title_left">(.+)</div>', -                                   webpage, 'title') -        thumbnail = self._search_regex(r'image:\s?"(//[^\"]+)', webpage, -                                       'thumbnail', fatal=False) -        if thumbnail is not None: -            thumbnail = 'http:' + thumbnail - -        ext = self._search_regex(r'type:\s?\'([^\']+)\',', -                                 webpage, 'extension', fatal=False) -        video_url = self._search_regex( -            r'file:\s?loadURL\(\'(http[^\']+)\'\),', webpage, 'file url') - -        formats = [{ -            'format_id': 'sd', -            'url': video_url, -            'ext': ext, -        }] - -        return { -            'id': video_id, -            'title': title, -            'thumbnail': thumbnail, -            'formats': formats, -        } diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 47373e215..2d33fa7f5 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -14,8 +14,8 @@ from ..utils import (  class GameSpotIE(InfoExtractor): -    _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?' -    _TEST = { +    _VALID_URL = r'http://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?' +    _TESTS = [{          'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/',          'md5': 'b2a30deaa8654fcccd43713a6b6a4825',          'info_dict': { @@ -23,8 +23,16 @@ class GameSpotIE(InfoExtractor):              'ext': 'mp4',              'title': 'Arma 3 - Community Guide: SITREP I',              'description': 'Check out this video where some of the basics of Arma 3 is explained.', -        } -    } +        }, +    }, { +        'url': 'http://www.gamespot.com/videos/the-witcher-3-wild-hunt-xbox-one-now-playing/2300-6424837/', +        'info_dict': { +            'id': 'gs-2300-6424837', +            'ext': 'flv', +            'title': 'The Witcher 3: Wild Hunt [Xbox ONE]  - Now Playing', +            'description': 'Join us as we take a look at the early hours of The Witcher 3: Wild Hunt and more.', +        }, +    }]      def _real_extract(self, url):          page_id = self._match_id(url) @@ -32,25 +40,37 @@ class GameSpotIE(InfoExtractor):          data_video_json = self._search_regex(              r'data-video=["\'](.*?)["\']', webpage, 'data video')          data_video = json.loads(unescapeHTML(data_video_json)) +        streams = data_video['videoStreams'] -        # Transform the manifest url to a link to the mp4 files -        # they are used in mobile devices. -        f4m_url = data_video['videoStreams']['f4m_stream'] -        f4m_path = compat_urlparse.urlparse(f4m_url).path -        QUALITIES_RE = r'((,\d+)+,?)' -        qualities = self._search_regex(QUALITIES_RE, f4m_path, 'qualities').strip(',').split(',') -        http_path = f4m_path[1:].split('/', 1)[1] -        http_template = re.sub(QUALITIES_RE, r'%s', http_path) -        http_template = http_template.replace('.csmil/manifest.f4m', '') -        http_template = compat_urlparse.urljoin( -            'http://video.gamespotcdn.com/', http_template)          formats = [] -        for q in qualities: -            formats.append({ -                'url': http_template % q, -                'ext': 'mp4', -                'format_id': q, -            }) +        f4m_url = streams.get('f4m_stream') +        if f4m_url is not None: +            # Transform the manifest url to a link to the mp4 files +            # they are used in mobile devices. +            f4m_path = compat_urlparse.urlparse(f4m_url).path +            QUALITIES_RE = r'((,\d+)+,?)' +            qualities = self._search_regex(QUALITIES_RE, f4m_path, 'qualities').strip(',').split(',') +            http_path = f4m_path[1:].split('/', 1)[1] +            http_template = re.sub(QUALITIES_RE, r'%s', http_path) +            http_template = http_template.replace('.csmil/manifest.f4m', '') +            http_template = compat_urlparse.urljoin( +                'http://video.gamespotcdn.com/', http_template) +            for q in qualities: +                formats.append({ +                    'url': http_template % q, +                    'ext': 'mp4', +                    'format_id': q, +                }) +        else: +            for quality in ['sd', 'hd']: +                # It's actually a link to a flv file +                flv_url = streams.get('f4m_{0}'.format(quality)) +                if flv_url is not None: +                    formats.append({ +                        'url': flv_url, +                        'ext': 'flv', +                        'format_id': quality, +                    })          return {              'id': data_video['guid'], diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 3d756e848..96ca398de 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -9,6 +9,8 @@ from .common import InfoExtractor  from .youtube import YoutubeIE  from ..compat import (      compat_urllib_parse, +    compat_urllib_parse_unquote, +    compat_urllib_request,      compat_urlparse,      compat_xml_parse_error,  ) @@ -32,6 +34,7 @@ from .brightcove import BrightcoveIE  from .nbc import NBCSportsVPlayerIE  from .ooyala import OoyalaIE  from .rutv import RUTVIE +from .sportbox import SportBoxEmbedIE  from .smotri import SmotriIE  from .condenast import CondeNastIE  from .udn import UDNEmbedIE @@ -45,6 +48,97 @@ class GenericIE(InfoExtractor):      _VALID_URL = r'.*'      IE_NAME = 'generic'      _TESTS = [ +        # Direct link to a video +        { +            'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', +            'md5': '67d406c2bcb6af27fa886f31aa934bbe', +            'info_dict': { +                'id': 'trailer', +                'ext': 'mp4', +                'title': 'trailer', +                'upload_date': '20100513', +            } +        }, +        # Direct link to media delivered compressed (until Accept-Encoding is *) +        { +            'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac', +            'md5': '128c42e68b13950268b648275386fc74', +            'info_dict': { +                'id': 'FictionJunction-Parallel_Hearts', +                'ext': 'flac', +                'title': 'FictionJunction-Parallel_Hearts', +                'upload_date': '20140522', +            }, +            'expected_warnings': [ +                'URL could be a direct video link, returning it as such.' +            ] +        }, +        # Direct download with broken HEAD +        { +            'url': 'http://ai-radio.org:8000/radio.opus', +            'info_dict': { +                'id': 'radio', +                'ext': 'opus', +                'title': 'radio', +            }, +            'params': { +                'skip_download': True,  # infinite live stream +            }, +            'expected_warnings': [ +                r'501.*Not Implemented' +            ], +        }, +        # Direct link with incorrect MIME type +        { +            'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', +            'md5': '4ccbebe5f36706d85221f204d7eb5913', +            'info_dict': { +                'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', +                'id': '5_Lennart_Poettering_-_Systemd', +                'ext': 'webm', +                'title': '5_Lennart_Poettering_-_Systemd', +                'upload_date': '20141120', +            }, +            'expected_warnings': [ +                'URL could be a direct video link, returning it as such.' +            ] +        }, +        # RSS feed +        { +            'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', +            'info_dict': { +                'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', +                'title': 'Zero Punctuation', +                'description': 're:.*groundbreaking video review series.*' +            }, +            'playlist_mincount': 11, +        }, +        # RSS feed with enclosure +        { +            'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', +            'info_dict': { +                'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', +                'ext': 'm4v', +                'upload_date': '20150228', +                'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', +            } +        }, +        # google redirect +        { +            'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', +            'info_dict': { +                'id': 'cmQHVoWB5FY', +                'ext': 'mp4', +                'upload_date': '20130224', +                'uploader_id': 'TheVerge', +                'description': 're:^Chris Ziegler takes a look at the\.*', +                'uploader': 'The Verge', +                'title': 'First Firefox OS phones side-by-side', +            }, +            'params': { +                'skip_download': False, +            } +        },          {              'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',              'md5': '85b90ccc9d73b4acd9138d3af4c27f89', @@ -124,17 +218,6 @@ class GenericIE(InfoExtractor):                  'skip_download': True,  # m3u8 download              },          }, -        # Direct link to a video -        { -            'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', -            'md5': '67d406c2bcb6af27fa886f31aa934bbe', -            'info_dict': { -                'id': 'trailer', -                'ext': 'mp4', -                'title': 'trailer', -                'upload_date': '20100513', -            } -        },          # ooyala video          {              'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', @@ -159,22 +242,6 @@ class GenericIE(InfoExtractor):              },              'add_ie': ['Ooyala'],          }, -        # google redirect -        { -            'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', -            'info_dict': { -                'id': 'cmQHVoWB5FY', -                'ext': 'mp4', -                'upload_date': '20130224', -                'uploader_id': 'TheVerge', -                'description': 're:^Chris Ziegler takes a look at the\.*', -                'uploader': 'The Verge', -                'title': 'First Firefox OS phones side-by-side', -            }, -            'params': { -                'skip_download': False, -            } -        },          # embed.ly video          {              'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/', @@ -224,6 +291,37 @@ class GenericIE(InfoExtractor):                  'skip_download': True,              },          }, +        # SportBox embed +        { +            'url': 'http://www.vestifinance.ru/articles/25753', +            'info_dict': { +                'id': '25753', +                'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"', +            }, +            'playlist': [{ +                'info_dict': { +                    'id': '370908', +                    'title': 'Госзаказ. День 3', +                    'ext': 'mp4', +                } +            }, { +                'info_dict': { +                    'id': '370905', +                    'title': 'Госзаказ. День 2', +                    'ext': 'mp4', +                } +            }, { +                'info_dict': { +                    'id': '370902', +                    'title': 'Госзаказ. День 1', +                    'ext': 'mp4', +                } +            }], +            'params': { +                # m3u8 download +                'skip_download': True, +            }, +        },          # Embedded TED video          {              'url': 'http://en.support.wordpress.com/videos/ted-talks/', @@ -375,16 +473,6 @@ class GenericIE(InfoExtractor):                  'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',              }          }, -        # RSS feed -        { -            'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', -            'info_dict': { -                'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', -                'title': 'Zero Punctuation', -                'description': 're:.*groundbreaking video review series.*' -            }, -            'playlist_mincount': 11, -        },          # Multiple brightcove videos          # https://github.com/rg3/youtube-dl/issues/2283          { @@ -438,21 +526,6 @@ class GenericIE(InfoExtractor):                  'uploader': 'thoughtworks.wistia.com',              },          }, -        # Direct download with broken HEAD -        { -            'url': 'http://ai-radio.org:8000/radio.opus', -            'info_dict': { -                'id': 'radio', -                'ext': 'opus', -                'title': 'radio', -            }, -            'params': { -                'skip_download': True,  # infinite live stream -            }, -            'expected_warnings': [ -                r'501.*Not Implemented' -            ], -        },          # Soundcloud embed          {              'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/', @@ -484,21 +557,6 @@ class GenericIE(InfoExtractor):              },              'playlist_mincount': 2,          }, -        # Direct link with incorrect MIME type -        { -            'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', -            'md5': '4ccbebe5f36706d85221f204d7eb5913', -            'info_dict': { -                'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', -                'id': '5_Lennart_Poettering_-_Systemd', -                'ext': 'webm', -                'title': '5_Lennart_Poettering_-_Systemd', -                'upload_date': '20141120', -            }, -            'expected_warnings': [ -                'URL could be a direct video link, returning it as such.' -            ] -        },          # Cinchcast embed          {              'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/', @@ -657,16 +715,6 @@ class GenericIE(InfoExtractor):                  'age_limit': 0,              },          }, -        # RSS feed with enclosure -        { -            'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', -            'info_dict': { -                'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', -                'ext': 'm4v', -                'upload_date': '20150228', -                'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', -            } -        },          # Crooks and Liars embed          {              'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists', @@ -862,7 +910,7 @@ class GenericIE(InfoExtractor):              force_videoid = smuggled_data['force_videoid']              video_id = force_videoid          else: -            video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0] +            video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])          self.to_screen('%s: Requesting header' % video_id) @@ -884,7 +932,9 @@ class GenericIE(InfoExtractor):          full_response = None          if head_response is False: -            full_response = self._request_webpage(url, video_id) +            request = compat_urllib_request.Request(url) +            request.add_header('Accept-Encoding', '*') +            full_response = self._request_webpage(request, video_id)              head_response = full_response          # Check for direct link to a video @@ -895,7 +945,7 @@ class GenericIE(InfoExtractor):                  head_response.headers.get('Last-Modified'))              return {                  'id': video_id, -                'title': os.path.splitext(url_basename(url))[0], +                'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),                  'direct': True,                  'formats': [{                      'format_id': m.group('format_id'), @@ -909,7 +959,17 @@ class GenericIE(InfoExtractor):              self._downloader.report_warning('Falling back on generic information extractor.')          if not full_response: -            full_response = self._request_webpage(url, video_id) +            request = compat_urllib_request.Request(url) +            # Some webservers may serve compressed content of rather big size (e.g. gzipped flac) +            # making it impossible to download only chunk of the file (yet we need only 512kB to +            # test whether it's HTML or not). According to youtube-dl default Accept-Encoding +            # that will always result in downloading the whole file that is not desirable. +            # Therefore for extraction pass we have to override Accept-Encoding to any in order +            # to accept raw bytes and being able to download only a chunk. +            # It may probably better to solve this by checking Content-Type for application/octet-stream +            # after HEAD request finishes, but not sure if we can rely on this. +            request.add_header('Accept-Encoding', '*') +            full_response = self._request_webpage(request, video_id)          # Maybe it's a direct link to a video?          # Be careful not to download the whole thing! @@ -921,7 +981,7 @@ class GenericIE(InfoExtractor):                  head_response.headers.get('Last-Modified'))              return {                  'id': video_id, -                'title': os.path.splitext(url_basename(url))[0], +                'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),                  'direct': True,                  'url': url,                  'upload_date': upload_date, @@ -1229,6 +1289,11 @@ class GenericIE(InfoExtractor):          if rutv_url:              return self.url_result(rutv_url, 'RUTV') +        # Look for embedded SportBox player +        sportbox_urls = SportBoxEmbedIE._extract_urls(webpage) +        if sportbox_urls: +            return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed') +          # Look for embedded TED player          mobj = re.search(              r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage) @@ -1388,7 +1453,7 @@ class GenericIE(InfoExtractor):          # Look for Senate ISVP iframe          senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)          if senate_isvp_url: -            return self.url_result(surl, 'SenateISVP') +            return self.url_result(senate_isvp_url, 'SenateISVP')          def check_video(vurl):              if YoutubeIE.suitable(vurl): diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index fe5d95e2c..d692ea79a 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from ..compat import compat_urlparse  from ..utils import (      int_or_none,      js_to_json, @@ -12,7 +13,7 @@ from ..utils import (  class ImgurIE(InfoExtractor): -    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P<id>[a-zA-Z0-9]+)(?:\.mp4|\.gifv)?' +    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P<id>[a-zA-Z0-9]+)'      _TESTS = [{          'url': 'https://i.imgur.com/A61SaA1.gifv', @@ -34,7 +35,8 @@ class ImgurIE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url) -        webpage = self._download_webpage(url, video_id) +        webpage = self._download_webpage( +            compat_urlparse.urljoin(url, video_id), video_id)          width = int_or_none(self._search_regex(              r'<param name="width" value="([0-9]+)"', diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 65f6ca103..b10755788 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -7,9 +7,9 @@ from ..utils import int_or_none  class InstagramIE(InfoExtractor): -    _VALID_URL = r'https?://instagram\.com/p/(?P<id>[\da-zA-Z]+)' +    _VALID_URL = r'https://instagram\.com/p/(?P<id>[\da-zA-Z]+)'      _TEST = { -        'url': 'http://instagram.com/p/aye83DjauH/?foo=bar#abc', +        'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',          'md5': '0d2da106a9d2631273e192b372806516',          'info_dict': {              'id': 'aye83DjauH', @@ -41,11 +41,11 @@ class InstagramIE(InfoExtractor):  class InstagramUserIE(InfoExtractor): -    _VALID_URL = r'http://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])' +    _VALID_URL = r'https://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])'      IE_DESC = 'Instagram user profile'      IE_NAME = 'instagram:user'      _TEST = { -        'url': 'http://instagram.com/porsche', +        'url': 'https://instagram.com/porsche',          'info_dict': {              'id': 'porsche',              'title': 'porsche', diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index 8529bedfc..821c8ec10 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -11,11 +11,12 @@ from ..compat import (  )  from ..utils import (      ExtractorError, +    remove_end,  )  class IPrimaIE(InfoExtractor): -    _VALID_URL = r'https?://play\.iprima\.cz/[^?#]+/(?P<id>[^?#]+)' +    _VALID_URL = r'https?://play\.iprima\.cz/(?:[^/]+/)*(?P<id>[^?#]+)'      _TESTS = [{          'url': 'http://play.iprima.cz/particka/particka-92', @@ -23,7 +24,7 @@ class IPrimaIE(InfoExtractor):              'id': '39152',              'ext': 'flv',              'title': 'Partička (92)', -            'description': 'md5:3740fda51464da35a2d4d0670b8e4fd6', +            'description': 'md5:74e9617e51bca67c3ecfb2c6f9766f45',              'thumbnail': 'http://play.iprima.cz/sites/default/files/image_crops/image_620x349/3/491483_particka-92_image_620x349.jpg',          },          'params': { @@ -35,13 +36,14 @@ class IPrimaIE(InfoExtractor):              'id': '9718337',              'ext': 'flv',              'title': 'Tchibo Partička - Jarní móda', -            'description': 'md5:589f8f59f414220621ff8882eb3ce7be',              'thumbnail': 're:^http:.*\.jpg$',          },          'params': {              'skip_download': True,  # requires rtmpdump          }, -        'skip': 'Do not have permission to access this page', +    }, { +        'url': 'http://play.iprima.cz/zpravy-ftv-prima-2752015', +        'only_matching': True,      }]      def _real_extract(self, url): @@ -102,8 +104,10 @@ class IPrimaIE(InfoExtractor):          return {              'id': real_id, -            'title': self._og_search_title(webpage), +            'title': remove_end(self._og_search_title(webpage), ' | Prima PLAY'),              'thumbnail': self._og_search_thumbnail(webpage),              'formats': formats, -            'description': self._og_search_description(webpage), +            'description': self._search_regex( +                r'<p[^>]+itemprop="description"[^>]*>([^<]+)', +                webpage, 'description', default=None),          } diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py index 99a1361f8..bc226fa67 100644 --- a/youtube_dl/extractor/izlesene.py +++ b/youtube_dl/extractor/izlesene.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote  from ..utils import (      determine_ext,      float_or_none, @@ -30,7 +31,7 @@ class IzleseneIE(InfoExtractor):                  'description': 'md5:253753e2655dde93f59f74b572454f6d',                  'thumbnail': 're:^http://.*\.jpg',                  'uploader_id': 'pelikzzle', -                'timestamp': 1404302298, +                'timestamp': int,                  'upload_date': '20140702',                  'duration': 95.395,                  'age_limit': 0, @@ -46,7 +47,7 @@ class IzleseneIE(InfoExtractor):                  'description': 'Tarkan Dortmund 2006 Konseri',                  'thumbnail': 're:^http://.*\.jpg',                  'uploader_id': 'parlayankiz', -                'timestamp': 1163322193, +                'timestamp': int,                  'upload_date': '20061112',                  'duration': 253.666,                  'age_limit': 0, @@ -67,9 +68,9 @@ class IzleseneIE(InfoExtractor):          uploader = self._html_search_regex(              r"adduserUsername\s*=\s*'([^']+)';", -            webpage, 'uploader', fatal=False, default='') +            webpage, 'uploader', fatal=False)          timestamp = parse_iso8601(self._html_search_meta( -            'uploadDate', webpage, 'upload date', fatal=False)) +            'uploadDate', webpage, 'upload date'))          duration = float_or_none(self._html_search_regex(              r'"videoduration"\s*:\s*"([^"]+)"', @@ -86,8 +87,7 @@ class IzleseneIE(InfoExtractor):          # Might be empty for some videos.          streams = self._html_search_regex( -            r'"qualitylevel"\s*:\s*"([^"]+)"', -            webpage, 'streams', fatal=False, default='') +            r'"qualitylevel"\s*:\s*"([^"]+)"', webpage, 'streams', default='')          formats = []          if streams: @@ -95,15 +95,15 @@ class IzleseneIE(InfoExtractor):                  quality, url = re.search(r'\[(\w+)\](.+)', stream).groups()                  formats.append({                      'format_id': '%sp' % quality if quality else 'sd', -                    'url': url, +                    'url': compat_urllib_parse_unquote(url),                      'ext': ext,                  })          else:              stream_url = self._search_regex( -                r'"streamurl"\s?:\s?"([^"]+)"', webpage, 'stream URL') +                r'"streamurl"\s*:\s*"([^"]+)"', webpage, 'stream URL')              formats.append({                  'format_id': 'sd', -                'url': stream_url, +                'url': compat_urllib_parse_unquote(stream_url),                  'ext': ext,              }) diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py new file mode 100644 index 000000000..bed94bc93 --- /dev/null +++ b/youtube_dl/extractor/karrierevideos.py @@ -0,0 +1,96 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( +    fix_xml_ampersands, +    float_or_none, +    xpath_with_ns, +    xpath_text, +) + + +class KarriereVideosIE(InfoExtractor): +    _VALID_URL = r'http://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)' +    _TESTS = [{ +        'url': 'http://www.karrierevideos.at/berufsvideos/mittlere-hoehere-schulen/altenpflegerin', +        'info_dict': { +            'id': '32c91', +            'ext': 'flv', +            'title': 'AltenpflegerIn', +            'description': 'md5:dbadd1259fde2159a9b28667cb664ae2', +            'thumbnail': 're:^http://.*\.png', +        }, +        'params': { +            # rtmp download +            'skip_download': True, +        } +    }, { +        # broken ampersands +        'url': 'http://www.karrierevideos.at/orientierung/vaeterkarenz-und-neue-chancen-fuer-muetter-baby-was-nun', +        'info_dict': { +            'id': '5sniu', +            'ext': 'flv', +            'title': 'Väterkarenz und neue Chancen für Mütter - "Baby - was nun?"', +            'description': 'md5:97092c6ad1fd7d38e9d6a5fdeb2bcc33', +            'thumbnail': 're:^http://.*\.png', +        }, +        'params': { +            # rtmp download +            'skip_download': True, +        } +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        title = (self._html_search_meta('title', webpage, default=None) or +                 self._search_regex(r'<h1 class="title">([^<]+)</h1>')) + +        video_id = self._search_regex( +            r'/config/video/(.+?)\.xml', webpage, 'video id') +        playlist = self._download_xml( +            'http://www.karrierevideos.at/player-playlist.xml.php?p=%s' % video_id, +            video_id, transform_source=fix_xml_ampersands) + +        NS_MAP = { +            'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats' +        } + +        def ns(path): +            return xpath_with_ns(path, NS_MAP) + +        item = playlist.find('./tracklist/item') +        video_file = xpath_text( +            item, ns('./jwplayer:file'), 'video url', fatal=True) +        streamer = xpath_text( +            item, ns('./jwplayer:streamer'), 'streamer', fatal=True) + +        uploader = xpath_text( +            item, ns('./jwplayer:author'), 'uploader') +        duration = float_or_none( +            xpath_text(item, ns('./jwplayer:duration'), 'duration')) + +        description = self._html_search_regex( +            r'(?s)<div class="leadtext">(.+?)</div>', +            webpage, 'description') + +        thumbnail = self._html_search_meta( +            'thumbnail', webpage, 'thumbnail') +        if thumbnail: +            thumbnail = compat_urlparse.urljoin(url, thumbnail) + +        return { +            'id': video_id, +            'url': streamer.replace('rtmpt', 'rtmp'), +            'play_path': 'mp4:%s' % video_file, +            'ext': 'flv', +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'uploader': uploader, +            'duration': duration, +        } diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index 1484ac0d2..da896caf1 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -50,9 +50,7 @@ class LetvIE(InfoExtractor):              'title': '与龙共舞 完整版',              'description': 'md5:7506a5eeb1722bb9d4068f85024e3986',          }, -        'params': { -            'cn_verification_proxy': 'http://proxy.uku.im:8888' -        }, +        'skip': 'Only available in China',      }]      @staticmethod diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index d8897eb90..7091f3335 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -20,7 +20,6 @@ class MiTeleIE(InfoExtractor):      _TESTS = [{          'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', -        'md5': '6a75fe9d0d3275bead0cb683c616fddb',          'info_dict': {              'id': '0fce117d',              'ext': 'mp4', @@ -29,6 +28,10 @@ class MiTeleIE(InfoExtractor):              'display_id': 'programa-144',              'duration': 2913,          }, +        'params': { +            # m3u8 download +            'skip_download': True, +        },      }]      def _real_extract(self, url): @@ -56,12 +59,14 @@ class MiTeleIE(InfoExtractor):              episode,              transform_source=strip_jsonp          ) +        formats = self._extract_m3u8_formats( +            token_info['tokenizedUrl'], episode, ext='mp4')          return {              'id': embed_data['videoId'],              'display_id': episode,              'title': info_el.find('title').text, -            'url': token_info['tokenizedUrl'], +            'formats': formats,              'description': get_element_by_attribute('class', 'text', webpage),              'thumbnail': info_el.find('thumb').text,              'duration': parse_duration(info_el.find('duration').text), diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index c10405f04..925967753 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -6,6 +6,7 @@ import re  from .common import InfoExtractor  from ..compat import (      compat_urllib_parse, +    compat_urlparse,  )  from ..utils import (      ExtractorError, @@ -16,7 +17,7 @@ from ..utils import (  class NaverIE(InfoExtractor):      _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)' -    _TEST = { +    _TESTS = [{          'url': 'http://tvcast.naver.com/v/81652',          'info_dict': {              'id': '81652', @@ -25,7 +26,18 @@ class NaverIE(InfoExtractor):              'description': '합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.',              'upload_date': '20130903',          }, -    } +    }, { +        'url': 'http://tvcast.naver.com/v/395837', +        'md5': '638ed4c12012c458fefcddfd01f173cd', +        'info_dict': { +            'id': '395837', +            'ext': 'mp4', +            'title': '9년이 지나도 아픈 기억, 전효성의 아버지', +            'description': 'md5:5bf200dcbf4b66eb1b350d1eb9c753f7', +            'upload_date': '20150519', +        }, +        'skip': 'Georestricted', +    }]      def _real_extract(self, url):          video_id = self._match_id(url) @@ -35,7 +47,7 @@ class NaverIE(InfoExtractor):                           webpage)          if m_id is None:              m_error = re.search( -                r'(?s)<div class="nation_error">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>', +                r'(?s)<div class="(?:nation_error|nation_box)">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>',                  webpage)              if m_error:                  raise ExtractorError(clean_html(m_error.group('msg')), expected=True) @@ -58,14 +70,18 @@ class NaverIE(InfoExtractor):          formats = []          for format_el in urls.findall('EncodingOptions/EncodingOption'):              domain = format_el.find('Domain').text +            uri = format_el.find('uri').text              f = { -                'url': domain + format_el.find('uri').text, +                'url': compat_urlparse.urljoin(domain, uri),                  'ext': 'mp4',                  'width': int(format_el.find('width').text),                  'height': int(format_el.find('height').text),              }              if domain.startswith('rtmp'): +                # urlparse does not support custom schemes +                # https://bugs.python.org/issue18828                  f.update({ +                    'url': domain + uri,                      'ext': 'flv',                      'rtmp_protocol': '1',  # rtmpt                  }) diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 862b706bf..944096e1c 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -22,6 +22,18 @@ class NBAIE(InfoExtractor):      }, {          'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/',          'only_matching': True, +    }, { +        'url': 'http://watch.nba.com/nba/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', +        'info_dict': { +            'id': '0041400301-cle-atl-recap.nba', +            'ext': 'mp4', +            'title': 'NBA GAME TIME | Video: Hawks vs. Cavaliers Game 1', +            'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d', +            'duration': 228, +        }, +        'params': { +            'skip_download': True, +        }      }]      def _real_extract(self, url): @@ -35,8 +47,12 @@ class NBAIE(InfoExtractor):              self._og_search_title(webpage, default=shortened_video_id), ' : NBA.com')          description = self._og_search_description(webpage) -        duration = parse_duration( -            self._html_search_meta('duration', webpage, 'duration')) +        duration_str = self._html_search_meta( +            'duration', webpage, 'duration', default=None) +        if not duration_str: +            duration_str = self._html_search_regex( +                r'Duration:</b>\s*(\d+:\d+)', webpage, 'duration', fatal=False) +        duration = parse_duration(duration_str)          return {              'id': shortened_video_id, diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py index 02dba4ef6..d1b7cff4c 100644 --- a/youtube_dl/extractor/nextmedia.py +++ b/youtube_dl/extractor/nextmedia.py @@ -89,8 +89,8 @@ class NextMediaActionNewsIE(NextMediaIE):          return self._extract_from_nextmedia_page(news_id, url, article_page) -class AppleDailyRealtimeNewsIE(NextMediaIE): -    _VALID_URL = r'http://(www|ent).appledaily.com.tw/(realtimenews|enews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' +class AppleDailyIE(NextMediaIE): +    _VALID_URL = r'http://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'      _TESTS = [{          'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694',          'md5': 'a843ab23d150977cc55ef94f1e2c1e4d', @@ -99,7 +99,7 @@ class AppleDailyRealtimeNewsIE(NextMediaIE):              'ext': 'mp4',              'title': '周亭羽走過摩鐵陰霾2男陪吃 九把刀孤寒看醫生',              'thumbnail': 're:^https?://.*\.jpg$', -            'description': 'md5:b23787119933404ce515c6356a8c355c', +            'description': 'md5:2acd430e59956dc47cd7f67cb3c003f4',              'upload_date': '20150128',          }      }, { @@ -110,26 +110,10 @@ class AppleDailyRealtimeNewsIE(NextMediaIE):              'ext': 'mp4',              'title': '不滿被踩腳 山東兩大媽一路打下車',              'thumbnail': 're:^https?://.*\.jpg$', -            'description': 'md5:2648aaf6fc4f401f6de35a91d111aa1d', +            'description': 'md5:175b4260c1d7c085993474217e4ab1b4',              'upload_date': '20150128',          } -    }] - -    _URL_PATTERN = r'\{url: \'(.+)\'\}' - -    def _fetch_title(self, page): -        return self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title') - -    def _fetch_thumbnail(self, page): -        return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False) - -    def _fetch_timestamp(self, page): -        return None - - -class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE): -    _VALID_URL = 'http://www.appledaily.com.tw/animation/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' -    _TESTS = [{ +    }, {          'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003671',          'md5': '03df296d95dedc2d5886debbb80cb43f',          'info_dict': { @@ -154,10 +138,22 @@ class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE):          'expected_warnings': [              'video thumbnail',          ] +    }, { +        'url': 'http://www.appledaily.com.tw/appledaily/article/supplement/20140417/35770334/', +        'only_matching': True,      }] +    _URL_PATTERN = r'\{url: \'(.+)\'\}' +      def _fetch_title(self, page): -        return self._html_search_meta('description', page, 'news title') +        return (self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title', default=None) or +                self._html_search_meta('description', page, 'news title')) + +    def _fetch_thumbnail(self, page): +        return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False) + +    def _fetch_timestamp(self, page): +        return None      def _fetch_description(self, page):          return self._html_search_meta('description', page, 'news description') diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py new file mode 100644 index 000000000..3f9c776ef --- /dev/null +++ b/youtube_dl/extractor/nova.py @@ -0,0 +1,179 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    clean_html, +    unified_strdate, +) + + +class NovaIE(InfoExtractor): +    IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz' +    _VALID_URL = 'http://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)' +    _TESTS = [{ +        'url': 'http://tvnoviny.nova.cz/clanek/novinky/co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou.html?utm_source=tvnoviny&utm_medium=cpfooter&utm_campaign=novaplus', +        'info_dict': { +            'id': '1608920', +            'display_id': 'co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou', +            'ext': 'flv', +            'title': 'Duel: Michal Hrdlička a Petr Suchoň', +            'description': 'md5:d0cc509858eee1b1374111c588c6f5d5', +            'thumbnail': 're:^https?://.*\.(?:jpg)', +        }, +        'params': { +            # rtmp download +            'skip_download': True, +        } +    }, { +        'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260', +        'md5': '1dd7b9d5ea27bc361f110cd855a19bd3', +        'info_dict': { +            'id': '1757139', +            'display_id': 'tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci', +            'ext': 'mp4', +            'title': 'Podzemní nemocnice v pražské Krči', +            'description': 'md5:f0a42dd239c26f61c28f19e62d20ef53', +            'thumbnail': 're:^https?://.*\.(?:jpg)', +        } +    }, { +        'url': 'http://novaplus.nova.cz/porad/policie-modrava/video/5591-policie-modrava-15-dil-blondynka-na-hrbitove', +        'info_dict': { +            'id': '1756825', +            'display_id': '5591-policie-modrava-15-dil-blondynka-na-hrbitove', +            'ext': 'flv', +            'title': 'Policie Modrava - 15. díl - Blondýnka na hřbitově', +            'description': 'md5:dc24e50be5908df83348e50d1431295e',  # Make sure this description is clean of html tags +            'thumbnail': 're:^https?://.*\.(?:jpg)', +        }, +        'params': { +            # rtmp download +            'skip_download': True, +        } +    }, { +        'url': 'http://novaplus.nova.cz/porad/televizni-noviny/video/5585-televizni-noviny-30-5-2015/', +        'info_dict': { +            'id': '1756858', +            'ext': 'flv', +            'title': 'Televizní noviny - 30. 5. 2015', +            'thumbnail': 're:^https?://.*\.(?:jpg)', +            'upload_date': '20150530', +        }, +        'params': { +            # rtmp download +            'skip_download': True, +        } +    }, { +        'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html', +        'info_dict': { +            'id': '1753621', +            'ext': 'mp4', +            'title': 'Zaklínač 3: Divoký hon', +            'description': 're:.*Pokud se stejně jako my nemůžete.*', +            'thumbnail': 're:https?://.*\.jpg(\?.*)?', +            'upload_date': '20150521', +        }, +        'params': { +            # rtmp download +            'skip_download': True, +        } +    }, { +        'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html', +        'only_matching': True, +    }, { +        'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html', +        'only_matching': True, +    }, { +        'url': 'http://doma.nova.cz/clanek/zdravi/prijdte-se-zapsat-do-registru-kostni-drene-jiz-ve-stredu-3-cervna.html', +        'only_matching': True, +    }, { +        'url': 'http://prask.nova.cz/clanek/novinky/co-si-na-sobe-nase-hvezdy-nechaly-pojistit.html', +        'only_matching': True, +    }, { +        'url': 'http://tv.nova.cz/clanek/novinky/zivot-je-zivot-bondovsky-trailer.html', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        display_id = mobj.group('id') +        site = mobj.group('site') + +        webpage = self._download_webpage(url, display_id) + +        video_id = self._search_regex( +            [r"(?:media|video_id)\s*:\s*'(\d+)'", +             r'media=(\d+)', +             r'id="article_video_(\d+)"', +             r'id="player_(\d+)"'], +            webpage, 'video id') + +        config_url = self._search_regex( +            r'src="(http://tn\.nova\.cz/bin/player/videojs/config\.php\?[^"]+)"', +            webpage, 'config url', default=None) + +        if not config_url: +            DEFAULT_SITE_ID = '23000' +            SITES = { +                'tvnoviny': DEFAULT_SITE_ID, +                'novaplus': DEFAULT_SITE_ID, +                'vymena': DEFAULT_SITE_ID, +                'krasna': DEFAULT_SITE_ID, +                'fanda': '30', +                'tn': '30', +                'doma': '30', +            } + +            site_id = self._search_regex( +                r'site=(\d+)', webpage, 'site id', default=None) or SITES.get(site, DEFAULT_SITE_ID) + +            config_url = ('http://tn.nova.cz/bin/player/videojs/config.php?site=%s&media=%s&jsVar=vjsconfig' +                          % (site_id, video_id)) + +        config = self._download_json( +            config_url, display_id, +            'Downloading config JSON', +            transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1]) + +        mediafile = config['mediafile'] +        video_url = mediafile['src'] + +        m = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+?))/&*(?P<playpath>.+)$', video_url) +        if m: +            formats = [{ +                'url': m.group('url'), +                'app': m.group('app'), +                'play_path': m.group('playpath'), +                'player_path': 'http://tvnoviny.nova.cz/static/shared/app/videojs/video-js.swf', +                'ext': 'flv', +            }] +        else: +            formats = [{ +                'url': video_url, +            }] +        self._sort_formats(formats) + +        title = mediafile.get('meta', {}).get('title') or self._og_search_title(webpage) +        description = clean_html(self._og_search_description(webpage, default=None)) +        thumbnail = config.get('poster') + +        if site == 'novaplus': +            upload_date = unified_strdate(self._search_regex( +                r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None)) +        elif site == 'fanda': +            upload_date = unified_strdate(self._search_regex( +                r'<span class="date_time">(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None)) +        else: +            upload_date = None + +        return { +            'id': video_id, +            'display_id': display_id, +            'title': title, +            'description': description, +            'upload_date': upload_date, +            'thumbnail': thumbnail, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py new file mode 100644 index 000000000..173e46cd8 --- /dev/null +++ b/youtube_dl/extractor/nowtv.py @@ -0,0 +1,192 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( +    ExtractorError, +    int_or_none, +    parse_iso8601, +    parse_duration, +    remove_start, +) + + +class NowTVIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?P<station>rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/player' + +    _TESTS = [{ +        # rtl +        'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/player', +        'info_dict': { +            'id': '203519', +            'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit', +            'ext': 'mp4', +            'title': 'Die neuen Bauern und eine Hochzeit', +            'description': 'md5:e234e1ed6d63cf06be5c070442612e7e', +            'thumbnail': 're:^https?://.*\.jpg$', +            'timestamp': 1432580700, +            'upload_date': '20150525', +            'duration': 2786, +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }, +    }, { +        # rtl2 +        'url': 'http://www.nowtv.de/rtl2/berlin-tag-nacht/berlin-tag-nacht-folge-934/player', +        'info_dict': { +            'id': '203481', +            'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934', +            'ext': 'mp4', +            'title': 'Berlin - Tag & Nacht (Folge 934)', +            'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0', +            'thumbnail': 're:^https?://.*\.jpg$', +            'timestamp': 1432666800, +            'upload_date': '20150526', +            'duration': 2641, +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }, +    }, { +        # rtlnitro +        'url': 'http://www.nowtv.de/rtlnitro/alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00/player', +        'info_dict': { +            'id': '165780', +            'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00', +            'ext': 'mp4', +            'title': 'Hals- und Beinbruch', +            'description': 'md5:b50d248efffe244e6f56737f0911ca57', +            'thumbnail': 're:^https?://.*\.jpg$', +            'timestamp': 1432415400, +            'upload_date': '20150523', +            'duration': 2742, +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }, +    }, { +        # superrtl +        'url': 'http://www.nowtv.de/superrtl/medicopter-117/angst/player', +        'info_dict': { +            'id': '99205', +            'display_id': 'medicopter-117/angst', +            'ext': 'mp4', +            'title': 'Angst!', +            'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e', +            'thumbnail': 're:^https?://.*\.jpg$', +            'timestamp': 1222632900, +            'upload_date': '20080928', +            'duration': 3025, +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }, +    }, { +        # ntv +        'url': 'http://www.nowtv.de/ntv/ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch/player', +        'info_dict': { +            'id': '203521', +            'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch', +            'ext': 'mp4', +            'title': 'Thema u.a.: Der erste Blick: Die Apple Watch', +            'description': 'md5:4312b6c9d839ffe7d8caf03865a531af', +            'thumbnail': 're:^https?://.*\.jpg$', +            'timestamp': 1432751700, +            'upload_date': '20150527', +            'duration': 1083, +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }, +    }, { +        # vox +        'url': 'http://www.nowtv.de/vox/der-hundeprofi/buero-fall-chihuahua-joel/player', +        'info_dict': { +            'id': '128953', +            'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel', +            'ext': 'mp4', +            'title': "Büro-Fall / Chihuahua 'Joel'", +            'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d', +            'thumbnail': 're:^https?://.*\.jpg$', +            'timestamp': 1432408200, +            'upload_date': '20150523', +            'duration': 3092, +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        display_id = mobj.group('id') +        station = mobj.group('station') + +        info = self._download_json( +            'https://api.nowtv.de/v3/movies/%s?fields=*,format,files' % display_id, +            display_id) + +        video_id = compat_str(info['id']) + +        files = info['files'] +        if not files: +            if info.get('geoblocked', False): +                raise ExtractorError( +                    'Video %s is not available from your location due to geo restriction' % video_id, +                    expected=True) +            if not info.get('free', True): +                raise ExtractorError( +                    'Video %s is not available for free' % video_id, expected=True) + +        f = info.get('format', {}) +        station = f.get('station') or station + +        STATIONS = { +            'rtl': 'rtlnow', +            'rtl2': 'rtl2now', +            'vox': 'voxnow', +            'nitro': 'rtlnitronow', +            'ntv': 'n-tvnow', +            'superrtl': 'superrtlnow' +        } + +        formats = [] +        for item in files['items']: +            item_path = remove_start(item['path'], '/') +            tbr = int_or_none(item['bitrate']) +            m3u8_url = 'http://hls.fra.%s.de/hls-vod-enc/%s.m3u8' % (STATIONS[station], item_path) +            m3u8_url = m3u8_url.replace('now/', 'now/videos/') +            formats.append({ +                'url': m3u8_url, +                'format_id': '%s-%sk' % (item['id'], tbr), +                'ext': 'mp4', +                'tbr': tbr, +            }) +        self._sort_formats(formats) + +        title = info['title'] +        description = info.get('articleLong') or info.get('articleShort') +        timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ') +        duration = parse_duration(info.get('duration')) +        thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo') + +        return { +            'id': video_id, +            'display_id': display_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'timestamp': timestamp, +            'duration': duration, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index fbc521d1a..6c7149fe3 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -2,6 +2,7 @@  from __future__ import unicode_literals  from .common import InfoExtractor +from ..compat import compat_urllib_parse  from ..utils import (      unified_strdate,      int_or_none, @@ -11,8 +12,9 @@ from ..utils import (  class OdnoklassnikiIE(InfoExtractor): -    _VALID_URL = r'https?://(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P<id>\d+)' +    _VALID_URL = r'https?://(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P<id>[\d-]+)'      _TESTS = [{ +        # metadata in JSON          'url': 'http://ok.ru/video/20079905452',          'md5': '8e24ad2da6f387948e7a7d44eb8668fe',          'info_dict': { @@ -20,11 +22,22 @@ class OdnoklassnikiIE(InfoExtractor):              'ext': 'mp4',              'title': 'Культура меняет нас (прекрасный ролик!))',              'duration': 100, -            'upload_date': '20141207',              'uploader_id': '330537914540',              'uploader': 'Виталий Добровольский',              'like_count': int, -            'age_limit': 0, +        }, +    }, { +        # metadataUrl +        'url': 'http://ok.ru/video/63567059965189-0', +        'md5': '9676cf86eff5391d35dea675d224e131', +        'info_dict': { +            'id': '63567059965189-0', +            'ext': 'mp4', +            'title': 'Девушка без комплексов ...', +            'duration': 191, +            'uploader_id': '534380003155', +            'uploader': 'Андрей Мещанинов', +            'like_count': int,          },      }, {          'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452', @@ -34,14 +47,23 @@ class OdnoklassnikiIE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url) -        webpage = self._download_webpage(url, video_id) +        webpage = self._download_webpage( +            'http://ok.ru/video/%s' % video_id, video_id)          player = self._parse_json(              unescapeHTML(self._search_regex(                  r'data-attributes="([^"]+)"', webpage, 'player')),              video_id) -        metadata = self._parse_json(player['flashvars']['metadata'], video_id) +        flashvars = player['flashvars'] + +        metadata = flashvars.get('metadata') +        if metadata: +            metadata = self._parse_json(metadata, video_id) +        else: +            metadata = self._download_json( +                compat_urllib_parse.unquote(flashvars['metadataUrl']), +                video_id, 'Downloading metadata JSON')          movie = metadata['movie']          title = movie['title'] @@ -53,11 +75,11 @@ class OdnoklassnikiIE(InfoExtractor):          uploader = author.get('name')          upload_date = unified_strdate(self._html_search_meta( -            'ya:ovs:upload_date', webpage, 'upload date')) +            'ya:ovs:upload_date', webpage, 'upload date', default=None))          age_limit = None          adult = self._html_search_meta( -            'ya:ovs:adult', webpage, 'age limit') +            'ya:ovs:adult', webpage, 'age limit', default=None)          if adult:              age_limit = 18 if adult == 'true' else 0 diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index c0e6d643d..a262a9f6d 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -12,50 +12,7 @@ from ..utils import (  ) -class OoyalaIE(InfoExtractor): -    _VALID_URL = r'(?:ooyala:|https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=)(?P<id>.+?)(&|$)' - -    _TESTS = [ -        { -            # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video -            'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', -            'info_dict': { -                'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', -                'ext': 'mp4', -                'title': 'Explaining Data Recovery from Hard Drives and SSDs', -                'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.', -            }, -        }, { -            # Only available for ipad -            'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0', -            'info_dict': { -                'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0', -                'ext': 'mp4', -                'title': 'Simulation Overview - Levels of Simulation', -                'description': '', -            }, -        }, -        { -            # Information available only through SAS api -            # From http://community.plm.automation.siemens.com/t5/News-NX-Manufacturing/Tool-Path-Divide/ba-p/4187 -            'url': 'http://player.ooyala.com/player.js?embedCode=FiOG81ZTrvckcchQxmalf4aQj590qTEx', -            'md5': 'a84001441b35ea492bc03736e59e7935', -            'info_dict': { -                'id': 'FiOG81ZTrvckcchQxmalf4aQj590qTEx', -                'ext': 'mp4', -                'title': 'Ooyala video', -            } -        } -    ] - -    @staticmethod -    def _url_for_embed_code(embed_code): -        return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code - -    @classmethod -    def _build_url_result(cls, embed_code): -        return cls.url_result(cls._url_for_embed_code(embed_code), -                              ie=cls.ie_key()) +class OoyalaBaseIE(InfoExtractor):      def _extract_result(self, info, more_info):          embedCode = info['embedCode'] @@ -77,11 +34,8 @@ class OoyalaIE(InfoExtractor):              'thumbnail': more_info['promo'],          } -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        embedCode = mobj.group('id') -        player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embedCode -        player = self._download_webpage(player_url, embedCode) +    def _extract(self, player_url, video_id): +        player = self._download_webpage(player_url, video_id)          mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="',                                          player, 'mobile player url')          # Looks like some videos are only available for particular devices @@ -94,7 +48,7 @@ class OoyalaIE(InfoExtractor):          devices.insert(0, 'unknown')          for device in devices:              mobile_player = self._download_webpage( -                '%s&device=%s' % (mobile_url, device), embedCode, +                '%s&device=%s' % (mobile_url, device), video_id,                  'Downloading mobile player JS for %s device' % device)              videos_info = self._search_regex(                  r'var streams=window.oo_testEnv\?\[\]:eval\("\((\[{.*?}\])\)"\);', @@ -105,10 +59,10 @@ class OoyalaIE(InfoExtractor):          if not videos_info:              formats = []              auth_data = self._download_json( -                'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?domain=www.example.org&supportedFormats=mp4,webm' % (embedCode, embedCode), -                embedCode) +                'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?domain=www.example.org&supportedFormats=mp4,webm' % (video_id, video_id), +                video_id) -            cur_auth_data = auth_data['authorization_data'][embedCode] +            cur_auth_data = auth_data['authorization_data'][video_id]              for stream in cur_auth_data['streams']:                  formats.append({ @@ -123,7 +77,7 @@ class OoyalaIE(InfoExtractor):                  })              if formats:                  return { -                    'id': embedCode, +                    'id': video_id,                      'formats': formats,                      'title': 'Ooyala video',                  } @@ -143,9 +97,100 @@ class OoyalaIE(InfoExtractor):              videos = [self._extract_result(info, more_info) for (info, more_info) in zip(videos_info, videos_more_info['lineup'])]              return {                  '_type': 'playlist', -                'id': embedCode, +                'id': video_id,                  'title': unescapeHTML(videos_more_info['title']),                  'entries': videos,              }          else:              return self._extract_result(videos_info[0], videos_more_info) + + +class OoyalaIE(OoyalaBaseIE): +    _VALID_URL = r'(?:ooyala:|https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=)(?P<id>.+?)(&|$)' + +    _TESTS = [ +        { +            # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video +            'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', +            'info_dict': { +                'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', +                'ext': 'mp4', +                'title': 'Explaining Data Recovery from Hard Drives and SSDs', +                'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.', +            }, +        }, { +            # Only available for ipad +            'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0', +            'info_dict': { +                'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0', +                'ext': 'mp4', +                'title': 'Simulation Overview - Levels of Simulation', +                'description': '', +            }, +        }, +        { +            # Information available only through SAS api +            # From http://community.plm.automation.siemens.com/t5/News-NX-Manufacturing/Tool-Path-Divide/ba-p/4187 +            'url': 'http://player.ooyala.com/player.js?embedCode=FiOG81ZTrvckcchQxmalf4aQj590qTEx', +            'md5': 'a84001441b35ea492bc03736e59e7935', +            'info_dict': { +                'id': 'FiOG81ZTrvckcchQxmalf4aQj590qTEx', +                'ext': 'mp4', +                'title': 'Ooyala video', +            } +        } +    ] + +    @staticmethod +    def _url_for_embed_code(embed_code): +        return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code + +    @classmethod +    def _build_url_result(cls, embed_code): +        return cls.url_result(cls._url_for_embed_code(embed_code), +                              ie=cls.ie_key()) + +    def _real_extract(self, url): +        embed_code = self._match_id(url) +        player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code +        return self._extract(player_url, embed_code) + + +class OoyalaExternalIE(OoyalaBaseIE): +    _VALID_URL = r'''(?x) +                    (?: +                        ooyalaexternal:| +                        https?://.+?\.ooyala\.com/.*?\bexternalId= +                    ) +                    (?P<partner_id>[^:]+) +                    : +                    (?P<id>.+) +                    (?: +                        :| +                        .*?&pcode= +                    ) +                    (?P<pcode>.+?) +                    (&|$) +                    ''' + +    _TEST = { +        'url': 'https://player.ooyala.com/player.js?externalId=espn:10365079&pcode=1kNG061cgaoolOncv54OAO1ceO-I&adSetCode=91cDU6NuXTGKz3OdjOxFdAgJVtQcKJnI&callback=handleEvents&hasModuleParams=1&height=968&playerBrandingId=7af3bd04449c444c964f347f11873075&targetReplaceId=videoPlayer&width=1656&wmode=opaque&allowScriptAccess=always', +        'info_dict': { +            'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG', +            'ext': 'mp4', +            'title': 'dm_140128_30for30Shorts___JudgingJewellv2', +            'description': '', +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        partner_id = mobj.group('partner_id') +        video_id = mobj.group('id') +        pcode = mobj.group('pcode') +        player_url = 'http://player.ooyala.com/player.js?externalId=%s:%s&pcode=%s' % (partner_id, video_id, pcode) +        return self._extract(player_url, video_id) diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py index f179ea200..6cdc2638b 100644 --- a/youtube_dl/extractor/patreon.py +++ b/youtube_dl/extractor/patreon.py @@ -87,7 +87,7 @@ class PatreonIE(InfoExtractor):              r'<div class="attach"><a target="_blank" href="([^"]+)">',              webpage, 'attachment URL', default=None)          embed = self._html_search_regex( -            r'<div id="watchCreation">\s*<iframe class="embedly-embed" src="([^"]+)"', +            r'<div[^>]+id="watchCreation"[^>]*>\s*<iframe[^>]+src="([^"]+)"',              webpage, 'embedded URL', default=None)          if attach_fn is not None: diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py new file mode 100644 index 000000000..72d1b2718 --- /dev/null +++ b/youtube_dl/extractor/porn91.py @@ -0,0 +1,71 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +from ..compat import compat_urllib_parse +from .common import InfoExtractor +from ..utils import ( +    parse_duration, +    int_or_none, +    ExtractorError, +) + + +class Porn91IE(InfoExtractor): +    IE_NAME = '91porn' +    _VALID_URL = r'(?:https?://)(?:www\.|)91porn\.com/.+?\?viewkey=(?P<id>[\w\d]+)' + +    _TEST = { +        'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134', +        'md5': '6df8f6d028bc8b14f5dbd73af742fb20', +        'info_dict': { +            'id': '7e42283b4f5ab36da134', +            'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!', +            'ext': 'mp4', +            'duration': 431, +        } +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        url = 'http://91porn.com/view_video.php?viewkey=%s' % video_id +        self._set_cookie('91porn.com', 'language', 'cn_CN') +        webpage = self._download_webpage(url, video_id, 'get HTML content') + +        if '作为游客,你每天只可观看10个视频' in webpage: +            raise ExtractorError('91 Porn says: Daily limit 10 videos exceeded', expected=True) + +        title = self._search_regex( +            r'<div id="viewvideo-title">([^<]+)</div>', webpage, 'title') +        title = title.replace('\n', '') + +        # get real url +        file_id = self._search_regex( +            r'so.addVariable\(\'file\',\'(\d+)\'', webpage, 'file id') +        sec_code = self._search_regex( +            r'so.addVariable\(\'seccode\',\'([^\']+)\'', webpage, 'sec code') +        max_vid = self._search_regex( +            r'so.addVariable\(\'max_vid\',\'(\d+)\'', webpage, 'max vid') +        url_params = compat_urllib_parse.urlencode({ +            'VID': file_id, +            'mp4': '1', +            'seccode': sec_code, +            'max_vid': max_vid, +        }) +        info_cn = self._download_webpage( +            'http://91porn.com/getfile.php?' + url_params, video_id, +            'get real video url') +        video_url = self._search_regex(r'file=([^&]+)&', info_cn, 'url') + +        duration = parse_duration(self._search_regex( +            r'时长:\s*</span>\s*(\d+:\d+)', webpage, 'duration', fatal=False)) + +        comment_count = int_or_none(self._search_regex( +            r'留言:\s*</span>\s*(\d+)', webpage, 'comment count', fatal=False)) + +        return { +            'id': video_id, +            'title': title, +            'url': video_url, +            'duration': duration, +            'comment_count': comment_count, +        } diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 0c8b731cf..daa284ea2 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -71,7 +71,8 @@ class PornHubIE(InfoExtractor):          video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))          if webpage.find('"encrypted":true') != -1: -            password = compat_urllib_parse.unquote_plus(self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password')) +            password = compat_urllib_parse.unquote_plus( +                self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))              video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))          formats = [] diff --git a/youtube_dl/extractor/pornovoisines.py b/youtube_dl/extractor/pornovoisines.py index 9688ed948..eba4dfbb3 100644 --- a/youtube_dl/extractor/pornovoisines.py +++ b/youtube_dl/extractor/pornovoisines.py @@ -34,7 +34,7 @@ class PornoVoisinesIE(InfoExtractor):              'duration': 120,              'view_count': int,              'average_rating': float, -            'categories': ['Débutante', 'Scénario', 'Sodomie'], +            'categories': ['Débutantes', 'Scénario', 'Sodomie'],              'age_limit': 18,          }      } @@ -71,7 +71,7 @@ class PornoVoisinesIE(InfoExtractor):          view_count = int_or_none(self._search_regex(              r'(\d+) vues', webpage, 'view count', fatal=False))          average_rating = self._search_regex( -            r'Note : (\d+,\d+)', webpage, 'average rating', fatal=False) +            r'Note\s*:\s*(\d+(?:,\d+)?)', webpage, 'average rating', fatal=False)          if average_rating:              average_rating = float_or_none(average_rating.replace(',', '.')) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 7cc799664..255d4abc1 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -17,7 +17,7 @@ from ..utils import (  class ProSiebenSat1IE(InfoExtractor):      IE_NAME = 'prosiebensat1'      IE_DESC = 'ProSiebenSat.1 Digital' -    _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|ran|the-voice-of-germany)\.de|fem\.com)/(?P<id>.+)' +    _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at)|ran\.de|fem\.com)/(?P<id>.+)'      _TESTS = [          { diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 13113820b..bafa81c21 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -9,7 +9,6 @@ from .common import InfoExtractor  from ..utils import (      strip_jsonp,      unescapeHTML, -    js_to_json,  )  from ..compat import compat_urllib_request @@ -19,17 +18,23 @@ class QQMusicIE(InfoExtractor):      _VALID_URL = r'http://y.qq.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)'      _TESTS = [{          'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD', -        'md5': 'bed90b6db2a7a7a7e11bc585f471f63a', +        'md5': '9ce1c1c8445f561506d2e3cfb0255705',          'info_dict': {              'id': '004295Et37taLD', -            'ext': 'm4a', +            'ext': 'mp3',              'title': '可惜没如果',              'upload_date': '20141227',              'creator': '林俊杰', -            'description': 'md5:4348ff1dd24036906baa7b6f973f8d30', +            'description': 'md5:d327722d0361576fde558f1ac68a7065',          }      }] +    _FORMATS = { +        'mp3-320': {'prefix': 'M800', 'ext': 'mp3', 'preference': 40, 'abr': 320}, +        'mp3-128': {'prefix': 'M500', 'ext': 'mp3', 'preference': 30, 'abr': 128}, +        'm4a': {'prefix': 'C200', 'ext': 'm4a', 'preference': 10} +    } +      # Reference: m_r_GetRUin() in top_player.js      # http://imgcache.gtimg.cn/music/portal_v3/y/top_player.js      @staticmethod @@ -60,6 +65,8 @@ class QQMusicIE(InfoExtractor):          lrc_content = self._html_search_regex(              r'<div class="content" id="lrc_content"[^<>]*>([^<>]+)</div>',              detail_info_page, 'LRC lyrics', default=None) +        if lrc_content: +            lrc_content = lrc_content.replace('\\n', '\n')          guid = self.m_r_get_ruin() @@ -67,11 +74,22 @@ class QQMusicIE(InfoExtractor):              'http://base.music.qq.com/fcgi-bin/fcg_musicexpress.fcg?json=3&guid=%s' % guid,              mid, note='Retrieve vkey', errnote='Unable to get vkey',              transform_source=strip_jsonp)['key'] -        song_url = 'http://cc.stream.qqmusic.qq.com/C200%s.m4a?vkey=%s&guid=%s&fromtag=0' % (mid, vkey, guid) + +        formats = [] +        for format_id, details in self._FORMATS.items(): +            formats.append({ +                'url': 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0' +                       % (details['prefix'], mid, details['ext'], vkey, guid), +                'format': format_id, +                'format_id': format_id, +                'preference': details['preference'], +                'abr': details.get('abr'), +            }) +        self._sort_formats(formats)          return {              'id': mid, -            'url': song_url, +            'formats': formats,              'title': song_name,              'upload_date': publish_time,              'creator': singer, @@ -179,60 +197,49 @@ class QQMusicToplistIE(QQPlaylistBaseIE):      _VALID_URL = r'http://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)'      _TESTS = [{ -        'url': 'http://y.qq.com/#type=toplist&p=global_12', +        'url': 'http://y.qq.com/#type=toplist&p=global_123',          'info_dict': { -            'id': 'global_12', -            'title': 'itunes榜', +            'id': 'global_123', +            'title': '美国iTunes榜',          },          'playlist_count': 10,      }, { -        'url': 'http://y.qq.com/#type=toplist&p=top_6', +        'url': 'http://y.qq.com/#type=toplist&p=top_3',          'info_dict': { -            'id': 'top_6', +            'id': 'top_3',              'title': 'QQ音乐巅峰榜·欧美', +            'description': 'QQ音乐巅峰榜·欧美根据用户收听行为自动生成,集结当下最流行的欧美新歌!:更新时间:每周四22点|统' +                           '计周期:一周(上周四至本周三)|统计对象:三个月内发行的欧美歌曲|统计数量:100首|统计算法:根据' +                           '歌曲在一周内的有效播放次数,由高到低取前100名(同一歌手最多允许5首歌曲同时上榜)|有效播放次数:' +                           '登录用户完整播放一首歌曲,记为一次有效播放;同一用户收听同一首歌曲,每天记录为1次有效播放'          },          'playlist_count': 100,      }, { -        'url': 'http://y.qq.com/#type=toplist&p=global_5', +        'url': 'http://y.qq.com/#type=toplist&p=global_106',          'info_dict': { -            'id': 'global_5', -            'title': '韩国mnet排行榜', +            'id': 'global_106', +            'title': '韩国Mnet榜',          },          'playlist_count': 50,      }] -    @staticmethod -    def strip_qq_jsonp(code): -        return js_to_json(re.sub(r'^MusicJsonCallback\((.*?)\)/\*.+?\*/$', r'\1', code)) -      def _real_extract(self, url):          list_id = self._match_id(url)          list_type, num_id = list_id.split("_") -        list_page = self._download_webpage( -            "http://y.qq.com/y/static/toplist/index/%s.html" % list_id, -            list_id, 'Download toplist page') - -        entries = [] -        if list_type == 'top': -            jsonp_url = "http://y.qq.com/y/static/toplist/json/top/%s/1.js" % num_id -        else: -            jsonp_url = "http://y.qq.com/y/static/toplist/json/global/%s/1_1.js" % num_id -          toplist_json = self._download_json( -            jsonp_url, list_id, note='Retrieve toplist json', -            errnote='Unable to get toplist json', transform_source=self.strip_qq_jsonp) - -        for song in toplist_json['l']: -            s = song['s'] -            song_mid = s.split("|")[20] -            entries.append(self.url_result( -                'http://y.qq.com/#type=song&mid=' + song_mid, 'QQMusic', -                song_mid)) +            'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg?type=%s&topid=%s&format=json' +            % (list_type, num_id), +            list_id, 'Download toplist page') -        list_name = self._html_search_regex( -            r'<h2 id="top_name">([^\']+)</h2>', list_page, 'top list name', -            default=None) +        entries = [ +            self.url_result( +                'http://y.qq.com/#type=song&mid=' + song['data']['songmid'], 'QQMusic', song['data']['songmid'] +            ) for song in toplist_json['songlist'] +        ] -        return self.playlist_result(entries, list_id, list_name) +        topinfo = toplist_json.get('topinfo', {}) +        list_name = topinfo.get('ListName') +        list_description = topinfo.get('info') +        return self.playlist_result(entries, list_id, list_name, list_description) diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index dce64e151..5a381d9ce 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -1,10 +1,11 @@  # coding: utf-8  from __future__ import unicode_literals -import re -import json -  from .common import InfoExtractor +from ..utils import ( +    int_or_none, +    unescapeHTML, +)  class RTBFIE(InfoExtractor): @@ -16,25 +17,24 @@ class RTBFIE(InfoExtractor):              'id': '1921274',              'ext': 'mp4',              'title': 'Les Diables au coeur (épisode 2)', -            'description': 'Football - Diables Rouges',              'duration': 3099, -            'timestamp': 1398456336, -            'upload_date': '20140425',          }      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url) -        page = self._download_webpage('https://www.rtbf.be/video/embed?id=%s' % video_id, video_id) +        webpage = self._download_webpage( +            'http://www.rtbf.be/video/embed?id=%s' % video_id, video_id) -        data = json.loads(self._html_search_regex( -            r'<div class="js-player-embed(?: player-embed)?" data-video="([^"]+)"', page, 'data video'))['data'] +        data = self._parse_json( +            unescapeHTML(self._search_regex( +                r'data-video="([^"]+)"', webpage, 'data video')), +            video_id)          video_url = data.get('downloadUrl') or data.get('url') -        if data['provider'].lower() == 'youtube': +        if data.get('provider').lower() == 'youtube':              return self.url_result(video_url, 'Youtube')          return { @@ -42,8 +42,8 @@ class RTBFIE(InfoExtractor):              'url': video_url,              'title': data['title'],              'description': data.get('description') or data.get('subtitle'), -            'thumbnail': data['thumbnail']['large'], +            'thumbnail': data.get('thumbnail'),              'duration': data.get('duration') or data.get('realDuration'), -            'timestamp': data['created'], -            'view_count': data['viewCount'], +            'timestamp': int_or_none(data.get('created')), +            'view_count': int_or_none(data.get('viewCount')),          } diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py deleted file mode 100644 index 785a8045e..000000000 --- a/youtube_dl/extractor/rtlnow.py +++ /dev/null @@ -1,174 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( -    ExtractorError, -    clean_html, -    unified_strdate, -    int_or_none, -) - - -class RTLnowIE(InfoExtractor): -    """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW""" -    _VALID_URL = r'''(?x) -                        (?:https?://)? -                        (?P<url> -                            (?P<domain> -                                rtl-now\.rtl\.de| -                                rtl2now\.rtl2\.de| -                                (?:www\.)?voxnow\.de| -                                (?:www\.)?rtlnitronow\.de| -                                (?:www\.)?superrtlnow\.de| -                                (?:www\.)?n-tvnow\.de) -                            /+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\? -                            (?:container_id|film_id)=(?P<video_id>[0-9]+)& -                            player=1(?:&season=[0-9]+)?(?:&.*)? -                        )''' - -    _TESTS = [ -        { -            'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1', -            'info_dict': { -                'id': '90419', -                'ext': 'flv', -                'title': 'Ahornallee - Folge 1 - Der Einzug', -                'description': 'md5:ce843b6b5901d9a7f7d04d1bbcdb12de', -                'upload_date': '20070416', -                'duration': 1685, -            }, -            'params': { -                'skip_download': True, -            }, -            'skip': 'Only works from Germany', -        }, -        { -            'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5', -            'info_dict': { -                'id': '69756', -                'ext': 'flv', -                'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.', -                'description': 'md5:3fb247005ed21a935ffc82b7dfa70cf0', -                'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg', -                'upload_date': '20120519', -                'duration': 1245, -            }, -            'params': { -                'skip_download': True, -            }, -            'skip': 'Only works from Germany', -        }, -        { -            'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17', -            'info_dict': { -                'id': '13883', -                'ext': 'flv', -                'title': 'Voxtours - Südafrika-Reporter II', -                'description': 'md5:de7f8d56be6fd4fed10f10f57786db00', -                'upload_date': '20090627', -                'duration': 1800, -            }, -            'params': { -                'skip_download': True, -            }, -        }, -        { -            'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1', -            'info_dict': { -                'id': '99205', -                'ext': 'flv', -                'title': 'Medicopter 117 - Angst!', -                'description': 're:^Im Therapiezentrum \'Sonnalm\' kommen durch eine Unachtsamkeit die für die B.handlung mit Phobikern gehaltenen Voglespinnen frei\. Eine Ausreißerin', -                'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg', -                'upload_date': '20080928', -                'duration': 2691, -            }, -            'params': { -                'skip_download': True, -            }, -        }, -        { -            'url': 'http://rtl-now.rtl.de/der-bachelor/folge-4.php?film_id=188729&player=1&season=5', -            'info_dict': { -                'id': '188729', -                'ext': 'flv', -                'upload_date': '20150204', -                'description': 'md5:5e1ce23095e61a79c166d134b683cecc', -                'title': 'Der Bachelor - Folge 4', -            } -        }, { -            'url': 'http://www.n-tvnow.de/deluxe-alles-was-spass-macht/thema-ua-luxushotel-fuer-vierbeiner.php?container_id=153819&player=1&season=0', -            'only_matching': True, -        }, -    ] - -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_page_url = 'http://%s/' % mobj.group('domain') -        video_id = mobj.group('video_id') - -        webpage = self._download_webpage('http://' + mobj.group('url'), video_id) - -        mobj = re.search(r'(?s)<div style="margin-left: 20px; font-size: 13px;">(.*?)<div id="playerteaser">', webpage) -        if mobj: -            raise ExtractorError(clean_html(mobj.group(1)), expected=True) - -        title = self._og_search_title(webpage) -        description = self._og_search_description(webpage) -        thumbnail = self._og_search_thumbnail(webpage, default=None) - -        upload_date = unified_strdate(self._html_search_meta('uploadDate', webpage, 'upload date')) - -        mobj = re.search(r'<meta itemprop="duration" content="PT(?P<seconds>\d+)S" />', webpage) -        duration = int(mobj.group('seconds')) if mobj else None - -        playerdata_url = self._html_search_regex( -            r"'playerdata': '(?P<playerdata_url>[^']+)'", webpage, 'playerdata_url') - -        playerdata = self._download_xml(playerdata_url, video_id, 'Downloading player data XML') - -        videoinfo = playerdata.find('./playlist/videoinfo') - -        formats = [] -        for filename in videoinfo.findall('filename'): -            mobj = re.search(r'(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>.+)', filename.text) -            if mobj: -                fmt = { -                    'url': mobj.group('url'), -                    'play_path': 'mp4:' + mobj.group('play_path'), -                    'page_url': video_page_url, -                    'player_url': video_page_url + 'includes/vodplayer.swf', -                } -            else: -                mobj = re.search(r'.*/(?P<hoster>[^/]+)/videos/(?P<play_path>.+)\.f4m', filename.text) -                if mobj: -                    fmt = { -                        'url': 'rtmpe://fms.rtl.de/' + mobj.group('hoster'), -                        'play_path': 'mp4:' + mobj.group('play_path'), -                        'page_url': url, -                        'player_url': video_page_url + 'includes/vodplayer.swf', -                    } -                else: -                    fmt = { -                        'url': filename.text, -                    } -            fmt.update({ -                'width': int_or_none(filename.get('width')), -                'height': int_or_none(filename.get('height')), -                'vbr': int_or_none(filename.get('bitrate')), -                'ext': 'flv', -            }) -            formats.append(fmt) - -        return { -            'id': video_id, -            'title': title, -            'description': description, -            'thumbnail': thumbnail, -            'upload_date': upload_date, -            'duration': duration, -            'formats': formats, -        } diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py index d0981115d..9fbe239d8 100644 --- a/youtube_dl/extractor/rts.py +++ b/youtube_dl/extractor/rts.py @@ -190,6 +190,7 @@ class RTSIE(InfoExtractor):                  'tbr': media['rate'] or extract_bitrate(media['url']),              } for media in info['media'] if media.get('rate')]) +        self._check_formats(formats, video_id)          self._sort_formats(formats)          return { diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 849300140..82cd98ac7 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -17,7 +17,7 @@ from ..utils import (  def _decrypt_url(png): -    encrypted_data = base64.b64decode(png) +    encrypted_data = base64.b64decode(png.encode('utf-8'))      text_index = encrypted_data.find(b'tEXt')      text_chunk = encrypted_data[text_index - 4:]      length = struct_unpack('!I', text_chunk[:4])[0] diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py index 55604637d..d9df06861 100644 --- a/youtube_dl/extractor/rutv.py +++ b/youtube_dl/extractor/rutv.py @@ -104,7 +104,7 @@ class RUTVIE(InfoExtractor):      @classmethod      def _extract_url(cls, webpage):          mobj = re.search( -            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.rutv\.ru/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage) +            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)          if mobj:              return mobj.group('url') diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py index b8775c2f9..d4bd1a0d7 100644 --- a/youtube_dl/extractor/sbs.py +++ b/youtube_dl/extractor/sbs.py @@ -1,7 +1,6 @@  # -*- coding: utf-8 -*-  from __future__ import unicode_literals -import json  import re  from .common import InfoExtractor  from ..utils import ( @@ -33,16 +32,18 @@ class SBSIE(InfoExtractor):      }]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url) +          webpage = self._download_webpage(url, video_id) -        release_urls_json = js_to_json(self._search_regex( +        player = self._search_regex(              r'(?s)playerParams\.releaseUrls\s*=\s*(\{.*?\n\});\n', -            webpage, '')) -        release_urls = json.loads(release_urls_json) -        theplatform_url = ( -            release_urls.get('progressive') or release_urls.get('standard')) +            webpage, 'player') +        player = re.sub(r"'\s*\+\s*[\da-zA-Z_]+\s*\+\s*'", '', player) + +        release_urls = self._parse_json(js_to_json(player), video_id) + +        theplatform_url = release_urls.get('progressive') or release_urls['standard']          title = remove_end(self._og_search_title(webpage), ' (The Feed)')          description = self._html_search_meta('description', webpage) @@ -52,7 +53,6 @@ class SBSIE(InfoExtractor):              '_type': 'url_transparent',              'id': video_id,              'url': theplatform_url, -              'title': title,              'description': description,              'thumbnail': thumbnail, diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py index d3b8a1be4..9c53704ea 100644 --- a/youtube_dl/extractor/senateisvp.py +++ b/youtube_dl/extractor/senateisvp.py @@ -48,7 +48,7 @@ class SenateISVPIE(InfoExtractor):          ["arch", "", "http://ussenate-f.akamaihd.net/"]      ]      _IE_NAME = 'senate.gov' -    _VALID_URL = r'http://www\.senate\.gov/isvp/\?(?P<qs>.+)' +    _VALID_URL = r'http://www\.senate\.gov/isvp/?\?(?P<qs>.+)'      _TESTS = [{          'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',          'info_dict': { @@ -72,12 +72,16 @@ class SenateISVPIE(InfoExtractor):              'ext': 'mp4',              'title': 'Integrated Senate Video Player'          } +    }, { +        # From http://www.c-span.org/video/?96791-1 +        'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715', +        'only_matching': True,      }]      @staticmethod      def _search_iframe_url(webpage):          mobj = re.search( -            r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/\?[^'\"]+)['\"]", +            r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]",              webpage)          if mobj:              return mobj.group('url') diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 26ced716e..9f3e944e7 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -47,7 +47,7 @@ class SharedIE(InfoExtractor):          video_url = self._html_search_regex(              r'data-url="([^"]+)"', video_page, 'video URL')          title = base64.b64decode(self._html_search_meta( -            'full:title', webpage, 'title')).decode('utf-8') +            'full:title', webpage, 'title').encode('utf-8')).decode('utf-8')          filesize = int_or_none(self._html_search_meta(              'full:size', webpage, 'file size', fatal=False))          thumbnail = self._html_search_regex( diff --git a/youtube_dl/extractor/sockshare.py b/youtube_dl/extractor/sockshare.py deleted file mode 100644 index b5fa6f1da..000000000 --- a/youtube_dl/extractor/sockshare.py +++ /dev/null @@ -1,83 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from ..compat import ( -    compat_urllib_parse, -    compat_urllib_request, -) -from ..utils import ( -    determine_ext, -    ExtractorError, -) - -from .common import InfoExtractor - - -class SockshareIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?sockshare\.com/file/(?P<id>[0-9A-Za-z]+)' -    _FILE_DELETED_REGEX = r'This file doesn\'t exist, or has been removed\.</div>' -    _TEST = { -        'url': 'http://www.sockshare.com/file/437BE28B89D799D7', -        'md5': '9d0bf1cfb6dbeaa8d562f6c97506c5bd', -        'info_dict': { -            'id': '437BE28B89D799D7', -            'title': 'big_buck_bunny_720p_surround.avi', -            'ext': 'avi', -        } -    } - -    def _real_extract(self, url): -        video_id = self._match_id(url) -        url = 'http://sockshare.com/file/%s' % video_id -        webpage = self._download_webpage(url, video_id) - -        if re.search(self._FILE_DELETED_REGEX, webpage) is not None: -            raise ExtractorError('Video %s does not exist' % video_id, -                                 expected=True) - -        confirm_hash = self._html_search_regex(r'''(?x)<input\s+ -            type="hidden"\s+ -            value="([^"]*)"\s+ -            name="hash" -            ''', webpage, 'hash') - -        fields = { -            "hash": confirm_hash.encode('utf-8'), -            "confirm": "Continue as Free User" -        } - -        post = compat_urllib_parse.urlencode(fields) -        req = compat_urllib_request.Request(url, post) -        # Apparently, this header is required for confirmation to work. -        req.add_header('Host', 'www.sockshare.com') -        req.add_header('Content-type', 'application/x-www-form-urlencoded') - -        webpage = self._download_webpage( -            req, video_id, 'Downloading video page') - -        video_url = self._html_search_regex( -            r'<a href="([^"]*)".+class="download_file_link"', -            webpage, 'file url') -        video_url = "http://www.sockshare.com" + video_url -        title = self._html_search_regex(( -            r'<h1>(.+)<strong>', -            r'var name = "([^"]+)";'), -            webpage, 'title', default=None) -        thumbnail = self._html_search_regex( -            r'<img\s+src="([^"]*)".+?name="bg"', -            webpage, 'thumbnail', default=None) - -        formats = [{ -            'format_id': 'sd', -            'url': video_url, -            'ext': determine_ext(title), -        }] - -        return { -            'id': video_id, -            'title': title, -            'thumbnail': thumbnail, -            'formats': formats, -        } diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index eab4adfca..29bd9ce6f 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -23,9 +23,7 @@ class SohuIE(InfoExtractor):              'ext': 'mp4',              'title': 'MV:Far East Movement《The Illest》',          }, -        'params': { -            'cn_verification_proxy': 'proxy.uku.im:8888' -        } +        'skip': 'On available in China',      }, {          'url': 'http://tv.sohu.com/20150305/n409385080.shtml',          'md5': 'ac9a5d322b4bf9ae184d53e4711e4f1a', diff --git a/youtube_dl/extractor/soompi.py b/youtube_dl/extractor/soompi.py new file mode 100644 index 000000000..5da66ca9e --- /dev/null +++ b/youtube_dl/extractor/soompi.py @@ -0,0 +1,146 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .crunchyroll import CrunchyrollIE + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( +    ExtractorError, +    int_or_none, +    remove_start, +    xpath_text, +) + + +class SoompiBaseIE(InfoExtractor): +    def _get_episodes(self, webpage, episode_filter=None): +        episodes = self._parse_json( +            self._search_regex( +                r'VIDEOS\s*=\s*(\[.+?\]);', webpage, 'episodes JSON'), +            None) +        return list(filter(episode_filter, episodes)) + + +class SoompiIE(SoompiBaseIE, CrunchyrollIE): +    IE_NAME = 'soompi' +    _VALID_URL = r'https?://tv\.soompi\.com/(?:en/)?watch/(?P<id>[0-9]+)' +    _TESTS = [{ +        'url': 'http://tv.soompi.com/en/watch/29235', +        'info_dict': { +            'id': '29235', +            'ext': 'mp4', +            'title': 'Episode 1096', +            'description': '2015-05-20' +        }, +        'params': { +            'skip_download': True, +        }, +    }] + +    def _get_episode(self, webpage, video_id): +        return self._get_episodes(webpage, lambda x: x['id'] == video_id)[0] + +    def _get_subtitles(self, config, video_id): +        sub_langs = {} +        for subtitle in config.findall('./{default}preload/subtitles/subtitle'): +            sub_langs[subtitle.attrib['id']] = subtitle.attrib['title'] + +        subtitles = {} +        for s in config.findall('./{default}preload/subtitle'): +            lang_code = sub_langs.get(s.attrib['id']) +            if not lang_code: +                continue +            sub_id = s.get('id') +            data = xpath_text(s, './data', 'data') +            iv = xpath_text(s, './iv', 'iv') +            if not id or not iv or not data: +                continue +            subtitle = self._decrypt_subtitles(data, iv, sub_id).decode('utf-8') +            subtitles[lang_code] = self._extract_subtitles(subtitle) +        return subtitles + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        try: +            webpage = self._download_webpage( +                url, video_id, 'Downloading episode page') +        except ExtractorError as ee: +            if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: +                webpage = ee.cause.read() +                block_message = self._html_search_regex( +                    r'(?s)<div class="block-message">(.+?)</div>', webpage, +                    'block message', default=None) +                if block_message: +                    raise ExtractorError(block_message, expected=True) +            raise + +        formats = [] +        config = None +        for format_id in re.findall(r'\?quality=([0-9a-zA-Z]+)', webpage): +            config = self._download_xml( +                'http://tv.soompi.com/en/show/_/%s-config.xml?mode=hls&quality=%s' % (video_id, format_id), +                video_id, 'Downloading %s XML' % format_id) +            m3u8_url = xpath_text( +                config, './{default}preload/stream_info/file', +                '%s m3u8 URL' % format_id) +            if not m3u8_url: +                continue +            formats.extend(self._extract_m3u8_formats( +                m3u8_url, video_id, 'mp4', m3u8_id=format_id)) +        self._sort_formats(formats) + +        episode = self._get_episode(webpage, video_id) + +        title = episode['name'] +        description = episode.get('description') +        duration = int_or_none(episode.get('duration')) + +        thumbnails = [{ +            'id': thumbnail_id, +            'url': thumbnail_url, +        } for thumbnail_id, thumbnail_url in episode.get('img_url', {}).items()] + +        subtitles = self.extract_subtitles(config, video_id) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnails': thumbnails, +            'duration': duration, +            'formats': formats, +            'subtitles': subtitles +        } + + +class SoompiShowIE(SoompiBaseIE): +    IE_NAME = 'soompi:show' +    _VALID_URL = r'https?://tv\.soompi\.com/en/shows/(?P<id>[0-9a-zA-Z\-_]+)' +    _TESTS = [{ +        'url': 'http://tv.soompi.com/en/shows/liar-game', +        'info_dict': { +            'id': 'liar-game', +            'title': 'Liar Game', +            'description': 'md5:52c02bce0c1a622a95823591d0589b66', +        }, +        'playlist_count': 14, +    }] + +    def _real_extract(self, url): +        show_id = self._match_id(url) + +        webpage = self._download_webpage( +            url, show_id, 'Downloading show page') + +        title = remove_start(self._og_search_title(webpage), 'SoompiTV | ') +        description = self._og_search_description(webpage) + +        entries = [ +            self.url_result('http://tv.soompi.com/en/watch/%s' % episode['id'], 'Soompi') +            for episode in self._get_episodes(webpage)] + +        return self.playlist_result(entries, show_id, title, description) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index b936202f6..06d6e6640 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -71,7 +71,7 @@ class SpankwireIE(InfoExtractor):              compat_urllib_parse.unquote,              re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*["\']([^"\']+)["\']', webpage)))          if webpage.find('flashvars\.encrypted = "true"') != -1: -            password = self._html_search_regex( +            password = self._search_regex(                  r'flashvars\.video_title = "([^"]+)',                  webpage, 'password').replace('+', ' ')              video_urls = list(map( diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py index 98cf92d89..359722ad6 100644 --- a/youtube_dl/extractor/spiegeltv.py +++ b/youtube_dl/extractor/spiegeltv.py @@ -51,9 +51,9 @@ class SpiegeltvIE(InfoExtractor):          is_wide = media_json['is_wide']          server_json = self._download_json( -            'http://www.spiegel.tv/streaming_servers/', video_id, -            note='Downloading server information') -        server = server_json[0]['endpoint'] +            'http://spiegeltv-prod-static.s3.amazonaws.com/projectConfigs/projectConfig.json', +            video_id, note='Downloading server information') +        server = server_json['streamingserver'][0]['endpoint']          thumbnails = []          for image in media_json['images']: @@ -76,5 +76,6 @@ class SpiegeltvIE(InfoExtractor):              'ext': 'm4v',              'description': description,              'duration': duration, -            'thumbnails': thumbnails +            'thumbnails': thumbnails, +            'rtmp_live': True,          } diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index becdf658f..86d509ae5 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -4,37 +4,36 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from ..compat import compat_urlparse  from ..utils import ( -    parse_duration, -    parse_iso8601, +    unified_strdate,  )  class SportBoxIE(InfoExtractor): -    _VALID_URL = r'https?://news\.sportbox\.ru/Vidy_sporta/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)' -    _TESTS = [ -        { -            'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S', -            'md5': 'ff56a598c2cf411a9a38a69709e97079', -            'info_dict': { -                'id': '80822', -                'ext': 'mp4', -                'title': 'Гонка 2  заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн', -                'description': 'md5:81715fa9c4ea3d9e7915dc8180c778ed', -                'thumbnail': 're:^https?://.*\.jpg$', -                'timestamp': 1411896237, -                'upload_date': '20140928', -                'duration': 4846, -            }, -            'params': { -                # m3u8 download -                'skip_download': True, -            }, -        }, { -            'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4', -            'only_matching': True, -        } -    ] +    _VALID_URL = r'https?://news\.sportbox\.ru/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)' +    _TESTS = [{ +        'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S', +        'md5': 'ff56a598c2cf411a9a38a69709e97079', +        'info_dict': { +            'id': '80822', +            'ext': 'mp4', +            'title': 'Гонка 2  заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн', +            'description': 'md5:3d72dc4a006ab6805d82f037fdc637ad', +            'thumbnail': 're:^https?://.*\.jpg$', +            'upload_date': '20140928', +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }, +    }, { +        'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4', +        'only_matching': True, +    }, { +        'url': 'http://news.sportbox.ru/video/no_ads/spbvideo_NI536574_V_Novorossijske_proshel_detskij_turnir_Pole_slavy_bojevoj?ci=211355', +        'only_matching': True, +    }]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) @@ -42,35 +41,75 @@ class SportBoxIE(InfoExtractor):          webpage = self._download_webpage(url, display_id) -        video_id = self._search_regex( -            r'src="/vdl/player/media/(\d+)"', webpage, 'video id') +        player = self._search_regex( +            r'src="/?(vdl/player/[^"]+)"', webpage, 'player') + +        title = self._html_search_regex( +            [r'"nodetitle"\s*:\s*"([^"]+)"', r'class="node-header_{1,2}title">([^<]+)'], +            webpage, 'title') +        description = self._og_search_description(webpage) or self._html_search_meta( +            'description', webpage, 'description') +        thumbnail = self._og_search_thumbnail(webpage) +        upload_date = unified_strdate(self._html_search_meta( +            'dateCreated', webpage, 'upload date')) + +        return { +            '_type': 'url_transparent', +            'url': compat_urlparse.urljoin(url, '/%s' % player), +            'display_id': display_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'upload_date': upload_date, +        } -        player = self._download_webpage( -            'http://news.sportbox.ru/vdl/player/media/%s' % video_id, -            display_id, 'Downloading player webpage') + +class SportBoxEmbedIE(InfoExtractor): +    _VALID_URL = r'https?://news\.sportbox\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)' +    _TESTS = [{ +        'url': 'http://news.sportbox.ru/vdl/player/ci/211355', +        'info_dict': { +            'id': '211355', +            'ext': 'mp4', +            'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»', +            'thumbnail': 're:^https?://.*\.jpg$', +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }, +    }, { +        'url': 'http://news.sportbox.ru/vdl/player?nid=370908&only_player=1&autostart=false&playeri=2&height=340&width=580', +        'only_matching': True, +    }] + +    @staticmethod +    def _extract_urls(webpage): +        return re.findall( +            r'<iframe[^>]+src="(https?://news\.sportbox\.ru/vdl/player[^"]+)"', +            webpage) + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id)          hls = self._search_regex( -            r"var\s+original_hls_file\s*=\s*'([^']+)'", player, 'hls file') +            r"sportboxPlayer\.jwplayer_common_params\.file\s*=\s*['\"]([^'\"]+)['\"]", +            webpage, 'hls file') -        formats = self._extract_m3u8_formats(hls, display_id, 'mp4') +        formats = self._extract_m3u8_formats(hls, video_id, 'mp4') -        title = self._html_search_regex( -            r'<h1 itemprop="name">([^<]+)</h1>', webpage, 'title') -        description = self._html_search_regex( -            r'(?s)<div itemprop="description">(.+?)</div>', webpage, 'description', fatal=False) -        thumbnail = self._og_search_thumbnail(webpage) -        timestamp = parse_iso8601(self._search_regex( -            r'<span itemprop="uploadDate">([^<]+)</span>', webpage, 'timestamp', fatal=False)) -        duration = parse_duration(self._html_search_regex( -            r'<meta itemprop="duration" content="PT([^"]+)">', webpage, 'duration', fatal=False)) +        title = self._search_regex( +            r'sportboxPlayer\.node_title\s*=\s*"([^"]+)"', webpage, 'title') + +        thumbnail = self._search_regex( +            r'sportboxPlayer\.jwplayer_common_params\.image\s*=\s*"([^"]+)"', +            webpage, 'thumbnail', default=None)          return {              'id': video_id, -            'display_id': display_id,              'title': title, -            'description': description,              'thumbnail': thumbnail, -            'timestamp': timestamp, -            'duration': duration,              'formats': formats,          } diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 56be52638..d1b7264b4 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -10,6 +10,7 @@ from .common import InfoExtractor  from ..utils import (      ExtractorError,      qualities, +    determine_ext,  )  from ..compat import compat_ord @@ -50,6 +51,17 @@ class TeamcocoIE(InfoExtractor):              'params': {                  'skip_download': True,  # m3u8 downloads              } +        }, { +            'url': 'http://teamcoco.com/video/full-episode-mon-6-1-joel-mchale-jake-tapper-and-musical-guest-courtney-barnett?playlist=x;eyJ0eXBlIjoidGFnIiwiaWQiOjl9', +            'info_dict': { +                'id': '89341', +                'ext': 'mp4', +                'title': 'Full Episode - Mon. 6/1 - Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett', +                'description': 'Guests: Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett', +            }, +            'params': { +                'skip_download': True,  # m3u8 downloads +            }          }      ]      _VIDEO_ID_REGEXES = ( @@ -108,10 +120,24 @@ class TeamcocoIE(InfoExtractor):          formats = []          get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p'])          for filed in data['files']: -            if filed['type'] == 'hls': -                formats.extend(self._extract_m3u8_formats( -                    filed['url'], video_id, ext='mp4')) +            if determine_ext(filed['url']) == 'm3u8': +                # compat_urllib_parse.urljoin does not work here +                if filed['url'].startswith('/'): +                    m3u8_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + filed['url'] +                else: +                    m3u8_url = filed['url'] +                m3u8_formats = self._extract_m3u8_formats( +                    m3u8_url, video_id, ext='mp4') +                for m3u8_format in m3u8_formats: +                    if m3u8_format not in formats: +                        formats.append(m3u8_format) +            elif determine_ext(filed['url']) == 'f4m': +                # TODO Correct f4m extraction +                continue              else: +                if filed['url'].startswith('/mp4:protected/'): +                    # TODO Correct extraction for these files +                    continue                  m_format = re.search(r'(\d+(k|p))\.mp4', filed['url'])                  if m_format is not None:                      format_id = m_format.group(1) diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index 251a68680..a0c744fd1 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -16,6 +16,10 @@ class TelecincoIE(MiTeleIE):              'title': 'Con Martín Berasategui, hacer un bacalao al ...',              'duration': 662,          }, +        'params': { +            # m3u8 download +            'skip_download': True, +        },      }, {          'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html',          'only_matching': True, diff --git a/youtube_dl/extractor/tenplay.py b/youtube_dl/extractor/tenplay.py index 466155ef8..f6694149b 100644 --- a/youtube_dl/extractor/tenplay.py +++ b/youtube_dl/extractor/tenplay.py @@ -2,6 +2,10 @@  from __future__ import unicode_literals  from .common import InfoExtractor +from ..utils import ( +    int_or_none, +    float_or_none, +)  class TenPlayIE(InfoExtractor): @@ -49,18 +53,23 @@ class TenPlayIE(InfoExtractor):              if protocol == 'rtmp':                  url = url.replace('&mp4:', '') +                tbr = int_or_none(rendition.get('encodingRate'), 1000) +              formats.append({ -                'format_id': '_'.join(['rtmp', rendition['videoContainer'].lower(), rendition['videoCodec'].lower()]), -                'width': rendition['frameWidth'], -                'height': rendition['frameHeight'], -                'tbr': rendition['encodingRate'] / 1024, -                'filesize': rendition['size'], +                'format_id': '_'.join( +                    ['rtmp', rendition['videoContainer'].lower(), +                     rendition['videoCodec'].lower(), '%sk' % tbr]), +                'width': int_or_none(rendition['frameWidth']), +                'height': int_or_none(rendition['frameHeight']), +                'tbr': tbr, +                'filesize': int_or_none(rendition['size']),                  'protocol': protocol,                  'ext': ext,                  'vcodec': rendition['videoCodec'].lower(),                  'container': rendition['videoContainer'].lower(),                  'url': url,              }) +        self._sort_formats(formats)          return {              'id': video_id, @@ -74,8 +83,8 @@ class TenPlayIE(InfoExtractor):                  'url': json['thumbnailURL']              }],              'thumbnail': json['videoStillURL'], -            'duration': json['length'] / 1000, -            'timestamp': float(json['creationDate']) / 1000, -            'uploader': json['customFields']['production_company_distributor'] if 'production_company_distributor' in json['customFields'] else 'TENplay', -            'view_count': json['playsTotal'] +            'duration': float_or_none(json.get('length'), 1000), +            'timestamp': float_or_none(json.get('creationDate'), 1000), +            'uploader': json.get('customFields', {}).get('production_company_distributor') or 'TENplay', +            'view_count': int_or_none(json.get('playsTotal')),          } diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 025d0877c..3a68eaa80 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -6,8 +6,8 @@ from .common import InfoExtractor  class TF1IE(InfoExtractor):      """TF1 uses the wat.tv player.""" -    _VALID_URL = r'http://(?:videos\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html' -    _TESTS = { +    _VALID_URL = r'http://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html' +    _TESTS = [{          'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',          'info_dict': {              'id': '10635995', @@ -32,7 +32,13 @@ class TF1IE(InfoExtractor):              # Sometimes wat serves the whole file with the --test option              'skip_download': True,          }, -    } +    }, { +        'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html', +        'only_matching': True, +    }, { +        'url': 'http://lci.tf1.fr/sept-a-huit/videos/sept-a-huit-du-24-mai-2015-8611550.html', +        'only_matching': True, +    }]      def _real_extract(self, url):          video_id = self._match_id(url) diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index d48cbbf14..c282865b2 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -10,26 +10,32 @@ from ..utils import (  class TNAFlixIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/(?P<cat_id>[\w-]+)/(?P<display_id>[\w-]+)/video(?P<id>\d+)' +    _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)'      _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>'      _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>'      _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"' -    _TEST = { -        'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878', -        'md5': 'ecf3498417d09216374fc5907f9c6ec0', -        'info_dict': { -            'id': '553878', -            'display_id': 'Carmella-Decesare-striptease', -            'ext': 'mp4', -            'title': 'Carmella Decesare - striptease', -            'description': '', -            'thumbnail': 're:https?://.*\.jpg$', -            'duration': 91, -            'age_limit': 18, +    _TESTS = [ +        { +            'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878', +            'md5': 'ecf3498417d09216374fc5907f9c6ec0', +            'info_dict': { +                'id': '553878', +                'display_id': 'Carmella-Decesare-striptease', +                'ext': 'mp4', +                'title': 'Carmella Decesare - striptease', +                'description': '', +                'thumbnail': 're:https?://.*\.jpg$', +                'duration': 91, +                'age_limit': 18, +            } +        }, +        { +            'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632', +            'only_matching': True,          } -    } +    ]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) @@ -45,9 +51,8 @@ class TNAFlixIE(InfoExtractor):          age_limit = self._rta_search(webpage) -        duration = self._html_search_meta('duration', webpage, 'duration', default=None) -        if duration: -            duration = parse_duration(duration[1:]) +        duration = parse_duration(self._html_search_meta( +            'duration', webpage, 'duration', default=None))          cfg_url = self._proto_relative_url(self._html_search_regex(              self._CONFIG_REGEX, webpage, 'flashvars.config'), 'http:') @@ -56,14 +61,15 @@ class TNAFlixIE(InfoExtractor):              cfg_url, display_id, note='Downloading metadata',              transform_source=fix_xml_ampersands) -        thumbnail = cfg_xml.find('./startThumb').text +        thumbnail = self._proto_relative_url( +            cfg_xml.find('./startThumb').text, 'http:')          formats = []          for item in cfg_xml.findall('./quality/item'):              video_url = re.sub('speed=\d+', 'speed=', item.find('videoLink').text)              format_id = item.find('res').text              fmt = { -                'url': video_url, +                'url': self._proto_relative_url(video_url, 'http:'),                  'format_id': format_id,              }              m = re.search(r'^(\d+)', format_id) diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index d73ad3762..6ca8840b0 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -47,7 +47,7 @@ class Tube8IE(InfoExtractor):          webpage = self._download_webpage(req, display_id)          flashvars = json.loads(self._html_search_regex( -            r'var flashvars\s*=\s*({.+?})', webpage, 'flashvars')) +            r'flashvars\s*=\s*({.+?})', webpage, 'flashvars'))          video_url = flashvars['video_url']          if flashvars.get('encrypted') is True: @@ -58,19 +58,19 @@ class Tube8IE(InfoExtractor):          thumbnail = flashvars.get('image_url')          title = self._html_search_regex( -            r'videotitle\s*=\s*"([^"]+)', webpage, 'title') +            r'videoTitle\s*=\s*"([^"]+)', webpage, 'title')          description = self._html_search_regex( -            r'>Description:</strong>(.+?)<', webpage, 'description', fatal=False) +            r'>Description:</strong>\s*(.+?)\s*<', webpage, 'description', fatal=False)          uploader = self._html_search_regex( -            r'<strong class="video-username">(?:<a href="[^"]+">)?([^<]+)(?:</a>)?</strong>', +            r'<span class="username">\s*(.+?)\s*<',              webpage, 'uploader', fatal=False)          like_count = int_or_none(self._html_search_regex( -            r"rupVar\s*=\s*'(\d+)'", webpage, 'like count', fatal=False)) +            r'rupVar\s*=\s*"(\d+)"', webpage, 'like count', fatal=False))          dislike_count = int_or_none(self._html_search_regex( -            r"rdownVar\s*=\s*'(\d+)'", webpage, 'dislike count', fatal=False)) +            r'rdownVar\s*=\s*"(\d+)"', webpage, 'dislike count', fatal=False))          view_count = self._html_search_regex( -            r'<strong>Views: </strong>([\d,\.]+)</li>', webpage, 'view count', fatal=False) +            r'<strong>Views: </strong>([\d,\.]+)\s*</li>', webpage, 'view count', fatal=False)          if view_count:              view_count = str_to_int(view_count)          comment_count = self._html_search_regex( diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py new file mode 100644 index 000000000..2c4b21807 --- /dev/null +++ b/youtube_dl/extractor/tubitv.py @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import codecs +import re + +from .common import InfoExtractor +from ..compat import ( +    compat_urllib_parse, +    compat_urllib_request +) +from ..utils import ( +    ExtractorError, +    int_or_none, +) + + +class TubiTvIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?tubitv\.com/video\?id=(?P<id>[0-9]+)' +    _LOGIN_URL = 'http://tubitv.com/login' +    _NETRC_MACHINE = 'tubitv' +    _TEST = { +        'url': 'http://tubitv.com/video?id=54411&title=The_Kitchen_Musical_-_EP01', +        'info_dict': { +            'id': '54411', +            'ext': 'mp4', +            'title': 'The Kitchen Musical - EP01', +            'thumbnail': 're:^https?://.*\.png$', +            'description': 'md5:37532716166069b353e8866e71fefae7', +            'duration': 2407, +        }, +        'params': { +            'skip_download': 'HLS download', +        }, +    } + +    def _login(self): +        (username, password) = self._get_login_info() +        if username is None: +            return +        self.report_login() +        form_data = { +            'username': username, +            'password': password, +        } +        payload = compat_urllib_parse.urlencode(form_data).encode('utf-8') +        request = compat_urllib_request.Request(self._LOGIN_URL, payload) +        request.add_header('Content-Type', 'application/x-www-form-urlencoded') +        login_page = self._download_webpage( +            request, None, False, 'Wrong login info') +        if not re.search(r'id="tubi-logout"', login_page): +            raise ExtractorError( +                'Login failed (invalid username/password)', expected=True) + +    def _real_initialize(self): +        self._login() + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) +        if re.search(r"<(?:DIV|div) class='login-required-screen'>", webpage): +            raise ExtractorError( +                'This video requires login, use --username and --password ' +                'options to provide account credentials.', expected=True) + +        title = self._og_search_title(webpage) +        description = self._og_search_description(webpage) +        thumbnail = self._og_search_thumbnail(webpage) +        duration = int_or_none(self._html_search_meta( +            'video:duration', webpage, 'duration')) + +        apu = self._search_regex(r"apu='([^']+)'", webpage, 'apu') +        m3u8_url = codecs.decode(apu, 'rot_13')[::-1] +        formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + +        return { +            'id': video_id, +            'title': title, +            'formats': formats, +            'thumbnail': thumbnail, +            'description': description, +            'duration': duration, +        } diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 828c808a6..e6218808f 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -28,6 +28,17 @@ class TumblrIE(InfoExtractor):              'description': 'md5:dba62ac8639482759c8eb10ce474586a',              'thumbnail': 're:http://.*\.jpg',          } +    }, { +        'url': 'http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching', +        'md5': 'de07e5211d60d4f3a2c3df757ea9f6ab', +        'info_dict': { +            'id': 'Wmur', +            'ext': 'mp4', +            'title': 'naked smoking & stretching', +            'upload_date': '20150506', +            'timestamp': 1430931613, +        }, +        'add_ie': ['Vidme'],      }]      def _real_extract(self, url): @@ -38,6 +49,12 @@ class TumblrIE(InfoExtractor):          url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)          webpage = self._download_webpage(url, video_id) +        vid_me_embed_url = self._search_regex( +            r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', +            webpage, 'vid.me embed', default=None) +        if vid_me_embed_url is not None: +            return self.url_result(vid_me_embed_url, 'Vidme') +          iframe_url = self._search_regex(              r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'',              webpage, 'iframe url') diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py index 4de0aac52..fad720b68 100644 --- a/youtube_dl/extractor/tutv.py +++ b/youtube_dl/extractor/tutv.py @@ -26,7 +26,7 @@ class TutvIE(InfoExtractor):          data_content = self._download_webpage(              'http://tu.tv/flvurl.php?codVideo=%s' % internal_id, video_id, 'Downloading video info') -        video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0]).decode('utf-8') +        video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0].encode('utf-8')).decode('utf-8')          return {              'id': internal_id, diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py new file mode 100644 index 000000000..fa338b936 --- /dev/null +++ b/youtube_dl/extractor/tv2.py @@ -0,0 +1,126 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    determine_ext, +    int_or_none, +    float_or_none, +    parse_iso8601, +    remove_end, +) + + +class TV2IE(InfoExtractor): +    _VALID_URL = 'http://(?:www\.)?tv2\.no/v/(?P<id>\d+)' +    _TEST = { +        'url': 'http://www.tv2.no/v/916509/', +        'md5': '9cb9e3410b18b515d71892f27856e9b1', +        'info_dict': { +            'id': '916509', +            'ext': 'flv', +            'title': 'Se Gryttens hyllest av Steven Gerrard', +            'description': 'TV 2 Sportens huspoet tar avskjed med Liverpools kaptein Steven Gerrard.', +            'timestamp': 1431715610, +            'upload_date': '20150515', +            'duration': 156.967, +            'view_count': int, +            'categories': list, +        } +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        formats = [] +        format_urls = [] +        for protocol in ('HDS', 'HLS'): +            data = self._download_json( +                'http://sumo.tv2.no/api/web/asset/%s/play.json?protocol=%s&videoFormat=SMIL+ISMUSP' % (video_id, protocol), +                video_id, 'Downloading play JSON')['playback'] +            for item in data['items']['item']: +                video_url = item.get('url') +                if not video_url or video_url in format_urls: +                    continue +                format_id = '%s-%s' % (protocol.lower(), item.get('mediaFormat')) +                if not self._is_valid_url(video_url, video_id, format_id): +                    continue +                format_urls.append(video_url) +                ext = determine_ext(video_url) +                if ext == 'f4m': +                    formats.extend(self._extract_f4m_formats( +                        video_url, video_id, f4m_id=format_id)) +                elif ext == 'm3u8': +                    formats.extend(self._extract_m3u8_formats( +                        video_url, video_id, 'mp4', m3u8_id=format_id)) +                elif ext == 'ism' or video_url.endswith('.ism/Manifest'): +                    pass +                else: +                    formats.append({ +                        'url': video_url, +                        'format_id': format_id, +                        'tbr': int_or_none(item.get('bitrate')), +                        'filesize': int_or_none(item.get('fileSize')), +                    }) +        self._sort_formats(formats) + +        asset = self._download_json( +            'http://sumo.tv2.no/api/web/asset/%s.json' % video_id, +            video_id, 'Downloading metadata JSON')['asset'] + +        title = asset['title'] +        description = asset.get('description') +        timestamp = parse_iso8601(asset.get('createTime')) +        duration = float_or_none(asset.get('accurateDuration') or asset.get('duration')) +        view_count = int_or_none(asset.get('views')) +        categories = asset.get('keywords', '').split(',') + +        thumbnails = [{ +            'id': thumbnail.get('@type'), +            'url': thumbnail.get('url'), +        } for _, thumbnail in asset.get('imageVersions', {}).items()] + +        return { +            'id': video_id, +            'url': video_url, +            'title': title, +            'description': description, +            'thumbnails': thumbnails, +            'timestamp': timestamp, +            'duration': duration, +            'view_count': view_count, +            'categories': categories, +            'formats': formats, +        } + + +class TV2ArticleIE(InfoExtractor): +    _VALID_URL = 'http://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)' +    _TESTS = [{ +        'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542', +        'info_dict': { +            'id': '6930542', +            'title': 'Russen hetses etter pingvintyveri – innrømmer å ha åpnet luken på buret', +            'description': 'md5:339573779d3eea3542ffe12006190954', +        }, +        'playlist_count': 2, +    }, { +        'url': 'http://www.tv2.no/a/6930542', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        playlist_id = self._match_id(url) + +        webpage = self._download_webpage(url, playlist_id) + +        entries = [ +            self.url_result('http://www.tv2.no/v/%s' % video_id, 'TV2') +            for video_id in re.findall(r'data-assetid="(\d+)"', webpage)] + +        title = remove_end(self._og_search_title(webpage), ' - TV2.no') +        description = remove_end(self._og_search_description(webpage), ' - TV2.no') + +        return self.playlist_result(entries, playlist_id, title, description) diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py index 102362b29..dc3a8334a 100644 --- a/youtube_dl/extractor/tvigle.py +++ b/youtube_dl/extractor/tvigle.py @@ -5,7 +5,9 @@ import re  from .common import InfoExtractor  from ..utils import ( +    ExtractorError,      float_or_none, +    int_or_none,      parse_age_limit,  ) @@ -24,22 +26,24 @@ class TvigleIE(InfoExtractor):                  'display_id': 'sokrat',                  'ext': 'flv',                  'title': 'Сократ', -                'description': 'md5:a05bd01be310074d5833efc6743be95e', +                'description': 'md5:d6b92ffb7217b4b8ebad2e7665253c17',                  'duration': 6586, -                'age_limit': 0, +                'age_limit': 12,              }, +            'skip': 'georestricted',          },          {              'url': 'http://www.tvigle.ru/video/vladimir-vysotskii/vedushchii-teleprogrammy-60-minut-ssha-o-vladimire-vysotskom/', -            'md5': 'd9012d7c7c598fe7a11d7fb46dc1f574', +            'md5': 'e7efe5350dd5011d0de6550b53c3ba7b',              'info_dict': {                  'id': '5142516', -                'ext': 'mp4', +                'ext': 'flv',                  'title': 'Ведущий телепрограммы «60 минут» (США) о Владимире Высоцком',                  'description': 'md5:027f7dc872948f14c96d19b4178428a4',                  'duration': 186.080,                  'age_limit': 0,              }, +            'skip': 'georestricted',          }, {              'url': 'https://cloud.tvigle.ru/video/5267604/',              'only_matching': True, @@ -54,7 +58,7 @@ class TvigleIE(InfoExtractor):          if not video_id:              webpage = self._download_webpage(url, display_id)              video_id = self._html_search_regex( -                r'<li class="video-preview current_playing" id="(\d+)">', +                r'class="video-preview current_playing" id="(\d+)">',                  webpage, 'video id')          video_data = self._download_json( @@ -62,21 +66,34 @@ class TvigleIE(InfoExtractor):          item = video_data['playlist']['items'][0] +        videos = item.get('videos') + +        error_message = item.get('errorMessage') +        if not videos and error_message: +            raise ExtractorError( +                '%s returned error: %s' % (self.IE_NAME, error_message), expected=True) +          title = item['title'] -        description = item['description'] -        thumbnail = item['thumbnail'] +        description = item.get('description') +        thumbnail = item.get('thumbnail')          duration = float_or_none(item.get('durationMilliseconds'), 1000)          age_limit = parse_age_limit(item.get('ageRestrictions'))          formats = []          for vcodec, fmts in item['videos'].items(): -            for quality, video_url in fmts.items(): +            for format_id, video_url in fmts.items(): +                if format_id == 'm3u8': +                    formats.extend(self._extract_m3u8_formats( +                        video_url, video_id, 'mp4', m3u8_id=vcodec)) +                    continue +                height = self._search_regex( +                    r'^(\d+)[pP]$', format_id, 'height', default=None)                  formats.append({                      'url': video_url, -                    'format_id': '%s-%s' % (vcodec, quality), +                    'format_id': '%s-%s' % (vcodec, format_id),                      'vcodec': vcodec, -                    'height': int(quality[:-1]), -                    'filesize': item['video_files_size'][vcodec][quality], +                    'height': int_or_none(height), +                    'filesize': int_or_none(item.get('video_files_size', {}).get(vcodec, {}).get(format_id)),                  })          self._sort_formats(formats) diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index 67e8bfea0..c1ee1decc 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -15,7 +15,7 @@ class TwentyFourVideoIE(InfoExtractor):      _TESTS = [          {              'url': 'http://www.24video.net/video/view/1044982', -            'md5': '48dd7646775690a80447a8dca6a2df76', +            'md5': 'd041af8b5b4246ea466226a0d6693345',              'info_dict': {                  'id': '1044982',                  'ext': 'mp4', @@ -54,7 +54,7 @@ class TwentyFourVideoIE(InfoExtractor):              webpage, 'upload date'))          uploader = self._html_search_regex( -            r'Загрузил\s*<a href="/jsecUser/movies/[^"]+" class="link">([^<]+)</a>', +            r'class="video-uploaded"[^>]*>\s*<a href="/jsecUser/movies/[^"]+"[^>]*>([^<]+)</a>',              webpage, 'uploader', fatal=False)          view_count = int_or_none(self._html_search_regex( diff --git a/youtube_dl/extractor/ultimedia.py b/youtube_dl/extractor/ultimedia.py index 96c809eaf..c4751050e 100644 --- a/youtube_dl/extractor/ultimedia.py +++ b/youtube_dl/extractor/ultimedia.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlparse  from ..utils import (      ExtractorError,      qualities, @@ -44,9 +45,9 @@ class UltimediaIE(InfoExtractor):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) -        deliver_url = self._search_regex( -            r'<iframe[^>]+src="(https?://(?:www\.)?ultimedia\.com/deliver/[^"]+)"', -            webpage, 'deliver URL') +        deliver_url = self._proto_relative_url(self._search_regex( +            r'<iframe[^>]+src="((?:https?:)?//(?:www\.)?ultimedia\.com/deliver/[^"]+)"', +            webpage, 'deliver URL'), compat_urllib_parse_urlparse(url).scheme + ':')          deliver_page = self._download_webpage(              deliver_url, video_id, 'Downloading iframe page') @@ -57,7 +58,8 @@ class UltimediaIE(InfoExtractor):          player = self._parse_json(              self._search_regex( -                r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on", deliver_page, 'player'), +                r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on", +                deliver_page, 'player'),              video_id)          quality = qualities(['flash', 'html5']) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index e6ee1e471..f38a72fde 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..utils import float_or_none +from ..utils import ( +    ExtractorError, +    float_or_none, +)  class VGTVIE(InfoExtractor): @@ -59,16 +62,16 @@ class VGTVIE(InfoExtractor):          },          {              # streamType: live -            'url': 'http://www.vgtv.no/#!/live/100015/direkte-her-kan-du-se-laksen-live-fra-suldalslaagen', +            'url': 'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla',              'info_dict': { -                'id': '100015', +                'id': '113063',                  'ext': 'flv', -                'title': 'DIREKTE: Her kan du se laksen live fra Suldalslågen!', -                'description': 'md5:9a60cc23fa349f761628924e56eeec2d', +                'title': 're:^DIREKTE: V75 fra Solvalla [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', +                'description': 'md5:b3743425765355855f88e096acc93231',                  'thumbnail': 're:^https?://.*\.jpg',                  'duration': 0, -                'timestamp': 1407423348, -                'upload_date': '20140807', +                'timestamp': 1432975582, +                'upload_date': '20150530',                  'view_count': int,              },              'params': { @@ -97,7 +100,12 @@ class VGTVIE(InfoExtractor):              % (host, video_id, HOST_WEBSITES[host]),              video_id, 'Downloading media JSON') +        if data.get('status') == 'inactive': +            raise ExtractorError( +                'Video %s is no longer available' % video_id, expected=True) +          streams = data['streamUrls'] +        stream_type = data.get('streamType')          formats = [] @@ -107,7 +115,8 @@ class VGTVIE(InfoExtractor):                  hls_url, video_id, 'mp4', m3u8_id='hls'))          hds_url = streams.get('hds') -        if hds_url: +        # wasLive hds are always 404 +        if hds_url and stream_type != 'wasLive':              formats.extend(self._extract_f4m_formats(                  hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18',                  video_id, f4m_id='hds')) @@ -135,13 +144,14 @@ class VGTVIE(InfoExtractor):          return {              'id': video_id, -            'title': data['title'], +            'title': self._live_title(data['title']),              'description': data['description'],              'thumbnail': data['images']['main'] + '?t[]=900x506q80',              'timestamp': data['published'],              'duration': float_or_none(data['duration'], 1000),              'view_count': data['displays'],              'formats': formats, +            'is_live': True if stream_type == 'live' else False,          } diff --git a/youtube_dl/extractor/videott.py b/youtube_dl/extractor/videott.py index ececc7ee0..591024ead 100644 --- a/youtube_dl/extractor/videott.py +++ b/youtube_dl/extractor/videott.py @@ -43,7 +43,7 @@ class VideoTtIE(InfoExtractor):          formats = [              { -                'url': base64.b64decode(res['u']).decode('utf-8'), +                'url': base64.b64decode(res['u'].encode('utf-8')).decode('utf-8'),                  'ext': 'flv',                  'format_id': res['l'],              } for res in settings['res'] if res['u'] diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index bd953fb4c..e0b55078b 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -10,7 +10,7 @@ from ..utils import (  class VidmeIE(InfoExtractor):      _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]+)' -    _TEST = { +    _TESTS = [{          'url': 'https://vid.me/QNB',          'md5': 'f42d05e7149aeaec5c037b17e5d3dc82',          'info_dict': { @@ -23,9 +23,14 @@ class VidmeIE(InfoExtractor):              'upload_date': '20140725',              'thumbnail': 're:^https?://.*\.jpg',          }, -    } +    }, { +        # From http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching +        'url': 'https://vid.me/e/Wmur', +        'only_matching': True, +    }]      def _real_extract(self, url): +        url = url.replace('vid.me/e/', 'vid.me/')          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py index 619039e51..15377097e 100644 --- a/youtube_dl/extractor/vier.py +++ b/youtube_dl/extractor/vier.py @@ -38,11 +38,14 @@ class VierIE(InfoExtractor):          webpage = self._download_webpage(url, display_id)          video_id = self._search_regex( -            r'"nid"\s*:\s*"(\d+)"', webpage, 'video id') +            [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'], +            webpage, 'video id')          application = self._search_regex( -            r'"application"\s*:\s*"([^"]+)"', webpage, 'application', default='vier_vod') +            [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'], +            webpage, 'application', default='vier_vod')          filename = self._search_regex( -            r'"filename"\s*:\s*"([^"]+)"', webpage, 'filename') +            [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'], +            webpage, 'filename')          playlist_url = 'http://vod.streamcloud.be/%s/mp4:_definst_/%s.mp4/playlist.m3u8' % (application, filename)          formats = self._extract_m3u8_formats(playlist_url, display_id, 'mp4') diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index cf6af1e5c..7f2fb1ca8 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -1,29 +1,65 @@  from __future__ import unicode_literals -import re +import time +import hmac +import hashlib +import itertools -from ..compat import ( -    compat_urlparse, -    compat_urllib_request, -)  from ..utils import (      ExtractorError, -    unescapeHTML, -    unified_strdate, -    US_RATINGS, -    determine_ext, -    mimetype2ext, +    int_or_none, +    parse_age_limit, +    parse_iso8601,  )  from .common import InfoExtractor -class VikiIE(InfoExtractor): -    IE_NAME = 'viki' +class VikiBaseIE(InfoExtractor): +    _VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/' +    _API_QUERY_TEMPLATE = '/v4/%sapp=%s&t=%s&site=www.viki.com' +    _API_URL_TEMPLATE = 'http://api.viki.io%s&sig=%s' + +    _APP = '65535a' +    _APP_VERSION = '2.2.5.1428709186' +    _APP_SECRET = '-$iJ}@p7!G@SyU/je1bEyWg}upLu-6V6-Lg9VD(]siH,r.,m-r|ulZ,U4LC/SeR)' + +    def _prepare_call(self, path, timestamp=None): +        path += '?' if '?' not in path else '&' +        if not timestamp: +            timestamp = int(time.time()) +        query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp) +        sig = hmac.new( +            self._APP_SECRET.encode('ascii'), +            query.encode('ascii'), +            hashlib.sha1 +        ).hexdigest() +        return self._API_URL_TEMPLATE % (query, sig) + +    def _call_api(self, path, video_id, note, timestamp=None): +        resp = self._download_json( +            self._prepare_call(path, timestamp), video_id, note) + +        error = resp.get('error') +        if error: +            if error == 'invalid timestamp': +                resp = self._download_json( +                    self._prepare_call(path, int(resp['current_timestamp'])), +                    video_id, '%s (retry)' % note) +                error = resp.get('error') +            if error: +                self._raise_error(resp['error']) + +        return resp -    # iPad2 -    _USER_AGENT = 'Mozilla/5.0(iPad; U; CPU OS 4_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8F191 Safari/6533.18.5' +    def _raise_error(self, error): +        raise ExtractorError( +            '%s returned error: %s' % (self.IE_NAME, error), +            expected=True) -    _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)' + +class VikiIE(VikiBaseIE): +    IE_NAME = 'viki' +    _VALID_URL = r'%s(?:videos|player)/(?P<id>[0-9]+v)' % VikiBaseIE._VALID_URL_BASE      _TESTS = [{          'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',          'info_dict': { @@ -37,111 +73,218 @@ class VikiIE(InfoExtractor):          },          'skip': 'Blocked in the US',      }, { +        # clip          'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference', -        'md5': 'ca6493e6f0a6ec07da9aa8d6304b4b2c', +        'md5': '86c0b5dbd4d83a6611a79987cc7a1989',          'info_dict': {              'id': '1067139v',              'ext': 'mp4', +            'title': "'The Avengers: Age of Ultron' Press Conference",              'description': 'md5:d70b2f9428f5488321bfe1db10d612ea', +            'duration': 352, +            'timestamp': 1430380829,              'upload_date': '20150430', -            'title': '\'The Avengers: Age of Ultron\' Press Conference', +            'uploader': 'Arirang TV', +            'like_count': int, +            'age_limit': 0,          }      }, {          'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi',          'info_dict': {              'id': '1048879v',              'ext': 'mp4', -            'upload_date': '20140820', -            'description': 'md5:54ff56d51bdfc7a30441ec967394e91c',              'title': 'Ankhon Dekhi', +            'duration': 6512, +            'timestamp': 1408532356, +            'upload_date': '20140820', +            'uploader': 'Spuul', +            'like_count': int, +            'age_limit': 13,          },          'params': { -            # requires ffmpeg +            # m3u8 download              'skip_download': True,          } +    }, { +        # episode +        'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', +        'md5': '190f3ef426005ba3a080a63325955bc3', +        'info_dict': { +            'id': '44699v', +            'ext': 'mp4', +            'title': 'Boys Over Flowers - Episode 1', +            'description': 'md5:52617e4f729c7d03bfd4bcbbb6e946f2', +            'duration': 4155, +            'timestamp': 1270496524, +            'upload_date': '20100405', +            'uploader': 'group8', +            'like_count': int, +            'age_limit': 13, +        } +    }, { +        # youtube external +        'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', +        'md5': '216d1afdc0c64d1febc1e9f2bd4b864b', +        'info_dict': { +            'id': '50562v', +            'ext': 'mp4', +            'title': 'Poor Nastya [COMPLETE] - Episode 1', +            'description': '', +            'duration': 607, +            'timestamp': 1274949505, +            'upload_date': '20101213', +            'uploader': 'ad14065n', +            'uploader_id': 'ad14065n', +            'like_count': int, +            'age_limit': 13, +        } +    }, { +        'url': 'http://www.viki.com/player/44699v', +        'only_matching': True,      }]      def _real_extract(self, url):          video_id = self._match_id(url) -        webpage = self._download_webpage(url, video_id) -        title = self._og_search_title(webpage) -        description = self._og_search_description(webpage) -        thumbnail = self._og_search_thumbnail(webpage) - -        uploader_m = re.search( -            r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage) -        if uploader_m is None: -            uploader = None -        else: -            uploader = uploader_m.group(1).strip() - -        rating_str = self._html_search_regex( -            r'<strong>Rating: </strong>\s*([^<]*)<', webpage, -            'rating information', default='').strip() -        age_limit = US_RATINGS.get(rating_str) - -        req = compat_urllib_request.Request( -            'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id) -        req.add_header('User-Agent', self._USER_AGENT) -        info_webpage = self._download_webpage( -            req, video_id, note='Downloading info page') -        err_msg = self._html_search_regex(r'<div[^>]+class="video-error[^>]+>(.+)</div>', info_webpage, 'error message', default=None) -        if err_msg: -            if 'not available in your region' in err_msg: -                raise ExtractorError( -                    'Video %s is blocked from your location.' % video_id, -                    expected=True) -            else: -                raise ExtractorError('Viki said: ' + err_msg) -        mobj = re.search( -            r'<source[^>]+type="(?P<mime_type>[^"]+)"[^>]+src="(?P<url>[^"]+)"', info_webpage) -        if not mobj: -            raise ExtractorError('Unable to find video URL') -        video_url = unescapeHTML(mobj.group('url')) -        video_ext = mimetype2ext(mobj.group('mime_type')) - -        if determine_ext(video_url) == 'm3u8': -            formats = self._extract_m3u8_formats( -                video_url, video_id, ext=video_ext) -        else: -            formats = [{ -                'url': video_url, -                'ext': video_ext, -            }] - -        upload_date_str = self._html_search_regex( -            r'"created_at":"([^"]+)"', info_webpage, 'upload date') -        upload_date = ( -            unified_strdate(upload_date_str) -            if upload_date_str is not None -            else None -        ) - -        # subtitles -        video_subtitles = self.extract_subtitles(video_id, info_webpage) - -        return { +        video = self._call_api( +            'videos/%s.json' % video_id, video_id, 'Downloading video JSON') + +        title = None +        titles = video.get('titles') +        if titles: +            title = titles.get('en') or titles[titles.keys()[0]] +        if not title: +            title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id +            container_titles = video.get('container', {}).get('titles') +            if container_titles: +                container_title = container_titles.get('en') or container_titles[container_titles.keys()[0]] +                title = '%s - %s' % (container_title, title) + +        descriptions = video.get('descriptions') +        description = descriptions.get('en') or descriptions[titles.keys()[0]] if descriptions else None + +        duration = int_or_none(video.get('duration')) +        timestamp = parse_iso8601(video.get('created_at')) +        uploader = video.get('author') +        like_count = int_or_none(video.get('likes', {}).get('count')) +        age_limit = parse_age_limit(video.get('rating')) + +        thumbnails = [] +        for thumbnail_id, thumbnail in video.get('images', {}).items(): +            thumbnails.append({ +                'id': thumbnail_id, +                'url': thumbnail.get('url'), +            }) + +        subtitles = {} +        for subtitle_lang, _ in video.get('subtitle_completions', {}).items(): +            subtitles[subtitle_lang] = [{ +                'ext': subtitles_format, +                'url': self._prepare_call( +                    'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)), +            } for subtitles_format in ('srt', 'vtt')] + +        result = {              'id': video_id,              'title': title, -            'formats': formats,              'description': description, -            'thumbnail': thumbnail, -            'age_limit': age_limit, +            'duration': duration, +            'timestamp': timestamp,              'uploader': uploader, -            'subtitles': video_subtitles, -            'upload_date': upload_date, +            'like_count': like_count, +            'age_limit': age_limit, +            'thumbnails': thumbnails, +            'subtitles': subtitles,          } -    def _get_subtitles(self, video_id, info_webpage): -        res = {} -        for sturl_html in re.findall(r'<track src="([^"]+)"', info_webpage): -            sturl = unescapeHTML(sturl_html) -            m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl) -            if not m: -                continue -            res[m.group('lang')] = [{ -                'url': compat_urlparse.urljoin('http://www.viki.com', sturl), -                'ext': 'vtt', -            }] -        return res +        streams = self._call_api( +            'videos/%s/streams.json' % video_id, video_id, +            'Downloading video streams JSON') + +        if 'external' in streams: +            result.update({ +                '_type': 'url_transparent', +                'url': streams['external']['url'], +            }) +            return result + +        formats = [] +        for format_id, stream_dict in streams.items(): +            height = self._search_regex( +                r'^(\d+)[pP]$', format_id, 'height', default=None) +            for protocol, format_dict in stream_dict.items(): +                if format_id == 'm3u8': +                    formats = self._extract_m3u8_formats( +                        format_dict['url'], video_id, 'mp4', m3u8_id='m3u8-%s' % protocol) +                else: +                    formats.append({ +                        'url': format_dict['url'], +                        'format_id': '%s-%s' % (format_id, protocol), +                        'height': height, +                    }) +        self._sort_formats(formats) + +        result['formats'] = formats +        return result + + +class VikiChannelIE(VikiBaseIE): +    IE_NAME = 'viki:channel' +    _VALID_URL = r'%s(?:tv|news|movies|artists)/(?P<id>[0-9]+c)' % VikiBaseIE._VALID_URL_BASE +    _TESTS = [{ +        'url': 'http://www.viki.com/tv/50c-boys-over-flowers', +        'info_dict': { +            'id': '50c', +            'title': 'Boys Over Flowers', +            'description': 'md5:ecd3cff47967fe193cff37c0bec52790', +        }, +        'playlist_count': 70, +    }, { +        'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete', +        'info_dict': { +            'id': '1354c', +            'title': 'Poor Nastya [COMPLETE]', +            'description': 'md5:05bf5471385aa8b21c18ad450e350525', +        }, +        'playlist_count': 127, +    }, { +        'url': 'http://www.viki.com/news/24569c-showbiz-korea', +        'only_matching': True, +    }, { +        'url': 'http://www.viki.com/movies/22047c-pride-and-prejudice-2005', +        'only_matching': True, +    }, { +        'url': 'http://www.viki.com/artists/2141c-shinee', +        'only_matching': True, +    }] + +    _PER_PAGE = 25 + +    def _real_extract(self, url): +        channel_id = self._match_id(url) + +        channel = self._call_api( +            'containers/%s.json' % channel_id, channel_id, +            'Downloading channel JSON') + +        titles = channel['titles'] +        title = titles.get('en') or titles[titles.keys()[0]] + +        descriptions = channel['descriptions'] +        description = descriptions.get('en') or descriptions[descriptions.keys()[0]] + +        entries = [] +        for video_type in ('episodes', 'clips', 'movies'): +            for page_num in itertools.count(1): +                page = self._call_api( +                    'containers/%s/%s.json?per_page=%d&sort=number&direction=asc&with_paging=true&page=%d' +                    % (channel_id, video_type, self._PER_PAGE, page_num), channel_id, +                    'Downloading %s JSON page #%d' % (video_type, page_num)) +                for video in page['response']: +                    video_id = video['id'] +                    entries.append(self.url_result( +                        'http://www.viki.com/videos/%s' % video_id, 'Viki')) +                if not page['pagination']['next']: +                    break + +        return self.playlist_result(entries, channel_id, title, description) diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py index c3fde53f5..a6d9b5fee 100644 --- a/youtube_dl/extractor/vuclip.py +++ b/youtube_dl/extractor/vuclip.py @@ -49,7 +49,7 @@ class VuClipIE(InfoExtractor):          links_code = self._search_regex(              r'''(?xs)                  (?: -                    <img\s+src="/im/play.gif".*?>| +                    <img\s+src="[^"]*/play.gif".*?>|                      <!--\ player\ end\ -->\s*</div><!--\ thumb\ end-->                  )                  (.*?) diff --git a/youtube_dl/extractor/vulture.py b/youtube_dl/extractor/vulture.py index 1eb24a3d6..faa167e65 100644 --- a/youtube_dl/extractor/vulture.py +++ b/youtube_dl/extractor/vulture.py @@ -44,7 +44,7 @@ class VultureIE(InfoExtractor):          query_webpage = self._download_webpage(              query_url, display_id, note='Downloading query page')          params_json = self._search_regex( -            r'(?sm)new MagnifyEmbeddablePlayer\({.*?contentItem:\s*(\{.*?\})\n,\n', +            r'(?sm)new MagnifyEmbeddablePlayer\({.*?contentItem:\s*(\{.*?\})\n?,\n',              query_webpage,              'player params')          params = json.loads(params_json) diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index d6dec25ca..f69d46a28 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -37,7 +37,8 @@ class WimpIE(InfoExtractor):          video_id = mobj.group(1)          webpage = self._download_webpage(url, video_id)          video_url = self._search_regex( -            r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", webpage, 'video URL') +            [r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", r"videoId\s*:\s*[\"']([^\"']+)[\"']"], +            webpage, 'video URL')          if YoutubeIE.suitable(video_url):              self.to_screen('Found YouTube video')              return { diff --git a/youtube_dl/extractor/xminus.py b/youtube_dl/extractor/xminus.py index 8c6241aed..7c9d8af6f 100644 --- a/youtube_dl/extractor/xminus.py +++ b/youtube_dl/extractor/xminus.py @@ -43,7 +43,7 @@ class XMinusIE(InfoExtractor):              r'minus_track\.dur_sec=\'([0-9]*?)\'',              webpage, 'duration', fatal=False))          filesize_approx = parse_filesize(self._html_search_regex( -            r'<div class="filesize[^"]*"></div>\s*([0-9.]+\s*[a-zA-Z][bB])', +            r'<div id="finfo"[^>]*>\s*↓\s*([0-9.]+\s*[a-zA-Z][bB])',              webpage, 'approximate filesize', fatal=False))          tbr = int_or_none(self._html_search_regex(              r'<div class="quality[^"]*"></div>\s*([0-9]+)\s*kbps', @@ -58,7 +58,7 @@ class XMinusIE(InfoExtractor):              description = re.sub(' *\r *', '\n', description)          enc_token = self._html_search_regex( -            r'minus_track\.tkn="(.+?)"', webpage, 'enc_token') +            r'minus_track\.s?tkn="(.+?)"', webpage, 'enc_token')          token = ''.join(              c if pos == 3 else compat_chr(compat_ord(c) - 1)              for pos, c in enumerate(reversed(enc_token))) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index bf4e659ac..f9afbdbab 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -15,6 +15,7 @@ from ..utils import (      unescapeHTML,      ExtractorError,      int_or_none, +    mimetype2ext,  )  from .nbc import NBCSportsVPlayerIE @@ -236,6 +237,22 @@ class YahooIE(InfoExtractor):          self._sort_formats(formats) +        closed_captions = self._html_search_regex( +            r'"closedcaptions":(\[[^\]]+\])', webpage, 'closed captions', +            default='[]') + +        cc_json = self._parse_json(closed_captions, video_id, fatal=False) +        subtitles = {} +        if cc_json: +            for closed_caption in cc_json: +                lang = closed_caption['lang'] +                if lang not in subtitles: +                    subtitles[lang] = [] +                subtitles[lang].append({ +                    'url': closed_caption['url'], +                    'ext': mimetype2ext(closed_caption['content_type']), +                }) +          return {              'id': video_id,              'display_id': display_id, @@ -244,6 +261,7 @@ class YahooIE(InfoExtractor):              'description': clean_html(meta['description']),              'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage),              'duration': int_or_none(meta.get('duration')), +            'subtitles': subtitles,          } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e58184adc..419f7b019 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -49,6 +49,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):              # YouTube sets the expire time to about two months              expire_time=time.time() + 2 * 30 * 24 * 3600) +    def _ids_to_results(self, ids): +        return [ +            self.url_result(vid_id, 'Youtube', video_id=vid_id) +            for vid_id in ids] +      def _login(self):          """          Attempt to log in to YouTube. @@ -1121,12 +1126,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                      self.report_warning(                          'Skipping DASH manifest: %r' % e, video_id)                  else: -                    # Hide the formats we found through non-DASH +                    # Remove the formats we found through non-DASH, they +                    # contain less info and it can be wrong, because we use +                    # fixed values (for example the resolution). See +                    # https://github.com/rg3/youtube-dl/issues/5774 for an +                    # example.                      dash_keys = set(df['format_id'] for df in dash_formats) -                    for f in formats: -                        if f['format_id'] in dash_keys: -                            f['format_id'] = 'nondash-%s' % f['format_id'] -                            f['preference'] = f.get('preference', 0) - 10000 +                    formats = [f for f in formats if f['format_id'] not in dash_keys]                      formats.extend(dash_formats)          # Check for malformed aspect ratio @@ -1261,11 +1267,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):      def _real_initialize(self):          self._login() -    def _ids_to_results(self, ids): -        return [ -            self.url_result(vid_id, 'Youtube', video_id=vid_id) -            for vid_id in ids] -      def _extract_mix(self, playlist_id):          # The mixes are generated from a single video          # the id of the playlist is just 'RD' + video_id @@ -1398,6 +1399,24 @@ class YoutubeChannelIE(InfoExtractor):          channel_id = self._match_id(url)          url = self._TEMPLATE_URL % channel_id + +        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778) +        # Workaround by extracting as a playlist if managed to obtain channel playlist URL +        # otherwise fallback on channel by page extraction +        channel_page = self._download_webpage( +            url + '?view=57', channel_id, +            'Downloading channel page', fatal=False) +        channel_playlist_id = self._html_search_meta( +            'channelId', channel_page, 'channel id', default=None) +        if not channel_playlist_id: +            channel_playlist_id = self._search_regex( +                r'data-channel-external-id="([^"]+)"', +                channel_page, 'channel id', default=None) +        if channel_playlist_id and channel_playlist_id.startswith('UC'): +            playlist_id = 'UU' + channel_playlist_id[2:] +            return self.url_result( +                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist') +          channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')          autogenerated = re.search(r'''(?x)                  class="[^"]*?(?: @@ -1601,20 +1620,10 @@ class YoutubeShowIE(InfoExtractor):  class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):      """ -    Base class for extractors that fetch info from -    http://www.youtube.com/feed_ajax +    Base class for feed extractors      Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.      """      _LOGIN_REQUIRED = True -    # use action_load_personal_feed instead of action_load_system_feed -    _PERSONAL_FEED = False - -    @property -    def _FEED_TEMPLATE(self): -        action = 'action_load_system_feed' -        if self._PERSONAL_FEED: -            action = 'action_load_personal_feed' -        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)      @property      def IE_NAME(self): @@ -1624,67 +1633,23 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):          self._login()      def _real_extract(self, url): -        feed_entries = [] -        paging = 0 -        for i in itertools.count(1): -            info = self._download_json( -                self._FEED_TEMPLATE % paging, -                '%s feed' % self._FEED_NAME, -                'Downloading page %s' % i, -                transform_source=uppercase_escape) -            feed_html = info.get('feed_html') or info.get('content_html') -            load_more_widget_html = info.get('load_more_widget_html') or feed_html -            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html) -            ids = orderedSet(m.group(1) for m in m_ids) -            feed_entries.extend( -                self.url_result(video_id, 'Youtube', video_id=video_id) -                for video_id in ids) -            mobj = re.search( -                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)', -                load_more_widget_html) -            if mobj is None: -                break -            paging = mobj.group('paging') -        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE) - - -class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): -    IE_NAME = 'youtube:recommended' -    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' -    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' -    _FEED_NAME = 'recommended' -    _PLAYLIST_TITLE = 'Youtube Recommended videos' - - -class YoutubeWatchLaterIE(YoutubePlaylistIE): -    IE_NAME = 'youtube:watchlater' -    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' -    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater' - -    _TESTS = []  # override PlaylistIE tests - -    def _real_extract(self, url): -        return self._extract_playlist('WL') - - -class YoutubeHistoryIE(YoutubePlaylistIE): -    IE_NAME = 'youtube:history' -    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' -    _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory' -    _TESTS = [] - -    def _real_extract(self, url): -        title = 'Youtube History' -        page = self._download_webpage('https://www.youtube.com/feed/history', title) +        page = self._download_webpage( +            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)          # The extraction process is the same as for playlists, but the regex          # for the video ids doesn't contain an index          ids = []          more_widget_html = content_html = page -          for page_num in itertools.count(1):              matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) -            new_ids = orderedSet(matches) + +            # 'recommended' feed has infinite 'load more' and each new portion spins +            # the same videos in (sometimes) slightly different order, so we'll check +            # for unicity and break when portion has no new videos +            new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches)) +            if not new_ids: +                break +              ids.extend(new_ids)              mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) @@ -1692,17 +1657,25 @@ class YoutubeHistoryIE(YoutubePlaylistIE):                  break              more = self._download_json( -                'https://youtube.com/%s' % mobj.group('more'), title, +                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,                  'Downloading page #%s' % page_num,                  transform_source=uppercase_escape)              content_html = more['content_html']              more_widget_html = more['load_more_widget_html'] -        return { -            '_type': 'playlist', -            'title': title, -            'entries': self._ids_to_results(ids), -        } +        return self.playlist_result( +            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE) + + +class YoutubeWatchLaterIE(YoutubePlaylistIE): +    IE_NAME = 'youtube:watchlater' +    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' +    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater' + +    _TESTS = []  # override PlaylistIE tests + +    def _real_extract(self, url): +        return self._extract_playlist('WL')  class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): @@ -1717,42 +1690,25 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):          return self.url_result(playlist_id, 'YoutubePlaylist') -class YoutubeSubscriptionsIE(YoutubePlaylistIE): -    IE_NAME = 'youtube:subscriptions' -    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' -    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' -    _TESTS = [] - -    def _real_extract(self, url): -        title = 'Youtube Subscriptions' -        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title) - -        # The extraction process is the same as for playlists, but the regex -        # for the video ids doesn't contain an index -        ids = [] -        more_widget_html = content_html = page +class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): +    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' +    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' +    _FEED_NAME = 'recommended' +    _PLAYLIST_TITLE = 'Youtube Recommended videos' -        for page_num in itertools.count(1): -            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) -            new_ids = orderedSet(matches) -            ids.extend(new_ids) -            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) -            if not mobj: -                break +class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): +    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' +    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' +    _FEED_NAME = 'subscriptions' +    _PLAYLIST_TITLE = 'Youtube Subscriptions' -            more = self._download_json( -                'https://youtube.com/%s' % mobj.group('more'), title, -                'Downloading page #%s' % page_num, -                transform_source=uppercase_escape) -            content_html = more['content_html'] -            more_widget_html = more['load_more_widget_html'] -        return { -            '_type': 'playlist', -            'title': title, -            'entries': self._ids_to_results(ids), -        } +class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): +    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' +    _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory' +    _FEED_NAME = 'history' +    _PLAYLIST_TITLE = 'Youtube History'  class YoutubeTruncatedURLIE(InfoExtractor): diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 22dbc3aec..5a2315bd9 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -537,7 +537,7 @@ def parseOpts(overrideArguments=None):      verbosity.add_option(          '--dump-pages', '--dump-intermediate-pages',          action='store_true', dest='dump_intermediate_pages', default=False, -        help='Print downloaded pages to debug problems (very verbose)') +        help='Print downloaded pages encoded using base64 to debug problems (very verbose)')      verbosity.add_option(          '--write-pages',          action='store_true', dest='write_pages', default=False, @@ -713,7 +713,7 @@ def parseOpts(overrideArguments=None):          help='Parse additional metadata like song title / artist from the video title. '               'The format syntax is the same as --output, '               'the parsed parameters replace existing values. ' -             'Additional templates: %(album), %(artist). ' +             'Additional templates: %(album)s, %(artist)s. '               'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like '               '"Coldplay - Paradise"')      postproc.add_option( diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py index 8f825f785..774494efd 100644 --- a/youtube_dl/postprocessor/embedthumbnail.py +++ b/youtube_dl/postprocessor/embedthumbnail.py @@ -49,7 +49,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor):              os.remove(encodeFilename(filename))              os.rename(encodeFilename(temp_filename), encodeFilename(filename)) -        elif info['ext'] == 'm4a': +        elif info['ext'] in ['m4a', 'mp4']:              if not check_executable('AtomicParsley', ['-v']):                  raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.') @@ -82,6 +82,6 @@ class EmbedThumbnailPP(FFmpegPostProcessor):                  os.remove(encodeFilename(filename))                  os.rename(encodeFilename(temp_filename), encodeFilename(filename))          else: -            raise EmbedThumbnailPPError('Only mp3 and m4a are supported for thumbnail embedding for now.') +            raise EmbedThumbnailPPError('Only mp3 and m4a/mp4 are supported for thumbnail embedding for now.')          return [], info diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ed9ed9ed6..52d198fa3 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1665,6 +1665,7 @@ def mimetype2ext(mt):      return {          'x-ms-wmv': 'wmv',          'x-mp4-fragmented': 'mp4', +        'ttml+xml': 'ttml',      }.get(res, res) @@ -1848,9 +1849,9 @@ def dfxp2srt(dfxp_data):          out = str_or_empty(node.text)          for child in node: -            if child.tag == _x('ttml:br'): +            if child.tag in (_x('ttml:br'), 'br'):                  out += '\n' + str_or_empty(child.tail) -            elif child.tag == _x('ttml:span'): +            elif child.tag in (_x('ttml:span'), 'span'):                  out += str_or_empty(parse_node(child))              else:                  out += str_or_empty(xml.etree.ElementTree.tostring(child)) @@ -1859,7 +1860,10 @@ def dfxp2srt(dfxp_data):      dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))      out = [] -    paras = dfxp.findall(_x('.//ttml:p')) +    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p') + +    if not paras: +        raise ValueError('Invalid dfxp/TTML subtitle')      for para, index in zip(paras, itertools.count(1)):          begin_time = parse_dfxp_time_expr(para.attrib['begin']) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 38f00bc9b..9cf84ff71 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@  from __future__ import unicode_literals -__version__ = '2015.05.15' +__version__ = '2015.06.04.1' | 
