diff options
61 files changed, 2080 insertions, 523 deletions
| @@ -128,3 +128,5 @@ Ping O.  Mister Hat  Peter Ding  jackyzy823 +George Brighton +Remita Amine @@ -54,6 +54,7 @@ which means you can modify it, redistribute it or use it however you like.      --dump-user-agent                Display the current browser identification      --list-extractors                List all supported extractors      --extractor-descriptions         Output descriptions of all supported extractors +    --force-generic-extractor        Force extraction to use the generic extractor      --default-search PREFIX          Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple".                                       Use the value "auto" to let youtube-dl guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The                                       default value "fixup_error" repairs broken URLs, but emits an error if this is not possible instead of searching. @@ -107,7 +108,7 @@ which means you can modify it, redistribute it or use it however you like.      --playlist-reverse               Download playlist videos in reverse order      --xattr-set-filesize             Set file xattribute ytdl.filesize with expected filesize (experimental)      --hls-prefer-native              Use the native HLS downloader instead of ffmpeg (experimental) -    --external-downloader COMMAND    Use the specified external downloader. Currently supports aria2c,curl,wget +    --external-downloader COMMAND    Use the specified external downloader. Currently supports aria2c,curl,httpie,wget      --external-downloader-args ARGS  Give these arguments to the external downloader  ## Filesystem Options: @@ -189,8 +190,8 @@ which means you can modify it, redistribute it or use it however you like.      --all-formats                    Download all available video formats      --prefer-free-formats            Prefer free video formats unless a specific one is requested      -F, --list-formats               List all available formats -    --youtube-skip-dash-manifest     Do not download the DASH manifest on YouTube videos -    --merge-output-format FORMAT     If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, webm, flv.Ignored if no +    --youtube-skip-dash-manifest     Do not download the DASH manifests and related data on YouTube videos +    --merge-output-format FORMAT     If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, webm, flv. Ignored if no                                       merge is required  ## Subtitle Options: @@ -379,7 +380,7 @@ In February 2015, the new YouTube player contained a character sequence in a str  ### HTTP Error 429: Too Many Requests or 402: Payment Required -These two error codes indicate that the service is blocking your IP address because of overuse. Contact the service and ask them to unblock your IP address, or - if you have acquired a whitelisted IP address already - use the [`--proxy` or `--network-address` options](#network-options) to select another IP address. +These two error codes indicate that the service is blocking your IP address because of overuse. Contact the service and ask them to unblock your IP address, or - if you have acquired a whitelisted IP address already - use the [`--proxy` or `--source-address` options](#network-options) to select another IP address.  ### SyntaxError: Non-ASCII character ### diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 220e52b98..687936103 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -17,6 +17,7 @@   - **AcademicEarth:Course**   - **AddAnime**   - **AdobeTV** + - **AdobeTVVideo**   - **AdultSwim**   - **Aftenposten**   - **Aftonbladet** @@ -110,6 +111,7 @@   - **dailymotion**   - **dailymotion:playlist**   - **dailymotion:user** + - **DailymotionCloud**   - **daum.net**   - **DBTV**   - **DctpTv** @@ -281,6 +283,7 @@   - **Motherless**   - **Motorsport**: motorsport.com   - **MovieClips** + - **MovieFap**   - **Moviezine**   - **movshare**: MovShare   - **MPORA** @@ -344,6 +347,7 @@   - **Odnoklassniki**   - **OktoberfestTV**   - **on.aol.com** + - **OnionStudios**   - **Ooyala**   - **OoyalaExternal**   - **OpenFilm** @@ -357,6 +361,7 @@   - **PhilharmonieDeParis**: Philharmonie de Paris   - **Phoenix**   - **Photobucket** + - **Pinkbike**   - **Pladform**   - **PlanetaPlay**   - **play.fm** @@ -436,6 +441,8 @@   - **smotri:broadcast**: Smotri.com broadcasts   - **smotri:community**: Smotri.com community videos   - **smotri:user**: Smotri.com user videos + - **SnagFilms** + - **SnagFilmsEmbed**   - **Snotr**   - **Sohu**   - **soompi** @@ -498,6 +505,7 @@   - **TheOnion**   - **ThePlatform**   - **TheSixtyOne** + - **ThisAmericanLife**   - **ThisAV**   - **THVideo**   - **THVideoPlaylist** @@ -538,6 +546,7 @@   - **twitch:stream**   - **twitch:video**   - **twitch:vod** + - **TwitterCard**   - **Ubu**   - **udemy**   - **udemy:course** @@ -612,6 +621,7 @@   - **XBef**   - **XboxClips**   - **XHamster** + - **XHamsterEmbed**   - **XMinus**   - **XNXX**   - **Xstream** @@ -628,7 +638,7 @@   - **YesJapan**   - **Ynet**   - **YouJizz** - - **Youku** + - **youku**   - **YouPorn**   - **YourUpload**   - **youtube**: YouTube.com diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 6e4b6f566..411de9ac9 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -139,6 +139,7 @@ class YoutubeDL(object):      outtmpl:           Template for output names.      restrictfilenames: Do not allow "&" and spaces in file names      ignoreerrors:      Do not stop on download errors. +    force_generic_extractor: Force downloader to use the generic extractor      nooverwrites:      Prevent overwriting files.      playliststart:     Playlist item to start at.      playlistend:       Playlist item to end at. @@ -626,13 +627,16 @@ class YoutubeDL(object):              info_dict.setdefault(key, value)      def extract_info(self, url, download=True, ie_key=None, extra_info={}, -                     process=True): +                     process=True, force_generic_extractor=False):          '''          Returns a list with a dictionary for each video we find.          If 'download', also downloads the videos.          extra_info is a dict containing the extra values to add to each result          ''' +        if not ie_key and force_generic_extractor: +            ie_key = 'Generic' +          if ie_key:              ies = [self.get_info_extractor(ie_key)]          else: @@ -1004,7 +1008,7 @@ class YoutubeDL(object):                  t.get('preference'), t.get('width'), t.get('height'),                  t.get('id'), t.get('url')))              for i, t in enumerate(thumbnails): -                if 'width' in t and 'height' in t: +                if t.get('width') and t.get('height'):                      t['resolution'] = '%dx%d' % (t['width'], t['height'])                  if t.get('id') is None:                      t['id'] = '%d' % i @@ -1493,7 +1497,8 @@ class YoutubeDL(object):          for url in url_list:              try:                  # It also downloads the videos -                res = self.extract_info(url) +                res = self.extract_info( +                    url, force_generic_extractor=self.params.get('force_generic_extractor', False))              except UnavailableVideoError:                  self.report_error('unable to download video')              except MaxDownloadsReached: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index ace17857c..215b616de 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -293,6 +293,7 @@ def _real_main(argv=None):          'autonumber_size': opts.autonumber_size,          'restrictfilenames': opts.restrictfilenames,          'ignoreerrors': opts.ignoreerrors, +        'force_generic_extractor': opts.force_generic_extractor,          'ratelimit': opts.ratelimit,          'nooverwrites': opts.nooverwrites,          'retries': opts_retries, diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 7ca2d3143..1d5cc9904 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -109,6 +109,14 @@ class Aria2cFD(ExternalFD):          cmd += ['--', info_dict['url']]          return cmd + +class HttpieFD(ExternalFD): +    def _make_cmd(self, tmpfilename, info_dict): +        cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']] +        for key, val in info_dict['http_headers'].items(): +            cmd += ['%s:%s' % (key, val)] +        return cmd +  _BY_NAME = dict(      (klass.get_basename(), klass)      for name, klass in globals().items() @@ -123,5 +131,6 @@ def list_external_downloaders():  def get_external_downloader(external_downloader):      """ Given the name of the executable, see whether we support the given          downloader . """ -    bn = os.path.basename(external_downloader) +    # Drop .exe extension on Windows +    bn = os.path.splitext(os.path.basename(external_downloader))[0]      return _BY_NAME[bn] diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6fdaf90b2..f7c1f07a4 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -4,7 +4,10 @@ from .abc import ABCIE  from .abc7news import Abc7NewsIE  from .academicearth import AcademicEarthCourseIE  from .addanime import AddAnimeIE -from .adobetv import AdobeTVIE +from .adobetv import ( +    AdobeTVIE, +    AdobeTVVideoIE, +)  from .adultswim import AdultSwimIE  from .aftenposten import AftenpostenIE  from .aftonbladet import AftonbladetIE @@ -103,6 +106,7 @@ from .dailymotion import (      DailymotionIE,      DailymotionPlaylistIE,      DailymotionUserIE, +    DailymotionCloudIE,  )  from .daum import DaumIE  from .dbtv import DBTVIE @@ -140,7 +144,6 @@ from .ellentv import (  )  from .elpais import ElPaisIE  from .embedly import EmbedlyIE -from .empflix import EMPFlixIE  from .engadget import EngadgetIE  from .eporner import EpornerIE  from .eroprofile import EroProfileIE @@ -384,6 +387,7 @@ from .nytimes import (  from .nuvid import NuvidIE  from .odnoklassniki import OdnoklassnikiIE  from .oktoberfesttv import OktoberfestTVIE +from .onionstudios import OnionStudiosIE  from .ooyala import (      OoyalaIE,      OoyalaExternalIE, @@ -401,6 +405,7 @@ from .pbs import PBSIE  from .philharmoniedeparis import PhilharmonieDeParisIE  from .phoenix import PhoenixIE  from .photobucket import PhotobucketIE +from .pinkbike import PinkbikeIE  from .planetaplay import PlanetaPlayIE  from .pladform import PladformIE  from .played import PlayedIE @@ -427,6 +432,7 @@ from .qqmusic import (      QQMusicSingerIE,      QQMusicAlbumIE,      QQMusicToplistIE, +    QQMusicPlaylistIE,  )  from .quickvid import QuickVidIE  from .r7 import R7IE @@ -487,6 +493,10 @@ from .smotri import (      SmotriUserIE,      SmotriBroadcastIE,  ) +from .snagfilms import ( +    SnagFilmsIE, +    SnagFilmsEmbedIE, +)  from .snotr import SnotrIE  from .sohu import SohuIE  from .soompi import ( @@ -560,6 +570,7 @@ from .tf1 import TF1IE  from .theonion import TheOnionIE  from .theplatform import ThePlatformIE  from .thesixtyone import TheSixtyOneIE +from .thisamericanlife import ThisAmericanLifeIE  from .thisav import ThisAVIE  from .tinypic import TinyPicIE  from .tlc import TlcIE, TlcDeIE @@ -567,7 +578,11 @@ from .tmz import (      TMZIE,      TMZArticleIE,  ) -from .tnaflix import TNAFlixIE +from .tnaflix import ( +    TNAFlixIE, +    EMPFlixIE, +    MovieFapIE, +)  from .thvideo import (      THVideoIE,      THVideoPlaylistIE @@ -611,6 +626,7 @@ from .twitch import (      TwitchBookmarksIE,      TwitchStreamIE,  ) +from .twitter import TwitterCardIE  from .ubu import UbuIE  from .udemy import (      UdemyIE, @@ -696,7 +712,10 @@ from .wrzuta import WrzutaIE  from .wsj import WSJIE  from .xbef import XBefIE  from .xboxclips import XboxClipsIE -from .xhamster import XHamsterIE +from .xhamster import ( +    XHamsterIE, +    XHamsterEmbedIE, +)  from .xminus import XMinusIE  from .xnxx import XNXXIE  from .xstream import XstreamIE diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py index 97d128560..5e43adc51 100644 --- a/youtube_dl/extractor/adobetv.py +++ b/youtube_dl/extractor/adobetv.py @@ -5,6 +5,8 @@ from ..utils import (      parse_duration,      unified_strdate,      str_to_int, +    float_or_none, +    ISO639Utils,  ) @@ -69,3 +71,61 @@ class AdobeTVIE(InfoExtractor):              'view_count': view_count,              'formats': formats,          } + + +class AdobeTVVideoIE(InfoExtractor): +    _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)' + +    _TEST = { +        # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners +        'url': 'https://video.tv.adobe.com/v/2456/', +        'md5': '43662b577c018ad707a63766462b1e87', +        'info_dict': { +            'id': '2456', +            'ext': 'mp4', +            'title': 'New experience with Acrobat DC', +            'description': 'New experience with Acrobat DC', +            'duration': 248.667, +        }, +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        player_params = self._parse_json(self._search_regex( +            r'var\s+bridge\s*=\s*([^;]+);', webpage, 'player parameters'), +            video_id) + +        formats = [{ +            'url': source['src'], +            'width': source.get('width'), +            'height': source.get('height'), +            'tbr': source.get('bitrate'), +        } for source in player_params['sources']] + +        # For both metadata and downloaded files the duration varies among +        # formats. I just pick the max one +        duration = max(filter(None, [ +            float_or_none(source.get('duration'), scale=1000) +            for source in player_params['sources']])) + +        subtitles = {} +        for translation in player_params.get('translations', []): +            lang_id = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium']) +            if lang_id not in subtitles: +                subtitles[lang_id] = [] +            subtitles[lang_id].append({ +                'url': translation['vttPath'], +                'ext': 'vtt', +            }) + +        return { +            'id': video_id, +            'formats': formats, +            'title': player_params['title'], +            'description': self._og_search_description(webpage), +            'duration': duration, +            'subtitles': subtitles, +        } diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index 249bc6bbd..5825d2867 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -130,6 +130,20 @@ class BBCCoUkIE(InfoExtractor):              },              'skip': 'geolocation',          }, { +            'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition', +            'info_dict': { +                'id': 'b05zmgw1', +                'ext': 'flv', +                'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.', +                'title': 'Royal Academy Summer Exhibition', +                'duration': 3540, +            }, +            'params': { +                # rtmp download +                'skip_download': True, +            }, +            'skip': 'geolocation', +        }, {              'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',              'only_matching': True,          }, { @@ -237,26 +251,11 @@ class BBCCoUkIE(InfoExtractor):          for connection in self._extract_connections(media):              captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')              lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') -            ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}')) -            srt = '' - -            def _extract_text(p): -                if p.text is not None: -                    stripped_text = p.text.strip() -                    if stripped_text: -                        return stripped_text -                return ' '.join(span.text.strip() for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span')) -            for pos, p in enumerate(ps): -                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p))              subtitles[lang] = [                  {                      'url': connection.get('href'),                      'ext': 'ttml',                  }, -                { -                    'data': srt, -                    'ext': 'srt', -                },              ]          return subtitles @@ -267,7 +266,7 @@ class BBCCoUkIE(InfoExtractor):                  programme_id, 'Downloading media selection XML')          except ExtractorError as ee:              if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: -                media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().encode('utf-8')) +                media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8'))              else:                  raise @@ -362,7 +361,7 @@ class BBCCoUkIE(InfoExtractor):              formats, subtitles = self._download_media_selector(programme_id)              title = self._og_search_title(webpage)              description = self._search_regex( -                r'<p class="medium-description">([^<]+)</p>', +                r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',                  webpage, 'description', fatal=False)          else:              programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index d768f99e6..4721c2293 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -13,6 +13,7 @@ from ..compat import (      compat_urllib_parse_urlparse,      compat_urllib_request,      compat_urlparse, +    compat_xml_parse_error,  )  from ..utils import (      determine_ext, @@ -119,7 +120,7 @@ class BrightcoveIE(InfoExtractor):          try:              object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8')) -        except xml.etree.ElementTree.ParseError: +        except compat_xml_parse_error:              return          fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 49e4dc710..d859aea52 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -22,6 +22,7 @@ from ..compat import (      compat_str,  )  from ..utils import ( +    NO_DEFAULT,      age_restricted,      bug_reports_message,      clean_html, @@ -33,7 +34,6 @@ from ..utils import (      sanitize_filename,      unescapeHTML,  ) -_NO_DEFAULT = object()  class InfoExtractor(object): @@ -523,7 +523,7 @@ class InfoExtractor(object):              video_info['description'] = playlist_description          return video_info -    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None): +    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):          """          Perform a regex search on the given string, using a single or a list of          patterns returning the first matching group. @@ -549,7 +549,7 @@ class InfoExtractor(object):                  return next(g for g in mobj.groups() if g is not None)              else:                  return mobj.group(group) -        elif default is not _NO_DEFAULT: +        elif default is not NO_DEFAULT:              return default          elif fatal:              raise RegexNotFoundError('Unable to extract %s' % _name) @@ -557,7 +557,7 @@ class InfoExtractor(object):              self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())              return None -    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None): +    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):          """          Like _search_regex, but strips HTML tags and unescapes entities.          """ @@ -846,7 +846,8 @@ class InfoExtractor(object):      def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,                                entry_protocol='m3u8', preference=None, -                              m3u8_id=None, note=None, errnote=None): +                              m3u8_id=None, note=None, errnote=None, +                              fatal=True):          formats = [{              'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), @@ -866,7 +867,10 @@ class InfoExtractor(object):          m3u8_doc = self._download_webpage(              m3u8_url, video_id,              note=note or 'Downloading m3u8 information', -            errnote=errnote or 'Failed to download m3u8 information') +            errnote=errnote or 'Failed to download m3u8 information', +            fatal=fatal) +        if m3u8_doc is False: +            return m3u8_doc          last_info = None          last_media = None          kv_rex = re.compile( diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 41f0c736d..73f1e22ef 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -27,7 +27,7 @@ from ..aes import (  class CrunchyrollIE(InfoExtractor): -    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)' +    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)'      _NETRC_MACHINE = 'crunchyroll'      _TESTS = [{          'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', @@ -46,6 +46,22 @@ class CrunchyrollIE(InfoExtractor):              'skip_download': True,          },      }, { +        'url': 'http://www.crunchyroll.com/media-589804/culture-japan-1', +        'info_dict': { +            'id': '589804', +            'ext': 'flv', +            'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11', +            'description': 'md5:fe2743efedb49d279552926d0bd0cd9e', +            'thumbnail': 're:^https?://.*\.jpg$', +            'uploader': 'Danny Choo Network', +            'upload_date': '20120213', +        }, +        'params': { +            # rtmp +            'skip_download': True, +        }, + +    }, {          'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697',          'only_matching': True,      }] @@ -251,16 +267,17 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text          for fmt in re.findall(r'showmedia\.([0-9]{3,4})p', webpage):              stream_quality, stream_format = self._FORMAT_IDS[fmt]              video_format = fmt + 'p' -            streamdata_req = compat_urllib_request.Request('http://www.crunchyroll.com/xml/') -            # urlencode doesn't work! -            streamdata_req.data = 'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality=' + stream_quality + '&media%5Fid=' + stream_id + '&video%5Fformat=' + stream_format +            streamdata_req = compat_urllib_request.Request( +                'http://www.crunchyroll.com/xml/?req=RpcApiVideoPlayer_GetStandardConfig&media_id=%s&video_format=%s&video_quality=%s' +                % (stream_id, stream_format, stream_quality), +                compat_urllib_parse.urlencode({'current_page': url}).encode('utf-8'))              streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') -            streamdata_req.add_header('Content-Length', str(len(streamdata_req.data)))              streamdata = self._download_xml(                  streamdata_req, video_id,                  note='Downloading media info for %s' % video_format) -            video_url = streamdata.find('./host').text -            video_play_path = streamdata.find('./file').text +            stream_info = streamdata.find('./{default}preload/stream_info') +            video_url = stream_info.find('./host').text +            video_play_path = stream_info.find('./file').text              formats.append({                  'url': video_url,                  'play_path': video_play_path, diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 70aa4333c..8852f0add 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -251,3 +251,53 @@ class DailymotionUserIE(DailymotionPlaylistIE):              'title': full_user,              'entries': self._extract_entries(user),          } + + +class DailymotionCloudIE(DailymotionBaseInfoExtractor): +    _VALID_URL_PREFIX = r'http://api\.dmcloud\.net/(?:player/)?embed/' +    _VALID_URL = r'%s[^/]+/(?P<id>[^/?]+)' % _VALID_URL_PREFIX +    _VALID_EMBED_URL = r'%s[^/]+/[^\'"]+' % _VALID_URL_PREFIX + +    _TESTS = [{ +        # From http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html +        # Tested at FranceTvInfo_2 +        'url': 'http://api.dmcloud.net/embed/4e7343f894a6f677b10006b4/556e03339473995ee145930c?auth=1464865870-0-jyhsm84b-ead4c701fb750cf9367bf4447167a3db&autoplay=1', +        'only_matching': True, +    }, { +        # http://www.francetvinfo.fr/societe/larguez-les-amarres-le-cobaturage-se-developpe_980101.html +        'url': 'http://api.dmcloud.net/player/embed/4e7343f894a6f677b10006b4/559545469473996d31429f06?auth=1467430263-0-90tglw2l-a3a4b64ed41efe48d7fccad85b8b8fda&autoplay=1', +        'only_matching': True, +    }] + +    @classmethod +    def _extract_dmcloud_url(self, webpage): +        mobj = re.search(r'<iframe[^>]+src=[\'"](%s)[\'"]' % self._VALID_EMBED_URL, webpage) +        if mobj: +            return mobj.group(1) + +        mobj = re.search( +            r'<input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=[\'"](%s)[\'"]' % self._VALID_EMBED_URL, +            webpage) +        if mobj: +            return mobj.group(1) + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        request = self._build_request(url) +        webpage = self._download_webpage(request, video_id) + +        title = self._html_search_regex(r'<title>([^>]+)</title>', webpage, 'title') + +        video_info = self._parse_json(self._search_regex( +            r'var\s+info\s*=\s*([^;]+);', webpage, 'video info'), video_id) + +        # TODO: parse ios_url, which is in fact a manifest +        video_url = video_info['mp4_url'] + +        return { +            'id': video_id, +            'url': video_url, +            'title': title, +            'thumbnail': video_info.get('thumbnail_url'), +        } diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index a34aad486..ca41a3abf 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -6,6 +6,8 @@ import itertools  from .common import InfoExtractor  from ..compat import (      compat_HTTPError, +    compat_urllib_parse, +    compat_urllib_request,      compat_urlparse,  )  from ..utils import ( @@ -17,7 +19,39 @@ from ..utils import (  ) -class DramaFeverIE(InfoExtractor): +class DramaFeverBaseIE(InfoExtractor): +    _LOGIN_URL = 'https://www.dramafever.com/accounts/login/' +    _NETRC_MACHINE = 'dramafever' + +    def _real_initialize(self): +        self._login() + +    def _login(self): +        (username, password) = self._get_login_info() +        if username is None: +            return + +        login_form = { +            'username': username, +            'password': password, +        } + +        request = compat_urllib_request.Request( +            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) +        response = self._download_webpage( +            request, None, 'Logging in as %s' % username) + +        if all(logout_pattern not in response +               for logout_pattern in ['href="/accounts/logout/"', '>Log out<']): +            error = self._html_search_regex( +                r'(?s)class="hidden-xs prompt"[^>]*>(.+?)<', +                response, 'error message', default=None) +            if error: +                raise ExtractorError('Unable to login: %s' % error, expected=True) +            raise ExtractorError('Unable to log in') + + +class DramaFeverIE(DramaFeverBaseIE):      IE_NAME = 'dramafever'      _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+/[0-9]+)(?:/|$)'      _TEST = { @@ -97,7 +131,7 @@ class DramaFeverIE(InfoExtractor):          } -class DramaFeverSeriesIE(InfoExtractor): +class DramaFeverSeriesIE(DramaFeverBaseIE):      IE_NAME = 'dramafever:series'      _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+)(?:/(?:(?!\d+(?:/|$)).+)?)?$'      _TESTS = [{ @@ -151,8 +185,11 @@ class DramaFeverSeriesIE(InfoExtractor):                  % (consumer_secret, series_id, self._PAGE_SIZE, page_num),                  series_id, 'Downloading episodes JSON page #%d' % page_num)              for episode in episodes.get('value', []): +                episode_url = episode.get('episode_url') +                if not episode_url: +                    continue                  entries.append(self.url_result( -                    compat_urlparse.urljoin(url, episode['episode_url']), +                    compat_urlparse.urljoin(url, episode_url),                      'DramaFever', episode.get('guid')))              if page_num == episodes['num_pages']:                  break diff --git a/youtube_dl/extractor/drbonanza.py b/youtube_dl/extractor/drbonanza.py index 7626219ba..8b98b013a 100644 --- a/youtube_dl/extractor/drbonanza.py +++ b/youtube_dl/extractor/drbonanza.py @@ -15,7 +15,6 @@ class DRBonanzaIE(InfoExtractor):      _TESTS = [{          'url': 'http://www.dr.dk/bonanza/serie/portraetter/Talkshowet.htm?assetId=65517', -        'md5': 'fe330252ddea607635cf2eb2c99a0af3',          'info_dict': {              'id': '65517',              'ext': 'mp4', @@ -26,6 +25,9 @@ class DRBonanzaIE(InfoExtractor):              'upload_date': '20110120',              'duration': 3664,          }, +        'params': { +            'skip_download': True,  # requires rtmp +        },      }, {          'url': 'http://www.dr.dk/bonanza/radio/serie/sport/fodbold.htm?assetId=59410',          'md5': '6dfe039417e76795fb783c52da3de11d', @@ -93,6 +95,11 @@ class DRBonanzaIE(InfoExtractor):                          'format_id': file['Type'].replace('Video', ''),                          'preference': preferencemap.get(file['Type'], -10),                      }) +                    if format['url'].startswith('rtmp'): +                        rtmp_url = format['url'] +                        format['rtmp_live'] = True  # --resume does not work +                        if '/bonanza/' in rtmp_url: +                            format['play_path'] = rtmp_url.split('/bonanza/')[1]                      formats.append(format)                  elif file['Type'] == "Thumb":                      thumbnail = file['Location'] @@ -111,9 +118,6 @@ class DRBonanzaIE(InfoExtractor):          description = '%s\n%s\n%s\n' % (              info['Description'], info['Actors'], info['Colophon']) -        for f in formats: -            f['url'] = f['url'].replace('rtmp://vod-bonanza.gss.dr.dk/bonanza/', 'http://vodfiles.dr.dk/') -            f['url'] = f['url'].replace('mp4:bonanza', 'bonanza')          self._sort_formats(formats)          display_id = re.sub(r'[^\w\d-]', '', re.sub(r' ', '-', title.lower())) + '-' + asset_id diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py index 37c5c181f..639f9182c 100644 --- a/youtube_dl/extractor/drtuber.py +++ b/youtube_dl/extractor/drtuber.py @@ -36,25 +36,24 @@ class DrTuberIE(InfoExtractor):              r'<source src="([^"]+)"', webpage, 'video URL')          title = self._html_search_regex( -            [r'class="hd_title" style="[^"]+">([^<]+)</h1>', r'<title>([^<]+) - \d+'], +            [r'<p[^>]+class="title_substrate">([^<]+)</p>', r'<title>([^<]+) - \d+'],              webpage, 'title')          thumbnail = self._html_search_regex(              r'poster="([^"]+)"',              webpage, 'thumbnail', fatal=False) -        like_count = str_to_int(self._html_search_regex( -            r'<span id="rate_likes">\s*<img[^>]+>\s*<span>([\d,\.]+)</span>', -            webpage, 'like count', fatal=False)) -        dislike_count = str_to_int(self._html_search_regex( -            r'<span id="rate_dislikes">\s*<img[^>]+>\s*<span>([\d,\.]+)</span>', -            webpage, 'like count', fatal=False)) -        comment_count = str_to_int(self._html_search_regex( -            r'<span class="comments_count">([\d,\.]+)</span>', -            webpage, 'comment count', fatal=False)) +        def extract_count(id_, name): +            return str_to_int(self._html_search_regex( +                r'<span[^>]+(?:class|id)="%s"[^>]*>([\d,\.]+)</span>' % id_, +                webpage, '%s count' % name, fatal=False)) + +        like_count = extract_count('rate_likes', 'like') +        dislike_count = extract_count('rate_dislikes', 'dislike') +        comment_count = extract_count('comments_count', 'comment')          cats_str = self._search_regex( -            r'<span>Categories:</span><div>(.+?)</div>', webpage, 'categories', fatal=False) +            r'<div[^>]+class="categories_list">(.+?)</div>', webpage, 'categories', fatal=False)          categories = [] if not cats_str else re.findall(r'<a title="([^"]+)"', cats_str)          return { diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py deleted file mode 100644 index 4827022e0..000000000 --- a/youtube_dl/extractor/empflix.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import unicode_literals - -from .tnaflix import TNAFlixIE - - -class EMPFlixIE(TNAFlixIE): -    _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P<display_id>.+?)-(?P<id>[0-9]+)\.html' - -    _TITLE_REGEX = r'name="title" value="(?P<title>[^"]*)"' -    _DESCRIPTION_REGEX = r'name="description" value="([^"]*)"' -    _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"' - -    _TESTS = [ -        { -            'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', -            'md5': 'b1bc15b6412d33902d6e5952035fcabc', -            'info_dict': { -                'id': '33051', -                'display_id': 'Amateur-Finger-Fuck', -                'ext': 'mp4', -                'title': 'Amateur Finger Fuck', -                'description': 'Amateur solo finger fucking.', -                'thumbnail': 're:https?://.*\.jpg$', -                'age_limit': 18, -            } -        }, -        { -            'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html', -            'only_matching': True, -        } -    ] diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py index 3c39ca451..cebdd0193 100644 --- a/youtube_dl/extractor/faz.py +++ b/youtube_dl/extractor/faz.py @@ -6,9 +6,9 @@ from .common import InfoExtractor  class FazIE(InfoExtractor):      IE_NAME = 'faz.net' -    _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+)\.html' +    _VALID_URL = r'https?://(?:www\.)?faz\.net/(?:[^/]+/)*.*?-(?P<id>\d+)\.html' -    _TEST = { +    _TESTS = [{          'url': 'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html',          'info_dict': {              'id': '12610585', @@ -16,7 +16,22 @@ class FazIE(InfoExtractor):              'title': 'Stockholm: Chemie-Nobelpreis für drei amerikanische Forscher',              'description': 'md5:1453fbf9a0d041d985a47306192ea253',          }, -    } +    }, { +        'url': 'http://www.faz.net/aktuell/politik/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html', +        'only_matching': True, +    }, { +        'url': 'http://www.faz.net/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html', +        'only_matching': True, +    }, { +        'url': 'http://www.faz.net/-13659345.html', +        'only_matching': True, +    }, { +        'url': 'http://www.faz.net/aktuell/politik/-13659345.html', +        'only_matching': True, +    }, { +        'url': 'http://www.faz.net/foobarblafasel-13659345.html', +        'only_matching': True, +    }]      def _real_extract(self, url):          video_id = self._match_id(url) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index db0bbec1e..b2c984bf2 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -18,6 +18,7 @@ from ..utils import (      parse_duration,      determine_ext,  ) +from .dailymotion import DailymotionCloudIE  class FranceTVBaseInfoExtractor(InfoExtractor): @@ -131,12 +132,26 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):              'skip_download': 'HLS (reqires ffmpeg)'          },          'skip': 'Ce direct est terminé et sera disponible en rattrapage dans quelques minutes.', +    }, { +        'url': 'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html', +        'md5': 'f485bda6e185e7d15dbc69b72bae993e', +        'info_dict': { +            'id': '556e03339473995ee145930c', +            'ext': 'mp4', +            'title': 'Les entreprises familiales : le secret de la réussite', +            'thumbnail': 're:^https?://.*\.jpe?g$', +        }      }]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          page_title = mobj.group('title')          webpage = self._download_webpage(url, page_title) + +        dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage) +        if dmcloud_url: +            return self.url_result(dmcloud_url, 'DailymotionCloud') +          video_id, catalogue = self._search_regex(              r'id-video=([^@]+@[^"]+)', webpage, 'video id').split('@')          return self._extract_video(video_id, catalogue) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index f6b984300..ea60d4a96 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -43,6 +43,11 @@ from .senateisvp import SenateISVPIE  from .bliptv import BlipTVIE  from .svt import SVTIE  from .pornhub import PornHubIE +from .xhamster import XHamsterEmbedIE +from .vimeo import VimeoIE +from .dailymotion import DailymotionCloudIE +from .onionstudios import OnionStudiosIE +from .snagfilms import SnagFilmsEmbedIE  class GenericIE(InfoExtractor): @@ -333,6 +338,15 @@ class GenericIE(InfoExtractor):                  'skip_download': True,              },          }, +        # XHamster embed +        { +            'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8', +            'info_dict': { +                'id': 'showthread', +                'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )', +            }, +            'playlist_mincount': 7, +        },          # Embedded TED video          {              'url': 'http://en.support.wordpress.com/videos/ted-talks/', @@ -655,6 +669,18 @@ class GenericIE(InfoExtractor):                  'title': 'John Carlson Postgame 2/25/15',              },          }, +        # Kaltura embed (different embed code) +        { +            'url': 'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014', +            'info_dict': { +                'id': '1_a52wc67y', +                'ext': 'flv', +                'upload_date': '20150127', +                'uploader_id': 'PremierMedia', +                'timestamp': int, +                'title': 'Os Guinness // Is It Fools Talk? // Unbelievable? Conference 2014', +            }, +        },          # Eagle.Platform embed (generic URL)          {              'url': 'http://lenta.ru/news/2015/03/06/navalny/', @@ -812,6 +838,50 @@ class GenericIE(InfoExtractor):                  'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',                  'uploader': 'Rogers Sportsnet',              }, +        }, +        # Dailymotion Cloud video +        { +            'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910', +            'md5': '49444254273501a64675a7e68c502681', +            'info_dict': { +                'id': '5585de919473990de4bee11b', +                'ext': 'mp4', +                'title': 'Le débat', +                'thumbnail': 're:^https?://.*\.jpe?g$', +            } +        }, +        # OnionStudios embed +        { +            'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537', +            'info_dict': { +                'id': '2855', +                'ext': 'mp4', +                'title': 'Don’t Understand Bitcoin? This Man Will Mumble An Explanation At You', +                'thumbnail': 're:^https?://.*\.jpe?g$', +                'uploader': 'ClickHole', +                'uploader_id': 'clickhole', +            } +        }, +        # SnagFilms embed +        { +            'url': 'http://whilewewatch.blogspot.ru/2012/06/whilewewatch-whilewewatch-gripping.html', +            'info_dict': { +                'id': '74849a00-85a9-11e1-9660-123139220831', +                'ext': 'mp4', +                'title': '#whilewewatch', +            } +        }, +        # AdobeTVVideo embed +        { +            'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners', +            'md5': '43662b577c018ad707a63766462b1e87', +            'info_dict': { +                'id': '2456', +                'ext': 'mp4', +                'title': 'New experience with Acrobat DC', +                'description': 'New experience with Acrobat DC', +                'duration': 248.667, +            },          }      ] @@ -979,7 +1049,9 @@ class GenericIE(InfoExtractor):              }          if not self._downloader.params.get('test', False) and not is_intentional: -            self._downloader.report_warning('Falling back on generic information extractor.') +            force = self._downloader.params.get('force_generic_extractor', False) +            self._downloader.report_warning( +                '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))          if not full_response:              request = compat_urllib_request.Request(url) @@ -1089,18 +1161,9 @@ class GenericIE(InfoExtractor):          if matches:              return _playlist_from_matches(matches, ie='RtlNl') -        # Look for embedded (iframe) Vimeo player -        mobj = re.search( -            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage) -        if mobj: -            player_url = unescapeHTML(mobj.group('url')) -            surl = smuggle_url(player_url, {'Referer': url}) -            return self.url_result(surl) -        # Look for embedded (swf embed) Vimeo player -        mobj = re.search( -            r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage) -        if mobj: -            return self.url_result(mobj.group(1)) +        vimeo_url = VimeoIE._extract_vimeo_url(url, webpage) +        if vimeo_url is not None: +            return self.url_result(vimeo_url)          # Look for embedded YouTube player          matches = re.findall(r'''(?x) @@ -1327,6 +1390,11 @@ class GenericIE(InfoExtractor):          if pornhub_url:              return self.url_result(pornhub_url, 'PornHub') +        # Look for embedded XHamster player +        xhamster_urls = XHamsterEmbedIE._extract_urls(webpage) +        if xhamster_urls: +            return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed') +          # Look for embedded Tvigle player          mobj = re.search(              r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) @@ -1436,8 +1504,8 @@ class GenericIE(InfoExtractor):              return self.url_result(mobj.group('url'), 'Zapiks')          # Look for Kaltura embeds -        mobj = re.search( -            r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage) +        mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage) or +                re.search(r'(?s)(["\'])(?:https?:)?//cdnapisec\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?\1.*?entry_id\s*:\s*(["\'])(?P<id>[^\2]+?)\2', webpage))          if mobj is not None:              return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura') @@ -1494,6 +1562,30 @@ class GenericIE(InfoExtractor):          if senate_isvp_url:              return self.url_result(senate_isvp_url, 'SenateISVP') +        # Look for Dailymotion Cloud videos +        dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage) +        if dmcloud_url: +            return self.url_result(dmcloud_url, 'DailymotionCloud') + +        # Look for OnionStudios embeds +        onionstudios_url = OnionStudiosIE._extract_url(webpage) +        if onionstudios_url: +            return self.url_result(onionstudios_url) + +        # Look for SnagFilms embeds +        snagfilms_url = SnagFilmsEmbedIE._extract_url(webpage) +        if snagfilms_url: +            return self.url_result(snagfilms_url) + +        # Look for AdobeTVVideo embeds +        mobj = re.search( +            r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]', +            webpage) +        if mobj is not None: +            return self.url_result( +                self._proto_relative_url(unescapeHTML(mobj.group(1))), +                'AdobeTVVideo') +          def check_video(vurl):              if YoutubeIE.suitable(vurl):                  return True diff --git a/youtube_dl/extractor/hentaistigma.py b/youtube_dl/extractor/hentaistigma.py index 63d87b74c..f5aa73d18 100644 --- a/youtube_dl/extractor/hentaistigma.py +++ b/youtube_dl/extractor/hentaistigma.py @@ -1,7 +1,5 @@  from __future__ import unicode_literals -import re -  from .common import InfoExtractor @@ -19,20 +17,19 @@ class HentaiStigmaIE(InfoExtractor):      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id)          title = self._html_search_regex( -            r'<h2 class="posttitle"><a[^>]*>([^<]+)</a>', +            r'<h2[^>]+class="posttitle"[^>]*><a[^>]*>([^<]+)</a>',              webpage, 'title')          wrap_url = self._html_search_regex( -            r'<iframe src="([^"]+mp4)"', webpage, 'wrapper url') +            r'<iframe[^>]+src="([^"]+mp4)"', webpage, 'wrapper url')          wrap_webpage = self._download_webpage(wrap_url, video_id)          video_url = self._html_search_regex( -            r'clip:\s*{\s*url: "([^"]*)"', wrap_webpage, 'video url') +            r'file\s*:\s*"([^"]+)"', wrap_webpage, 'video url')          return {              'id': video_id, diff --git a/youtube_dl/extractor/howcast.py b/youtube_dl/extractor/howcast.py index 3f7d6666c..16677f179 100644 --- a/youtube_dl/extractor/howcast.py +++ b/youtube_dl/extractor/howcast.py @@ -1,8 +1,7 @@  from __future__ import unicode_literals -import re -  from .common import InfoExtractor +from ..utils import parse_iso8601  class HowcastIE(InfoExtractor): @@ -13,29 +12,31 @@ class HowcastIE(InfoExtractor):          'info_dict': {              'id': '390161',              'ext': 'mp4', -            'description': 'The square knot, also known as the reef knot, is one of the oldest, most basic knots to tie, and can be used in many different ways. Here\'s the proper way to tie a square knot.',              'title': 'How to Tie a Square Knot Properly', -        } +            'description': 'md5:dbe792e5f6f1489027027bf2eba188a3', +            'timestamp': 1276081287, +            'upload_date': '20100609', +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        },      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) +        video_id = self._match_id(url) -        video_id = mobj.group('id')          webpage = self._download_webpage(url, video_id) -        self.report_extraction(video_id) - -        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)', -                                       webpage, 'video URL') - -        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'', -                                                    webpage, 'description', fatal=False) +        embed_code = self._search_regex( +            r'<iframe[^>]+src="[^"]+\bembed_code=([^\b]+)\b', +            webpage, 'ooyala embed code')          return { +            '_type': 'url_transparent', +            'ie_key': 'Ooyala', +            'url': 'ooyala:%s' % embed_code,              'id': video_id, -            'url': video_url, -            'title': self._og_search_title(webpage), -            'description': video_description, -            'thumbnail': self._og_search_thumbnail(webpage), +            'timestamp': parse_iso8601(self._html_search_meta( +                'article:published_time', webpage, 'timestamp')),          } diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index f29df36b5..4bb574cf3 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -46,7 +46,7 @@ class ImdbIE(InfoExtractor):              format_info = info['videoPlayerObject']['video']              formats.append({                  'format_id': f_id, -                'url': format_info['url'], +                'url': format_info['videoInfoList'][0]['videoUrl'],              })          return { diff --git a/youtube_dl/extractor/ina.py b/youtube_dl/extractor/ina.py index 0847074ee..65712abc2 100644 --- a/youtube_dl/extractor/ina.py +++ b/youtube_dl/extractor/ina.py @@ -7,7 +7,7 @@ from .common import InfoExtractor  class InaIE(InfoExtractor): -    _VALID_URL = r'http://(?:www\.)?ina\.fr/video/(?P<id>I?[A-Z0-9]+)' +    _VALID_URL = r'https?://(?:www\.)?ina\.fr/video/(?P<id>I?[A-Z0-9]+)'      _TEST = {          'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html',          'md5': 'a667021bf2b41f8dc6049479d9bb38a3', diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index f25f43664..91a1b3ccb 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -5,13 +5,14 @@ import base64  from .common import InfoExtractor  from ..compat import (      compat_urllib_parse, +    compat_urlparse,  )  class InfoQIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?infoq\.com/[^/]+/(?P<id>[^/]+)$' +    _VALID_URL = r'https?://(?:www\.)?infoq\.com/(?:[^/]+/)+(?P<id>[^/]+)' -    _TEST = { +    _TESTS = [{          'url': 'http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things',          'md5': 'b5ca0e0a8c1fed93b0e65e48e462f9a2',          'info_dict': { @@ -20,7 +21,10 @@ class InfoQIE(InfoExtractor):              'description': 'Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.',              'title': 'A Few of My Favorite [Python] Things',          }, -    } +    }, { +        'url': 'http://www.infoq.com/fr/presentations/changez-avis-sur-javascript', +        'only_matching': True, +    }]      def _real_extract(self, url):          video_id = self._match_id(url) @@ -42,7 +46,7 @@ class InfoQIE(InfoExtractor):          video_id, extension = video_filename.split('.')          http_base = self._search_regex( -            r'EXPRESSINSTALL_SWF\s*=\s*"(https?://[^/"]+/)', webpage, +            r'EXPRESSINSTALL_SWF\s*=\s*[^"]*"((?:https?:)?//[^/"]+/)', webpage,              'HTTP base URL')          formats = [{ @@ -52,7 +56,7 @@ class InfoQIE(InfoExtractor):              'play_path': playpath,          }, {              'format_id': 'http', -            'url': http_base + real_id, +            'url': compat_urlparse.urljoin(url, http_base) + real_id,          }]          self._sort_formats(formats) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index cfd3b14f4..a00f6e5e5 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -30,13 +30,13 @@ class LyndaBaseIE(InfoExtractor):              return          login_form = { -            'username': username, -            'password': password, +            'username': username.encode('utf-8'), +            'password': password.encode('utf-8'),              'remember': 'false',              'stayPut': 'false'          }          request = compat_urllib_request.Request( -            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form)) +            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))          login_page = self._download_webpage(              request, None, 'Logging in as %s' % username) @@ -65,7 +65,7 @@ class LyndaBaseIE(InfoExtractor):                      'stayPut': 'false',                  }                  request = compat_urllib_request.Request( -                    self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form)) +                    self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form).encode('utf-8'))                  login_page = self._download_webpage(                      request, None,                      'Confirming log in and log out from another device') diff --git a/youtube_dl/extractor/newstube.py b/youtube_dl/extractor/newstube.py index 85fcad06b..5a9e73cd6 100644 --- a/youtube_dl/extractor/newstube.py +++ b/youtube_dl/extractor/newstube.py @@ -31,7 +31,7 @@ class NewstubeIE(InfoExtractor):          page = self._download_webpage(url, video_id, 'Downloading page')          video_guid = self._html_search_regex( -            r'<meta property="og:video" content="https?://(?:www\.)?newstube\.ru/freshplayer\.swf\?guid=(?P<guid>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', +            r'<meta property="og:video:url" content="https?://(?:www\.)?newstube\.ru/freshplayer\.swf\?guid=(?P<guid>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',              page, 'video GUID')          player = self._download_xml( diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index 5bbd2dcf6..a53e27b27 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -195,7 +195,7 @@ class NocoIE(InfoExtractor):          if episode_number:              title += ' #' + compat_str(episode_number)          if episode: -            title += ' - ' + episode +            title += ' - ' + compat_str(episode)          description = show.get('show_resume') or show.get('family_resume') diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 5d8448571..62d12b7a6 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -16,8 +16,24 @@ class NPOBaseIE(InfoExtractor):          token_page = self._download_webpage(              'http://ida.omroep.nl/npoplayer/i.js',              video_id, note='Downloading token') -        return self._search_regex( +        token = self._search_regex(              r'npoplayer\.token = "(.+?)"', token_page, 'token') +        # Decryption algorithm extracted from http://npoplayer.omroep.nl/csjs/npoplayer-min.js +        token_l = list(token) +        first = second = None +        for i in range(5, len(token_l) - 4): +            if token_l[i].isdigit(): +                if first is None: +                    first = i +                elif second is None: +                    second = i +        if first is None or second is None: +            first = 12 +            second = 13 + +        token_l[first], token_l[second] = token_l[second], token_l[first] + +        return ''.join(token_l)  class NPOIE(NPOBaseIE): @@ -92,7 +108,7 @@ class NPOIE(NPOBaseIE):      def _get_info(self, video_id):          metadata = self._download_json( -            'http://e.omroep.nl/metadata/aflevering/%s' % video_id, +            'http://e.omroep.nl/metadata/%s' % video_id,              video_id,              # We have to remove the javascript callback              transform_source=strip_jsonp, diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index cc70c2950..9e4581cf9 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -13,7 +13,7 @@ from ..utils import (  class NRKIE(InfoExtractor): -    _VALID_URL = r'(?:nrk:|http://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)' +    _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)'      _TESTS = [          { @@ -76,7 +76,7 @@ class NRKIE(InfoExtractor):  class NRKPlaylistIE(InfoExtractor): -    _VALID_URL = r'http://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P<id>[^/]+)' +    _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P<id>[^/]+)'      _TESTS = [{          'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763', @@ -116,11 +116,11 @@ class NRKPlaylistIE(InfoExtractor):  class NRKTVIE(InfoExtractor): -    _VALID_URL = r'(?P<baseurl>http://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?' +    _VALID_URL = r'(?P<baseurl>https?://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'      _TESTS = [          { -            'url': 'http://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', +            'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',              'md5': 'adf2c5454fa2bf032f47a9f8fb351342',              'info_dict': {                  'id': 'MUHH48000314', @@ -132,7 +132,7 @@ class NRKTVIE(InfoExtractor):              },          },          { -            'url': 'http://tv.nrk.no/program/mdfp15000514', +            'url': 'https://tv.nrk.no/program/mdfp15000514',              'md5': '383650ece2b25ecec996ad7b5bb2a384',              'info_dict': {                  'id': 'mdfp15000514', @@ -145,7 +145,7 @@ class NRKTVIE(InfoExtractor):          },          {              # single playlist video -            'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', +            'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2',              'md5': 'adbd1dbd813edaf532b0a253780719c2',              'info_dict': {                  'id': 'MSPO40010515-part2', @@ -157,7 +157,7 @@ class NRKTVIE(InfoExtractor):              'skip': 'Only works from Norway',          },          { -            'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', +            'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015',              'playlist': [                  {                      'md5': '9480285eff92d64f06e02a5367970a7a', diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py new file mode 100644 index 000000000..8fa507dec --- /dev/null +++ b/youtube_dl/extractor/onionstudios.py @@ -0,0 +1,74 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import determine_ext + + +class OnionStudiosIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?onionstudios\.com/(?:videos/[^/]+-|embed\?.*\bid=)(?P<id>\d+)(?!-)' + +    _TESTS = [{ +        'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937', +        'md5': 'd4851405d31adfadf71cd7a487b765bb', +        'info_dict': { +            'id': '2937', +            'ext': 'mp4', +            'title': 'Hannibal charges forward, stops for a cocktail', +            'description': 'md5:545299bda6abf87e5ec666548c6a9448', +            'thumbnail': 're:^https?://.*\.jpg$', +            'uploader': 'The A.V. Club', +            'uploader_id': 'TheAVClub', +        }, +    }, { +        'url': 'http://www.onionstudios.com/embed?id=2855&autoplay=true', +        'only_matching': True, +    }] + +    @staticmethod +    def _extract_url(webpage): +        mobj = re.search( +            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?onionstudios\.com/embed.+?)\1', webpage) +        if mobj: +            return mobj.group('url') + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage( +            'http://www.onionstudios.com/embed?id=%s' % video_id, video_id) + +        formats = [] +        for src in re.findall(r'<source[^>]+src="([^"]+)"', webpage): +            if determine_ext(src) != 'm3u8':  # m3u8 always results in 403 +                formats.append({ +                    'url': src, +                }) +        self._sort_formats(formats) + +        title = self._search_regex( +            r'share_title\s*=\s*"([^"]+)"', webpage, 'title') +        description = self._search_regex( +            r'share_description\s*=\s*"([^"]+)"', webpage, +            'description', default=None) +        thumbnail = self._search_regex( +            r'poster="([^"]+)"', webpage, 'thumbnail', default=False) + +        uploader_id = self._search_regex( +            r'twitter_handle\s*=\s*"([^"]+)"', +            webpage, 'uploader id', fatal=False) +        uploader = self._search_regex( +            r'window\.channelName\s*=\s*"Embedded:([^"]+)"', +            webpage, 'uploader', default=False) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'uploader': uploader, +            'uploader_id': uploader_id, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 143a76696..1e2b965f9 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -1,3 +1,4 @@ +# coding: utf-8  from __future__ import unicode_literals  import re @@ -35,6 +36,9 @@ class PBSIE(InfoExtractor):                  'description': 'md5:ba0c207295339c8d6eced00b7c363c6a',                  'duration': 3190,              }, +            'params': { +                'skip_download': True,  # requires ffmpeg +            },          },          {              'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/', @@ -46,6 +50,9 @@ class PBSIE(InfoExtractor):                  'description': 'md5:f5bfbefadf421e8bb8647602011caf8e',                  'duration': 5050,              }, +            'params': { +                'skip_download': True,  # requires ffmpeg +            }          },          {              'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/', @@ -68,7 +75,10 @@ class PBSIE(InfoExtractor):                  'title': 'Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full',                  'duration': 6559,                  'thumbnail': 're:^https?://.*\.jpg$', -            } +            }, +            'params': { +                'skip_download': True,  # requires ffmpeg +            },          },          {              'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html', @@ -82,7 +92,10 @@ class PBSIE(InfoExtractor):                  'duration': 3172,                  'thumbnail': 're:^https?://.*\.jpg$',                  'upload_date': '20140122', -            } +            }, +            'params': { +                'skip_download': True,  # requires ffmpeg +            },          },          {              'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/', @@ -90,6 +103,21 @@ class PBSIE(InfoExtractor):                  'id': 'united-states-of-secrets',              },              'playlist_count': 2, +        }, +        { +            'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/', +            'info_dict': { +                'id': '2280706814', +                'display_id': 'player', +                'ext': 'mp4', +                'title': 'Death and the Civil War', +                'description': 'American Experience, TV’s most-watched history series, brings to life the compelling stories from our past that inform our understanding of the world today.', +                'duration': 6705, +                'thumbnail': 're:^https?://.*\.jpg$', +            }, +            'params': { +                'skip_download': True,  # requires ffmpeg +            },          }      ] @@ -123,7 +151,7 @@ class PBSIE(InfoExtractor):                  return media_id, presumptive_id, upload_date              url = self._search_regex( -                r'<iframe\s+(?:class|id)=["\']partnerPlayer["\'].*?\s+src=["\'](.*?)["\']>', +                r'<iframe\s+[^>]*\s+src=["\']([^\'"]+partnerplayer[^\'"]+)["\']',                  webpage, 'player URL')              mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/pinkbike.py b/youtube_dl/extractor/pinkbike.py new file mode 100644 index 000000000..a52210fab --- /dev/null +++ b/youtube_dl/extractor/pinkbike.py @@ -0,0 +1,96 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    int_or_none, +    remove_end, +    remove_start, +    str_to_int, +    unified_strdate, +) + + +class PinkbikeIE(InfoExtractor): +    _VALID_URL = r'https?://(?:(?:www\.)?pinkbike\.com/video/|es\.pinkbike\.org/i/kvid/kvid-y5\.swf\?id=)(?P<id>[0-9]+)' +    _TESTS = [{ +        'url': 'http://www.pinkbike.com/video/402811/', +        'md5': '4814b8ca7651034cd87e3361d5c2155a', +        'info_dict': { +            'id': '402811', +            'ext': 'mp4', +            'title': 'Brandon Semenuk - RAW 100', +            'description': 'Official release: www.redbull.ca/rupertwalker', +            'thumbnail': 're:^https?://.*\.jpg$', +            'duration': 100, +            'upload_date': '20150406', +            'uploader': 'revelco', +            'location': 'Victoria, British Columbia, Canada', +            'view_count': int, +            'comment_count': int, +        } +    }, { +        'url': 'http://es.pinkbike.org/i/kvid/kvid-y5.swf?id=406629', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage( +            'http://www.pinkbike.com/video/%s' % video_id, video_id) + +        formats = [] +        for _, format_id, src in re.findall( +                r'data-quality=((?:\\)?["\'])(.+?)\1[^>]+src=\1(.+?)\1', webpage): +            height = int_or_none(self._search_regex( +                r'^(\d+)[pP]$', format_id, 'height', default=None)) +            formats.append({ +                'url': src, +                'format_id': format_id, +                'height': height, +            }) +        self._sort_formats(formats) + +        title = remove_end(self._og_search_title(webpage), ' Video - Pinkbike') +        description = self._html_search_regex( +            r'(?s)id="media-description"[^>]*>(.+?)<', +            webpage, 'description', default=None) or remove_start( +            self._og_search_description(webpage), title + '. ') +        thumbnail = self._og_search_thumbnail(webpage) +        duration = int_or_none(self._html_search_meta( +            'video:duration', webpage, 'duration')) + +        uploader = self._search_regex( +            r'un:\s*"([^"]+)"', webpage, 'uploader', fatal=False) +        upload_date = unified_strdate(self._search_regex( +            r'class="fullTime"[^>]+title="([^"]+)"', +            webpage, 'upload date', fatal=False)) + +        location = self._html_search_regex( +            r'(?s)<dt>Location</dt>\s*<dd>(.+?)<', +            webpage, 'location', fatal=False) + +        def extract_count(webpage, label): +            return str_to_int(self._search_regex( +                r'<span[^>]+class="stat-num"[^>]*>([\d,.]+)</span>\s*<span[^>]+class="stat-label"[^>]*>%s' % label, +                webpage, label, fatal=False)) + +        view_count = extract_count(webpage, 'Views') +        comment_count = extract_count(webpage, 'Comments') + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'duration': duration, +            'upload_date': upload_date, +            'uploader': uploader, +            'location': location, +            'view_count': view_count, +            'comment_count': comment_count, +            'formats': formats +        } diff --git a/youtube_dl/extractor/planetaplay.py b/youtube_dl/extractor/planetaplay.py index 596c621d7..06505e96f 100644 --- a/youtube_dl/extractor/planetaplay.py +++ b/youtube_dl/extractor/planetaplay.py @@ -18,7 +18,8 @@ class PlanetaPlayIE(InfoExtractor):              'id': '3586',              'ext': 'flv',              'title': 'md5:e829428ee28b1deed00de90de49d1da1', -        } +        }, +        'skip': 'Not accessible from Travis CI server',      }      _SONG_FORMATS = { diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 8565d7551..8172bc997 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -19,8 +19,8 @@ from ..aes import (  class PornHubIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-f]+)' -    _TEST = { +    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)' +    _TESTS = [{          'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',          'md5': '882f488fa1f0026f023f33576004a2ed',          'info_dict': { @@ -30,7 +30,10 @@ class PornHubIE(InfoExtractor):              "title": "Seductive Indian beauty strips down and fingers her pink pussy",              "age_limit": 18          } -    } +    }, { +        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', +        'only_matching': True, +    }]      @classmethod      def _extract_url(cls, webpage): diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index d9a783f8a..e704640e5 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -9,6 +9,7 @@ from .common import InfoExtractor  from ..utils import (      strip_jsonp,      unescapeHTML, +    clean_html,  )  from ..compat import compat_urllib_request @@ -250,3 +251,36 @@ class QQMusicToplistIE(QQPlaylistBaseIE):          list_name = topinfo.get('ListName')          list_description = topinfo.get('info')          return self.playlist_result(entries, list_id, list_name, list_description) + + +class QQMusicPlaylistIE(QQPlaylistBaseIE): +    IE_NAME = 'qqmusic:playlist' +    _VALID_URL = r'http://y\.qq\.com/#type=taoge&id=(?P<id>[0-9]+)' + +    _TEST = { +        'url': 'http://y.qq.com/#type=taoge&id=3462654915', +        'info_dict': { +            'id': '3462654915', +            'title': '韩国5月新歌精选下旬', +            'description': 'md5:d2c9d758a96b9888cf4fe82f603121d4', +        }, +        'playlist_count': 40, +    } + +    def _real_extract(self, url): +        list_id = self._match_id(url) + +        list_json = self._download_json( +            'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg?type=1&json=1&utf8=1&onlysong=0&disstid=%s' +            % list_id, list_id, 'Download list page', +            transform_source=strip_jsonp)['cdlist'][0] + +        entries = [ +            self.url_result( +                'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid'] +            ) for song in list_json['songlist'] +        ] + +        list_name = list_json.get('dissname') +        list_description = clean_html(unescapeHTML(list_json.get('desc'))) +        return self.playlist_result(entries, list_id, list_name, list_description) diff --git a/youtube_dl/extractor/quickvid.py b/youtube_dl/extractor/quickvid.py index af7d76cf4..f414e2384 100644 --- a/youtube_dl/extractor/quickvid.py +++ b/youtube_dl/extractor/quickvid.py @@ -24,6 +24,7 @@ class QuickVidIE(InfoExtractor):              'thumbnail': 're:^https?://.*\.(?:png|jpg|gif)$',              'view_count': int,          }, +        'skip': 'Not accessible from Travis CI server',      }      def _real_extract(self, url): diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index 41d202c28..a4d3d73ff 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -44,6 +44,10 @@ class RtlNlIE(InfoExtractor):              'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.',          }      }, { +        # encrypted m3u8 streams, georestricted +        'url': 'http://www.rtlxl.nl/#!/afl-2-257632/52a74543-c504-4cde-8aa8-ec66fe8d68a7', +        'only_matching': True, +    }, {          'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0',          'only_matching': True,      }] @@ -51,7 +55,7 @@ class RtlNlIE(InfoExtractor):      def _real_extract(self, url):          uuid = self._match_id(url)          info = self._download_json( -            'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=flash/' % uuid, +            'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=adaptive/' % uuid,              uuid)          material = info['material'][0] @@ -59,9 +63,14 @@ class RtlNlIE(InfoExtractor):          subtitle = material['title'] or info['episodes'][0]['name']          description = material.get('synopsis') or info['episodes'][0]['synopsis'] +        meta = info.get('meta', {}) +          # Use unencrypted m3u8 streams (See https://github.com/rg3/youtube-dl/issues/4118) -        videopath = material['videopath'].replace('.f4m', '.m3u8') -        m3u8_url = 'http://manifest.us.rtl.nl' + videopath +        # NB: nowadays, recent ffmpeg and avconv can handle these encrypted streams, so +        # this adaptive -> flash workaround is not required in general, but it also +        # allows bypassing georestriction therefore is retained for now. +        videopath = material['videopath'].replace('/adaptive/', '/flash/') +        m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath          formats = self._extract_m3u8_formats(m3u8_url, uuid, ext='mp4') @@ -82,7 +91,7 @@ class RtlNlIE(InfoExtractor):          self._sort_formats(formats)          thumbnails = [] -        meta = info.get('meta', {}) +          for p in ('poster_base_url', '"thumb_base_url"'):              if not meta.get(p):                  continue diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index 24746a09a..93a7cfe15 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -53,7 +53,7 @@ class SmotriIE(InfoExtractor):                  'thumbnail': 'http://frame4.loadup.ru/03/ed/57591.2.3.jpg',              },          }, -        # video-password +        # video-password, not approved by moderator          {              'url': 'http://smotri.com/video/view/?id=v1390466a13c',              'md5': 'f6331cef33cad65a0815ee482a54440b', @@ -71,7 +71,24 @@ class SmotriIE(InfoExtractor):              },              'skip': 'Video is not approved by moderator',          }, -        # age limit + video-password +        # video-password +        { +            'url': 'http://smotri.com/video/view/?id=v6984858774#', +            'md5': 'f11e01d13ac676370fc3b95b9bda11b0', +            'info_dict': { +                'id': 'v6984858774', +                'ext': 'mp4', +                'title': 'Дача Солженицина ПАРОЛЬ 223322', +                'uploader': 'psavari1', +                'uploader_id': 'psavari1', +                'upload_date': '20081103', +                'thumbnail': 're:^https?://.*\.jpg$', +            }, +            'params': { +                'videopassword': '223322', +            }, +        }, +        # age limit + video-password, not approved by moderator          {              'url': 'http://smotri.com/video/view/?id=v15408898bcf',              'md5': '91e909c9f0521adf5ee86fbe073aad70', @@ -90,19 +107,22 @@ class SmotriIE(InfoExtractor):              },              'skip': 'Video is not approved by moderator',          }, -        # not approved by moderator, but available +        # age limit + video-password          { -            'url': 'http://smotri.com/video/view/?id=v28888533b73', -            'md5': 'f44bc7adac90af518ef1ecf04893bb34', +            'url': 'http://smotri.com/video/view/?id=v7780025814', +            'md5': 'b4599b068422559374a59300c5337d72',              'info_dict': { -                'id': 'v28888533b73', +                'id': 'v7780025814',                  'ext': 'mp4', -                'title': 'Russian Spies Killed By ISIL Child Soldier', -                'uploader': 'Mopeder', -                'uploader_id': 'mopeder', -                'duration': 71, -                'thumbnail': 'http://frame9.loadup.ru/d7/32/2888853.2.3.jpg', -                'upload_date': '20150114', +                'title': 'Sexy Beach (пароль 123)', +                'uploader': 'вАся', +                'uploader_id': 'asya_prosto', +                'upload_date': '20081218', +                'thumbnail': 're:^https?://.*\.jpg$', +                'age_limit': 18, +            }, +            'params': { +                'videopassword': '123'              },          },          # swf player @@ -152,6 +172,10 @@ class SmotriIE(InfoExtractor):              'getvideoinfo': '1',          } +        video_password = self._downloader.params.get('videopassword', None) +        if video_password: +            video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest() +          request = compat_urllib_request.Request(              'http://smotri.com/video/view/url/bot/', compat_urllib_parse.urlencode(video_form))          request.add_header('Content-Type', 'application/x-www-form-urlencoded') @@ -161,13 +185,18 @@ class SmotriIE(InfoExtractor):          video_url = video.get('_vidURL') or video.get('_vidURL_mp4')          if not video_url: -            if video.get('_moderate_no') or not video.get('moderated'): +            if video.get('_moderate_no'):                  raise ExtractorError(                      'Video %s has not been approved by moderator' % video_id, expected=True)              if video.get('error'):                  raise ExtractorError('Video %s does not exist' % video_id, expected=True) +            if video.get('_pass_protected') == 1: +                msg = ('Invalid video password' if video_password +                       else 'This video is protected by a password, use the --video-password option') +                raise ExtractorError(msg, expected=True) +          title = video['title']          thumbnail = video['_imgURL']          upload_date = unified_strdate(video['added']) diff --git a/youtube_dl/extractor/snagfilms.py b/youtube_dl/extractor/snagfilms.py new file mode 100644 index 000000000..cf495f310 --- /dev/null +++ b/youtube_dl/extractor/snagfilms.py @@ -0,0 +1,171 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    clean_html, +    determine_ext, +    int_or_none, +    js_to_json, +    parse_duration, +) + + +class SnagFilmsEmbedIE(InfoExtractor): +    _VALID_URL = r'https?://(?:(?:www|embed)\.)?snagfilms\.com/embed/player\?.*\bfilmId=(?P<id>[\da-f-]{36})' +    _TESTS = [{ +        'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500', +        'md5': '2924e9215c6eff7a55ed35b72276bd93', +        'info_dict': { +            'id': '74849a00-85a9-11e1-9660-123139220831', +            'ext': 'mp4', +            'title': '#whilewewatch', +        } +    }, { +        'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017', +        'only_matching': True, +    }] + +    @staticmethod +    def _extract_url(webpage): +        mobj = re.search( +            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?snagfilms\.com/embed/player.+?)\1', +            webpage) +        if mobj: +            return mobj.group('url') + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        if '>This film is not playable in your area.<' in webpage: +            raise ExtractorError( +                'Film %s is not playable in your area.' % video_id, expected=True) + +        formats = [] +        for source in self._parse_json(js_to_json(self._search_regex( +                r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')), video_id): +            file_ = source.get('file') +            if not file_: +                continue +            type_ = source.get('type') +            format_id = source.get('label') +            ext = determine_ext(file_) +            if any(_ == 'm3u8' for _ in (type_, ext)): +                formats.extend(self._extract_m3u8_formats( +                    file_, video_id, 'mp4', m3u8_id='hls')) +            else: +                bitrate = int_or_none(self._search_regex( +                    r'(\d+)kbps', file_, 'bitrate', default=None)) +                height = int_or_none(self._search_regex( +                    r'^(\d+)[pP]$', format_id, 'height', default=None)) +                formats.append({ +                    'url': file_, +                    'format_id': format_id, +                    'tbr': bitrate, +                    'height': height, +                }) +        self._sort_formats(formats) + +        title = self._search_regex( +            [r"title\s*:\s*'([^']+)'", r'<title>([^<]+)</title>'], +            webpage, 'title') + +        return { +            'id': video_id, +            'title': title, +            'formats': formats, +        } + + +class SnagFilmsIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?snagfilms\.com/(?:films/title|show)/(?P<id>[^?#]+)' +    _TESTS = [{ +        'url': 'http://www.snagfilms.com/films/title/lost_for_life', +        'md5': '19844f897b35af219773fd63bdec2942', +        'info_dict': { +            'id': '0000014c-de2f-d5d6-abcf-ffef58af0017', +            'display_id': 'lost_for_life', +            'ext': 'mp4', +            'title': 'Lost for Life', +            'description': 'md5:fbdacc8bb6b455e464aaf98bc02e1c82', +            'thumbnail': 're:^https?://.*\.jpg', +            'duration': 4489, +            'categories': ['Documentary', 'Crime', 'Award Winning', 'Festivals'] +        } +    }, { +        'url': 'http://www.snagfilms.com/show/the_world_cut_project/india', +        'md5': 'e6292e5b837642bbda82d7f8bf3fbdfd', +        'info_dict': { +            'id': '00000145-d75c-d96e-a9c7-ff5c67b20000', +            'display_id': 'the_world_cut_project/india', +            'ext': 'mp4', +            'title': 'India', +            'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f', +            'thumbnail': 're:^https?://.*\.jpg', +            'duration': 979, +            'categories': ['Documentary', 'Sports', 'Politics'] +        } +    }, { +        # Film is not playable in your area. +        'url': 'http://www.snagfilms.com/films/title/inside_mecca', +        'only_matching': True, +    }, { +        # Film is not available. +        'url': 'http://www.snagfilms.com/show/augie_alone/flirting', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        display_id = self._match_id(url) + +        webpage = self._download_webpage(url, display_id) + +        if ">Sorry, the Film you're looking for is not available.<" in webpage: +            raise ExtractorError( +                'Film %s is not available.' % display_id, expected=True) + +        film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id') + +        snag = self._parse_json( +            self._search_regex( +                'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag'), +            display_id) + +        for item in snag: +            if item.get('data', {}).get('film', {}).get('id') == film_id: +                data = item['data']['film'] +                title = data['title'] +                description = clean_html(data.get('synopsis')) +                thumbnail = data.get('image') +                duration = int_or_none(data.get('duration') or data.get('runtime')) +                categories = [ +                    category['title'] for category in data.get('categories', []) +                    if category.get('title')] +                break +        else: +            title = self._search_regex( +                r'itemprop="title">([^<]+)<', webpage, 'title') +            description = self._html_search_regex( +                r'(?s)<div itemprop="description" class="film-synopsis-inner ">(.+?)</div>', +                webpage, 'description', default=None) or self._og_search_description(webpage) +            thumbnail = self._og_search_thumbnail(webpage) +            duration = parse_duration(self._search_regex( +                r'<span itemprop="duration" class="film-duration strong">([^<]+)<', +                webpage, 'duration', fatal=False)) +            categories = re.findall(r'<a href="/movies/[^"]+">([^<]+)</a>', webpage) + +        return { +            '_type': 'url_transparent', +            'url': 'http://embed.snagfilms.com/embed/player?filmId=%s' % film_id, +            'id': film_id, +            'display_id': display_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'duration': duration, +            'categories': categories, +        } diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index 29bd9ce6f..ba2d5e19b 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -6,9 +6,12 @@ import re  from .common import InfoExtractor  from ..compat import (      compat_str, -    compat_urllib_request +    compat_urllib_request, +    compat_urllib_parse, +) +from ..utils import ( +    ExtractorError,  ) -from ..utils import ExtractorError  class SohuIE(InfoExtractor): @@ -26,7 +29,7 @@ class SohuIE(InfoExtractor):          'skip': 'On available in China',      }, {          'url': 'http://tv.sohu.com/20150305/n409385080.shtml', -        'md5': 'ac9a5d322b4bf9ae184d53e4711e4f1a', +        'md5': '699060e75cf58858dd47fb9c03c42cfb',          'info_dict': {              'id': '409385080',              'ext': 'mp4', @@ -34,7 +37,7 @@ class SohuIE(InfoExtractor):          }      }, {          'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml', -        'md5': '49308ff6dafde5ece51137d04aec311e', +        'md5': '9bf34be48f2f4dadcb226c74127e203c',          'info_dict': {              'id': '78693464',              'ext': 'mp4', @@ -48,7 +51,7 @@ class SohuIE(InfoExtractor):              'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',          },          'playlist': [{ -            'md5': '492923eac023ba2f13ff69617c32754a', +            'md5': 'bdbfb8f39924725e6589c146bc1883ad',              'info_dict': {                  'id': '78910339_part1',                  'ext': 'mp4', @@ -56,7 +59,7 @@ class SohuIE(InfoExtractor):                  'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',              }          }, { -            'md5': 'de604848c0e8e9c4a4dde7e1347c0637', +            'md5': '3e1f46aaeb95354fd10e7fca9fc1804e',              'info_dict': {                  'id': '78910339_part2',                  'ext': 'mp4', @@ -64,7 +67,7 @@ class SohuIE(InfoExtractor):                  'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',              }          }, { -            'md5': '93584716ee0657c0b205b8aa3d27aa13', +            'md5': '8407e634175fdac706766481b9443450',              'info_dict': {                  'id': '78910339_part3',                  'ext': 'mp4', @@ -139,21 +142,42 @@ class SohuIE(InfoExtractor):          for i in range(part_count):              formats = []              for format_id, format_data in formats_json.items(): +                allot = format_data['allot'] +                  data = format_data['data'] +                clips_url = data['clipsURL'] +                su = data['su'] -                # URLs starts with http://newflv.sohu.ccgslb.net/ is not usable -                # so retry until got a working URL                  video_url = 'newflv.sohu.ccgslb.net' +                cdnId = None                  retries = 0 -                while 'newflv.sohu.ccgslb.net' in video_url and retries < 5: -                    download_note = 'Download information from CDN gateway for format ' + format_id + +                while 'newflv.sohu.ccgslb.net' in video_url: +                    params = { +                        'prot': 9, +                        'file': clips_url[i], +                        'new': su[i], +                        'prod': 'flash', +                    } + +                    if cdnId is not None: +                        params['idc'] = cdnId + +                    download_note = 'Downloading %s video URL part %d of %d' % ( +                        format_id, i + 1, part_count) +                      if retries > 0:                          download_note += ' (retry #%d)' % retries +                    part_info = self._parse_json(self._download_webpage( +                        'http://%s/?%s' % (allot, compat_urllib_parse.urlencode(params)), +                        video_id, download_note), video_id) + +                    video_url = part_info['url'] +                    cdnId = part_info.get('nid') +                      retries += 1 -                    cdn_info = self._download_json( -                        'http://data.vod.itc.cn/cdnList?new=' + data['su'][i], -                        video_id, download_note) -                    video_url = cdn_info['url'] +                    if retries > 5: +                        raise ExtractorError('Failed to get video URL')                  formats.append({                      'url': video_url, diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index c23c5ee0f..118ca4832 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -29,7 +29,7 @@ class SoundcloudIE(InfoExtractor):      _VALID_URL = r'''(?x)^(?:https?://)?                      (?:(?:(?:www\.|m\.)?soundcloud\.com/                              (?P<uploader>[\w\d-]+)/ -                            (?!sets/|likes/?(?:$|[?#])) +                            (?!sets/|(?:likes|tracks)/?(?:$|[?#]))                              (?P<title>[\w\d-]+)/?                              (?P<token>[^?]+?)?(?:[?].*)?$)                         |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+) @@ -307,6 +307,9 @@ class SoundcloudUserIE(SoundcloudIE):              'title': 'The Royal Concept',          },          'playlist_mincount': 1, +    }, { +        'url': 'https://soundcloud.com/the-akashic-chronicler/tracks', +        'only_matching': True,      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py index 08a5c4314..27f4033c5 100644 --- a/youtube_dl/extractor/spiegeltv.py +++ b/youtube_dl/extractor/spiegeltv.py @@ -77,11 +77,13 @@ class SpiegeltvIE(InfoExtractor):                      'rtmp_live': True,                  })              elif determine_ext(endpoint) == 'm3u8': -                formats.extend(self._extract_m3u8_formats( +                m3u8_formats = self._extract_m3u8_formats(                      endpoint.replace('[video]', play_path),                      video_id, 'm4v',                      preference=1,  # Prefer hls since it allows to workaround georestriction -                    m3u8_id='hls')) +                    m3u8_id='hls', fatal=False) +                if m3u8_formats is not False: +                    formats.extend(m3u8_formats)              else:                  formats.append({                      'url': endpoint, diff --git a/youtube_dl/extractor/thesixtyone.py b/youtube_dl/extractor/thesixtyone.py index a77c6a2fc..5d09eb9a8 100644 --- a/youtube_dl/extractor/thesixtyone.py +++ b/youtube_dl/extractor/thesixtyone.py @@ -1,9 +1,6 @@  # coding: utf-8  from __future__ import unicode_literals -import json -import re -  from .common import InfoExtractor  from ..utils import unified_strdate @@ -17,7 +14,7 @@ class TheSixtyOneIE(InfoExtractor):              song          )/(?P<id>[A-Za-z0-9]+)/?$'''      _SONG_URL_TEMPLATE = 'http://thesixtyone.com/s/{0:}' -    _SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}.thesixtyone.com/thesixtyone_production/audio/{0:}_stream' +    _SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}/thesixtyone_production/audio/{0:}_stream'      _THUMBNAIL_URL_TEMPLATE = '{photo_base_url:}_desktop'      _TESTS = [          { @@ -70,14 +67,19 @@ class TheSixtyOneIE(InfoExtractor):      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        song_id = mobj.group('id') +        song_id = self._match_id(url)          webpage = self._download_webpage(              self._SONG_URL_TEMPLATE.format(song_id), song_id) -        song_data = json.loads(self._search_regex( -            r'"%s":\s(\{.*?\})' % song_id, webpage, 'song_data')) +        song_data = self._parse_json(self._search_regex( +            r'"%s":\s(\{.*?\})' % song_id, webpage, 'song_data'), song_id) + +        if self._search_regex(r'(t61\.s3_audio_load\s*=\s*1\.0;)', webpage, 's3_audio_load marker', default=None): +            song_data['audio_server'] = 's3.amazonaws.com' +        else: +            song_data['audio_server'] = song_data['audio_server'] + '.thesixtyone.com' +          keys = [self._DECODE_MAP.get(s, s) for s in song_data['key']]          url = self._SONG_FILE_URL_TEMPLATE.format(              "".join(reversed(keys)), **song_data) diff --git a/youtube_dl/extractor/thisamericanlife.py b/youtube_dl/extractor/thisamericanlife.py new file mode 100644 index 000000000..36493a5de --- /dev/null +++ b/youtube_dl/extractor/thisamericanlife.py @@ -0,0 +1,40 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class ThisAmericanLifeIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?thisamericanlife\.org/(?:radio-archives/episode/|play_full\.php\?play=)(?P<id>\d+)' +    _TESTS = [{ +        'url': 'http://www.thisamericanlife.org/radio-archives/episode/487/harper-high-school-part-one', +        'md5': '8f7d2da8926298fdfca2ee37764c11ce', +        'info_dict': { +            'id': '487', +            'ext': 'm4a', +            'title': '487: Harper High School, Part One', +            'description': 'md5:ee40bdf3fb96174a9027f76dbecea655', +            'thumbnail': 're:^https?://.*\.jpg$', +        }, +    }, { +        'url': 'http://www.thisamericanlife.org/play_full.php?play=487', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage( +            'http://www.thisamericanlife.org/radio-archives/episode/%s' % video_id, video_id) + +        return { +            'id': video_id, +            'url': 'http://stream.thisamericanlife.org/{0}/stream/{0}_64k.m3u8'.format(video_id), +            'protocol': 'm3u8_native', +            'ext': 'm4a', +            'acodec': 'aac', +            'vcodec': 'none', +            'abr': 64, +            'title': self._html_search_meta(r'twitter:title', webpage, 'title', fatal=True), +            'description': self._html_search_meta(r'description', webpage, 'description'), +            'thumbnail': self._og_search_thumbnail(webpage), +        } diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index c282865b2..49516abca 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -3,39 +3,70 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from ..compat import compat_str  from ..utils import ( -    parse_duration,      fix_xml_ampersands, +    float_or_none, +    int_or_none, +    parse_duration, +    str_to_int, +    xpath_text,  ) -class TNAFlixIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)' - -    _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>' -    _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>' -    _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"' - -    _TESTS = [ -        { -            'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878', -            'md5': 'ecf3498417d09216374fc5907f9c6ec0', -            'info_dict': { -                'id': '553878', -                'display_id': 'Carmella-Decesare-striptease', -                'ext': 'mp4', -                'title': 'Carmella Decesare - striptease', -                'description': '', -                'thumbnail': 're:https?://.*\.jpg$', -                'duration': 91, -                'age_limit': 18, -            } -        }, -        { -            'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632', -            'only_matching': True, -        } +class TNAFlixNetworkBaseIE(InfoExtractor): +    # May be overridden in descendants if necessary +    _CONFIG_REGEX = [ +        r'flashvars\.config\s*=\s*escape\("([^"]+)"', +        r'<input[^>]+name="config\d?" value="([^"]+)"',      ] +    _TITLE_REGEX = r'<input[^>]+name="title" value="([^"]+)"' +    _DESCRIPTION_REGEX = r'<input[^>]+name="description" value="([^"]+)"' +    _UPLOADER_REGEX = r'<input[^>]+name="username" value="([^"]+)"' +    _VIEW_COUNT_REGEX = None +    _COMMENT_COUNT_REGEX = None +    _AVERAGE_RATING_REGEX = None +    _CATEGORIES_REGEX = r'<li[^>]*>\s*<span[^>]+class="infoTitle"[^>]*>Categories:</span>\s*<span[^>]+class="listView"[^>]*>(.+?)</span>\s*</li>' + +    def _extract_thumbnails(self, flix_xml): + +        def get_child(elem, names): +            for name in names: +                child = elem.find(name) +                if child is not None: +                    return child + +        timeline = get_child(flix_xml, ['timeline', 'rolloverBarImage']) +        if timeline is None: +            return + +        pattern_el = get_child(timeline, ['imagePattern', 'pattern']) +        if pattern_el is None or not pattern_el.text: +            return + +        first_el = get_child(timeline, ['imageFirst', 'first']) +        last_el = get_child(timeline, ['imageLast', 'last']) +        if first_el is None or last_el is None: +            return + +        first_text = first_el.text +        last_text = last_el.text +        if not first_text.isdigit() or not last_text.isdigit(): +            return + +        first = int(first_text) +        last = int(last_text) +        if first > last: +            return + +        width = int_or_none(xpath_text(timeline, './imageWidth', 'thumbnail width')) +        height = int_or_none(xpath_text(timeline, './imageHeight', 'thumbnail height')) + +        return [{ +            'url': self._proto_relative_url(pattern_el.text.replace('#', compat_str(i)), 'http:'), +            'width': width, +            'height': height, +        } for i in range(first, last + 1)]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) @@ -44,47 +75,195 @@ class TNAFlixIE(InfoExtractor):          webpage = self._download_webpage(url, display_id) -        title = self._html_search_regex( -            self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage) -        description = self._html_search_regex( -            self._DESCRIPTION_REGEX, webpage, 'description', fatal=False, default='') - -        age_limit = self._rta_search(webpage) - -        duration = parse_duration(self._html_search_meta( -            'duration', webpage, 'duration', default=None)) -          cfg_url = self._proto_relative_url(self._html_search_regex(              self._CONFIG_REGEX, webpage, 'flashvars.config'), 'http:')          cfg_xml = self._download_xml( -            cfg_url, display_id, note='Downloading metadata', +            cfg_url, display_id, 'Downloading metadata',              transform_source=fix_xml_ampersands) -        thumbnail = self._proto_relative_url( -            cfg_xml.find('./startThumb').text, 'http:') -          formats = [] + +        def extract_video_url(vl): +            return re.sub('speed=\d+', 'speed=', vl.text) + +        video_link = cfg_xml.find('./videoLink') +        if video_link is not None: +            formats.append({ +                'url': extract_video_url(video_link), +                'ext': xpath_text(cfg_xml, './videoConfig/type', 'type', default='flv'), +            }) +          for item in cfg_xml.findall('./quality/item'): -            video_url = re.sub('speed=\d+', 'speed=', item.find('videoLink').text) -            format_id = item.find('res').text -            fmt = { -                'url': self._proto_relative_url(video_url, 'http:'), +            video_link = item.find('./videoLink') +            if video_link is None: +                continue +            res = item.find('res') +            format_id = None if res is None else res.text +            height = int_or_none(self._search_regex( +                r'^(\d+)[pP]', format_id, 'height', default=None)) +            formats.append({ +                'url': self._proto_relative_url(extract_video_url(video_link), 'http:'),                  'format_id': format_id, -            } -            m = re.search(r'^(\d+)', format_id) -            if m: -                fmt['height'] = int(m.group(1)) -            formats.append(fmt) +                'height': height, +            }) +          self._sort_formats(formats) +        thumbnail = self._proto_relative_url( +            xpath_text(cfg_xml, './startThumb', 'thumbnail'), 'http:') +        thumbnails = self._extract_thumbnails(cfg_xml) + +        title = self._html_search_regex( +            self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage) + +        age_limit = self._rta_search(webpage) + +        duration = parse_duration(self._html_search_meta( +            'duration', webpage, 'duration', default=None)) + +        def extract_field(pattern, name): +            return self._html_search_regex(pattern, webpage, name, default=None) if pattern else None + +        description = extract_field(self._DESCRIPTION_REGEX, 'description') +        uploader = extract_field(self._UPLOADER_REGEX, 'uploader') +        view_count = str_to_int(extract_field(self._VIEW_COUNT_REGEX, 'view count')) +        comment_count = str_to_int(extract_field(self._COMMENT_COUNT_REGEX, 'comment count')) +        average_rating = float_or_none(extract_field(self._AVERAGE_RATING_REGEX, 'average rating')) + +        categories_str = extract_field(self._CATEGORIES_REGEX, 'categories') +        categories = categories_str.split(', ') if categories_str is not None else [] +          return {              'id': video_id,              'display_id': display_id,              'title': title,              'description': description,              'thumbnail': thumbnail, +            'thumbnails': thumbnails,              'duration': duration,              'age_limit': age_limit, +            'uploader': uploader, +            'view_count': view_count, +            'comment_count': comment_count, +            'average_rating': average_rating, +            'categories': categories,              'formats': formats,          } + + +class TNAFlixIE(TNAFlixNetworkBaseIE): +    _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)' + +    _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>' +    _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>' +    _UPLOADER_REGEX = r'(?s)<span[^>]+class="infoTitle"[^>]*>Uploaded By:</span>(.+?)<div' + +    _TESTS = [{ +        # anonymous uploader, no categories +        'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878', +        'md5': 'ecf3498417d09216374fc5907f9c6ec0', +        'info_dict': { +            'id': '553878', +            'display_id': 'Carmella-Decesare-striptease', +            'ext': 'mp4', +            'title': 'Carmella Decesare - striptease', +            'thumbnail': 're:https?://.*\.jpg$', +            'duration': 91, +            'age_limit': 18, +            'uploader': 'Anonymous', +            'categories': [], +        } +    }, { +        # non-anonymous uploader, categories +        'url': 'https://www.tnaflix.com/teen-porn/Educational-xxx-video/video6538', +        'md5': '0f5d4d490dbfd117b8607054248a07c0', +        'info_dict': { +            'id': '6538', +            'display_id': 'Educational-xxx-video', +            'ext': 'mp4', +            'title': 'Educational xxx video', +            'description': 'md5:b4fab8f88a8621c8fabd361a173fe5b8', +            'thumbnail': 're:https?://.*\.jpg$', +            'duration': 164, +            'age_limit': 18, +            'uploader': 'bobwhite39', +            'categories': ['Amateur Porn', 'Squirting Videos', 'Teen Girls 18+'], +        } +    }, { +        'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632', +        'only_matching': True, +    }] + + +class EMPFlixIE(TNAFlixNetworkBaseIE): +    _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P<display_id>.+?)-(?P<id>[0-9]+)\.html' + +    _UPLOADER_REGEX = r'<span[^>]+class="infoTitle"[^>]*>Uploaded By:</span>(.+?)</li>' + +    _TESTS = [{ +        'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', +        'md5': 'b1bc15b6412d33902d6e5952035fcabc', +        'info_dict': { +            'id': '33051', +            'display_id': 'Amateur-Finger-Fuck', +            'ext': 'mp4', +            'title': 'Amateur Finger Fuck', +            'description': 'Amateur solo finger fucking.', +            'thumbnail': 're:https?://.*\.jpg$', +            'duration': 83, +            'age_limit': 18, +            'uploader': 'cwbike', +            'categories': ['Amateur', 'Anal', 'Fisting', 'Home made', 'Solo'], +        } +    }, { +        'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html', +        'only_matching': True, +    }] + + +class MovieFapIE(TNAFlixNetworkBaseIE): +    _VALID_URL = r'https?://(?:www\.)?moviefap\.com/videos/(?P<id>[0-9a-f]+)/(?P<display_id>[^/]+)\.html' + +    _VIEW_COUNT_REGEX = r'<br>Views\s*<strong>([\d,.]+)</strong>' +    _COMMENT_COUNT_REGEX = r'<span[^>]+id="comCount"[^>]*>([\d,.]+)</span>' +    _AVERAGE_RATING_REGEX = r'Current Rating\s*<br>\s*<strong>([\d.]+)</strong>' +    _CATEGORIES_REGEX = r'(?s)<div[^>]+id="vid_info"[^>]*>\s*<div[^>]*>.+?</div>(.*?)<br>' + +    _TESTS = [{ +        # normal, multi-format video +        'url': 'http://www.moviefap.com/videos/be9867c9416c19f54a4a/experienced-milf-amazing-handjob.html', +        'md5': '26624b4e2523051b550067d547615906', +        'info_dict': { +            'id': 'be9867c9416c19f54a4a', +            'display_id': 'experienced-milf-amazing-handjob', +            'ext': 'mp4', +            'title': 'Experienced MILF Amazing Handjob', +            'description': 'Experienced MILF giving an Amazing Handjob', +            'thumbnail': 're:https?://.*\.jpg$', +            'age_limit': 18, +            'uploader': 'darvinfred06', +            'view_count': int, +            'comment_count': int, +            'average_rating': float, +            'categories': ['Amateur', 'Masturbation', 'Mature', 'Flashing'], +        } +    }, { +        # quirky single-format case where the extension is given as fid, but the video is really an flv +        'url': 'http://www.moviefap.com/videos/e5da0d3edce5404418f5/jeune-couple-russe.html', +        'md5': 'fa56683e291fc80635907168a743c9ad', +        'info_dict': { +            'id': 'e5da0d3edce5404418f5', +            'display_id': 'jeune-couple-russe', +            'ext': 'flv', +            'title': 'Jeune Couple Russe', +            'description': 'Amateur', +            'thumbnail': 're:https?://.*\.jpg$', +            'age_limit': 18, +            'uploader': 'whiskeyjar', +            'view_count': int, +            'comment_count': int, +            'average_rating': float, +            'categories': ['Amateur', 'Teen'], +        } +    }] diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 63c20310d..9ead13a91 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -5,6 +5,7 @@ import re  from .common import InfoExtractor  from .pornhub import PornHubIE +from .vimeo import VimeoIE  class TumblrIE(InfoExtractor): @@ -40,6 +41,17 @@ class TumblrIE(InfoExtractor):              'timestamp': 1430931613,          },          'add_ie': ['Vidme'], +    }, { +        'url': 'http://camdamage.tumblr.com/post/98846056295/', +        'md5': 'a9e0c8371ea1ca306d6554e3fecf50b6', +        'info_dict': { +            'id': '105463834', +            'ext': 'mp4', +            'title': 'Cam Damage-HD 720p', +            'uploader': 'John Moyer', +            'uploader_id': 'user32021558', +        }, +        'add_ie': ['Vimeo'],      }]      def _real_extract(self, url): @@ -60,6 +72,10 @@ class TumblrIE(InfoExtractor):          if pornhub_url:              return self.url_result(pornhub_url, 'PornHub') +        vimeo_url = VimeoIE._extract_vimeo_url(url, webpage) +        if vimeo_url: +            return self.url_result(vimeo_url, 'Vimeo') +          iframe_url = self._search_regex(              r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'',              webpage, 'iframe url') diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 94bd6345d..b56ee2959 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -189,17 +189,17 @@ class TwitchVodIE(TwitchItemBaseIE):      _ITEM_SHORTCUT = 'v'      _TEST = { -        'url': 'http://www.twitch.tv/ksptv/v/3622000', +        'url': 'http://www.twitch.tv/riotgames/v/6528877',          'info_dict': { -            'id': 'v3622000', +            'id': 'v6528877',              'ext': 'mp4', -            'title': '''KSPTV: Squadcast: "Everyone's on vacation so here's Dahud" Edition!''', +            'title': 'LCK Summer Split - Week 6 Day 1',              'thumbnail': 're:^https?://.*\.jpg$', -            'duration': 6951, -            'timestamp': 1419028564, -            'upload_date': '20141219', -            'uploader': 'KSPTV', -            'uploader_id': 'ksptv', +            'duration': 17208, +            'timestamp': 1435131709, +            'upload_date': '20150624', +            'uploader': 'Riot Games', +            'uploader_id': 'riotgames',              'view_count': int,          },          'params': { @@ -215,7 +215,7 @@ class TwitchVodIE(TwitchItemBaseIE):              '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,              'Downloading %s access token' % self._ITEM_TYPE)          formats = self._extract_m3u8_formats( -            '%s/vod/%s?nauth=%s&nauthsig=%s' +            '%s/vod/%s?nauth=%s&nauthsig=%s&allow_source=true'              % (self._USHER_BASE, item_id, access_token['token'], access_token['sig']),              item_id, 'mp4')          self._prefer_source(formats) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py new file mode 100644 index 000000000..1aaa06305 --- /dev/null +++ b/youtube_dl/extractor/twitter.py @@ -0,0 +1,72 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_urllib_request +from ..utils import ( +    float_or_none, +    unescapeHTML, +) + + +class TwitterCardIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)' +    _TEST = { +        'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', +        'md5': 'a74f50b310c83170319ba16de6955192', +        'info_dict': { +            'id': '560070183650213889', +            'ext': 'mp4', +            'title': 'TwitterCard', +            'thumbnail': 're:^https?://.*\.jpg$', +            'duration': 30.033, +        }, +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        # Different formats served for different User-Agents +        USER_AGENTS = [ +            'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',  # mp4 +            'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',  # webm +        ] + +        config = None +        formats = [] +        for user_agent in USER_AGENTS: +            request = compat_urllib_request.Request(url) +            request.add_header('User-Agent', user_agent) +            webpage = self._download_webpage(request, video_id) + +            config = self._parse_json( +                unescapeHTML(self._search_regex( +                    r'data-player-config="([^"]+)"', webpage, 'data player config')), +                video_id) + +            video_url = config['playlist'][0]['source'] + +            f = { +                'url': video_url, +            } + +            m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url) +            if m: +                f.update({ +                    'width': int(m.group('width')), +                    'height': int(m.group('height')), +                }) +            formats.append(f) +        self._sort_formats(formats) + +        thumbnail = config.get('posterImageUrl') +        duration = float_or_none(config.get('duration')) + +        return { +            'id': video_id, +            'title': 'TwitterCard', +            'thumbnail': thumbnail, +            'duration': duration, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 7f2fb1ca8..51cdc6b65 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -1,5 +1,7 @@ +# coding: utf-8  from __future__ import unicode_literals +import json  import time  import hmac  import hashlib @@ -11,6 +13,7 @@ from ..utils import (      parse_age_limit,      parse_iso8601,  ) +from ..compat import compat_urllib_request  from .common import InfoExtractor @@ -23,27 +26,35 @@ class VikiBaseIE(InfoExtractor):      _APP_VERSION = '2.2.5.1428709186'      _APP_SECRET = '-$iJ}@p7!G@SyU/je1bEyWg}upLu-6V6-Lg9VD(]siH,r.,m-r|ulZ,U4LC/SeR)' -    def _prepare_call(self, path, timestamp=None): +    _NETRC_MACHINE = 'viki' + +    _token = None + +    def _prepare_call(self, path, timestamp=None, post_data=None):          path += '?' if '?' not in path else '&'          if not timestamp:              timestamp = int(time.time())          query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp) +        if self._token: +            query += '&token=%s' % self._token          sig = hmac.new(              self._APP_SECRET.encode('ascii'),              query.encode('ascii'),              hashlib.sha1          ).hexdigest() -        return self._API_URL_TEMPLATE % (query, sig) +        url = self._API_URL_TEMPLATE % (query, sig) +        return compat_urllib_request.Request( +            url, json.dumps(post_data).encode('utf-8')) if post_data else url -    def _call_api(self, path, video_id, note, timestamp=None): +    def _call_api(self, path, video_id, note, timestamp=None, post_data=None):          resp = self._download_json( -            self._prepare_call(path, timestamp), video_id, note) +            self._prepare_call(path, timestamp, post_data), video_id, note)          error = resp.get('error')          if error:              if error == 'invalid timestamp':                  resp = self._download_json( -                    self._prepare_call(path, int(resp['current_timestamp'])), +                    self._prepare_call(path, int(resp['current_timestamp']), post_data),                      video_id, '%s (retry)' % note)                  error = resp.get('error')              if error: @@ -56,6 +67,27 @@ class VikiBaseIE(InfoExtractor):              '%s returned error: %s' % (self.IE_NAME, error),              expected=True) +    def _real_initialize(self): +        self._login() + +    def _login(self): +        (username, password) = self._get_login_info() +        if username is None: +            return + +        login_form = { +            'login_id': username, +            'password': password, +        } + +        login = self._call_api( +            'sessions.json', None, +            'Logging in as %s' % username, post_data=login_form) + +        self._token = login.get('token') +        if not self._token: +            self.report_warning('Unable to get session token, login has probably failed') +  class VikiIE(VikiBaseIE):      IE_NAME = 'viki' diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index f300c7ca4..cae90205d 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -22,6 +22,7 @@ from ..utils import (      unified_strdate,      unsmuggle_url,      urlencode_postdata, +    unescapeHTML,  ) @@ -173,6 +174,21 @@ class VimeoIE(VimeoBaseInfoExtractor):          },      ] +    @staticmethod +    def _extract_vimeo_url(url, webpage): +        # Look for embedded (iframe) Vimeo player +        mobj = re.search( +            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage) +        if mobj: +            player_url = unescapeHTML(mobj.group('url')) +            surl = smuggle_url(player_url, {'Referer': url}) +            return surl +        # Look for embedded (swf embed) Vimeo player +        mobj = re.search( +            r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage) +        if mobj: +            return mobj.group(1) +      def _verify_video_password(self, url, video_id, webpage):          password = self._downloader.params.get('videopassword', None)          if password is None: diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 38ff3c1a9..f2ae109f9 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -121,20 +121,27 @@ class VKIE(InfoExtractor):          if username is None:              return -        login_form = { -            'act': 'login', -            'role': 'al_frame', -            'expire': '1', +        login_page = self._download_webpage( +            'https://vk.com', None, 'Downloading login page') + +        login_form = dict(re.findall( +            r'<input\s+type="hidden"\s+name="([^"]+)"\s+(?:id="[^"]+"\s+)?value="([^"]*)"', +            login_page)) + +        login_form.update({              'email': username.encode('cp1251'),              'pass': password.encode('cp1251'), -        } +        }) -        request = compat_urllib_request.Request('https://login.vk.com/?act=login', -                                                compat_urllib_parse.urlencode(login_form).encode('utf-8')) -        login_page = self._download_webpage(request, None, note='Logging in as %s' % username) +        request = compat_urllib_request.Request( +            'https://login.vk.com/?act=login', +            compat_urllib_parse.urlencode(login_form).encode('utf-8')) +        login_page = self._download_webpage( +            request, None, note='Logging in as %s' % username)          if re.search(r'onLoginFailed', login_page): -            raise ExtractorError('Unable to login, incorrect username and/or password', expected=True) +            raise ExtractorError( +                'Unable to login, incorrect username and/or password', expected=True)      def _real_initialize(self):          self._login() diff --git a/youtube_dl/extractor/vube.py b/youtube_dl/extractor/vube.py index 405cb9db4..149e36467 100644 --- a/youtube_dl/extractor/vube.py +++ b/youtube_dl/extractor/vube.py @@ -36,6 +36,7 @@ class VubeIE(InfoExtractor):                  'comment_count': int,                  'categories': ['amazing', 'hd', 'best drummer ever', 'william wei', 'bucket drumming', 'street drummer', 'epic street drumming'],              }, +            'skip': 'Not accessible from Travis CI server',          }, {              'url': 'http://vube.com/Chiara+Grispo+Video+Channel/YL2qNPkqon',              'md5': 'db7aba89d4603dadd627e9d1973946fe', diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 4527567f8..b4ad513a0 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -13,7 +13,6 @@ from ..utils import (  class XHamsterIE(InfoExtractor): -    """Information Extractor for xHamster"""      _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'      _TESTS = [          { @@ -133,3 +132,36 @@ class XHamsterIE(InfoExtractor):              'age_limit': age_limit,              'formats': formats,          } + + +class XHamsterEmbedIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?xhamster\.com/xembed\.php\?video=(?P<id>\d+)' +    _TEST = { +        'url': 'http://xhamster.com/xembed.php?video=3328539', +        'info_dict': { +            'id': '3328539', +            'ext': 'mp4', +            'title': 'Pen Masturbation', +            'upload_date': '20140728', +            'uploader_id': 'anonymous', +            'duration': 5, +            'age_limit': 18, +        } +    } + +    @staticmethod +    def _extract_urls(webpage): +        return [url for _, url in re.findall( +            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1', +            webpage)] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        video_url = self._search_regex( +            r'href="(https?://xhamster\.com/movies/%s/[^"]+\.html[^"]*)"' % video_id, +            webpage, 'xhamster url') + +        return self.url_result(video_url, 'XHamster') diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 2a45dc574..d8415bed4 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -5,10 +5,12 @@ import re  from .common import InfoExtractor  from ..compat import (      compat_urllib_parse, +    compat_urllib_request,  )  from ..utils import (      clean_html,      ExtractorError, +    determine_ext,  ) @@ -25,6 +27,8 @@ class XVideosIE(InfoExtractor):          }      } +    _ANDROID_USER_AGENT = 'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19' +      def _real_extract(self, url):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) @@ -40,9 +44,30 @@ class XVideosIE(InfoExtractor):          video_thumbnail = self._search_regex(              r'url_bigthumb=(.+?)&', webpage, 'thumbnail', fatal=False) +        formats = [{ +            'url': video_url, +        }] + +        android_req = compat_urllib_request.Request(url) +        android_req.add_header('User-Agent', self._ANDROID_USER_AGENT) +        android_webpage = self._download_webpage(android_req, video_id, fatal=False) + +        if android_webpage is not None: +            player_params_str = self._search_regex( +                'mobileReplacePlayerDivTwoQual\(([^)]+)\)', +                android_webpage, 'player parameters', default='') +            player_params = list(map(lambda s: s.strip(' \''), player_params_str.split(','))) +            if player_params: +                formats.extend([{ +                    'url': param, +                    'preference': -10, +                } for param in player_params if determine_ext(param) == 'mp4']) + +        self._sort_formats(formats) +          return {              'id': video_id, -            'url': video_url, +            'formats': formats,              'title': video_title,              'ext': 'flv',              'thumbnail': video_thumbnail, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9e2671192..6769a009d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -29,9 +29,11 @@ from ..utils import (      get_element_by_id,      int_or_none,      orderedSet, +    str_to_int,      unescapeHTML,      unified_strdate,      uppercase_escape, +    ISO3166Utils,  ) @@ -234,6 +236,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          '44': {'ext': 'webm', 'width': 854, 'height': 480},          '45': {'ext': 'webm', 'width': 1280, 'height': 720},          '46': {'ext': 'webm', 'width': 1920, 'height': 1080}, +        '59': {'ext': 'mp4', 'width': 854, 'height': 480}, +        '78': {'ext': 'mp4', 'width': 854, 'height': 480},          # 3d videos @@ -516,6 +520,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'skip_download': 'requires avconv',              }          }, +        # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097) +        { +            'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', +            'info_dict': { +                'id': 'FIl7x6_3R5Y', +                'ext': 'mp4', +                'title': 'md5:7b81415841e02ecd4313668cde88737a', +                'description': 'md5:116377fd2963b81ec4ce64b542173306', +                'upload_date': '20150625', +                'uploader_id': 'dorappi2000', +                'uploader': 'dorappi2000', +                'formats': 'mincount:33', +            }, +        }      ]      def __init__(self, *args, **kwargs): @@ -822,6 +840,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                      except StopIteration:                          full_info = self._formats.get(format_id, {}).copy()                          full_info.update(f) +                        codecs = r.attrib.get('codecs') +                        if codecs: +                            if full_info.get('acodec') == 'none' and 'vcodec' not in full_info: +                                full_info['vcodec'] = codecs +                            elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info: +                                full_info['acodec'] = codecs                          formats.append(full_info)                      else:                          existing_format.update(f) @@ -851,6 +875,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          else:              player_url = None +        dash_mpds = [] + +        def add_dash_mpd(video_info): +            dash_mpd = video_info.get('dashmpd') +            if dash_mpd and dash_mpd[0] not in dash_mpds: +                dash_mpds.append(dash_mpd[0]) +          # Get video info          embed_webpage = None          if re.search(r'player-age-gate-content">', video_webpage) is not None: @@ -871,24 +902,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  note='Refetching age-gated info webpage',                  errnote='unable to download video info webpage')              video_info = compat_parse_qs(video_info_webpage) +            add_dash_mpd(video_info)          else:              age_gate = False -            try: -                # Try looking directly into the video webpage -                mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage) -                if not mobj: -                    raise ValueError('Could not find ytplayer.config')  # caught below +            video_info = None +            # Try looking directly into the video webpage +            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage) +            if mobj:                  json_code = uppercase_escape(mobj.group(1))                  ytplayer_config = json.loads(json_code)                  args = ytplayer_config['args'] -                # Convert to the same format returned by compat_parse_qs -                video_info = dict((k, [v]) for k, v in args.items()) -                if not args.get('url_encoded_fmt_stream_map'): -                    raise ValueError('No stream_map present')  # caught below -            except ValueError: -                # We fallback to the get_video_info pages (used by the embed page) +                if args.get('url_encoded_fmt_stream_map'): +                    # Convert to the same format returned by compat_parse_qs +                    video_info = dict((k, [v]) for k, v in args.items()) +                    add_dash_mpd(video_info) +            if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): +                # We also try looking in get_video_info since it may contain different dashmpd +                # URL that points to a DASH manifest with possibly different itag set (some itags +                # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH +                # manifest pointed by get_video_info's dashmpd). +                # The general idea is to take a union of itags of both DASH manifests (for example +                # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)                  self.report_video_info_webpage_download(video_id) -                for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: +                for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:                      video_info_url = (                          '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'                          % (proto, video_id, el_type)) @@ -896,11 +932,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                          video_info_url,                          video_id, note=False,                          errnote='unable to download video info webpage') -                    video_info = compat_parse_qs(video_info_webpage) -                    if 'token' in video_info: +                    get_video_info = compat_parse_qs(video_info_webpage) +                    add_dash_mpd(get_video_info) +                    if not video_info: +                        video_info = get_video_info +                    if 'token' in get_video_info:                          break          if 'token' not in video_info:              if 'reason' in video_info: +                if 'The uploader has not made this video available in your country.' in video_info['reason']: +                    regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None) +                    if regions_allowed is not None: +                        raise ExtractorError('YouTube said: This video is available in %s only' % ( +                            ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))), +                            expected=True)                  raise ExtractorError(                      'YouTube said: %s' % video_info['reason'][0],                      expected=True, video_id=video_id) @@ -954,15 +999,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])          # upload date -        upload_date = None -        mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage) -        if mobj is None: -            mobj = re.search( -                r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>', -                video_webpage) -        if mobj is not None: -            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) -            upload_date = unified_strdate(upload_date) +        upload_date = self._html_search_meta( +            'datePublished', video_webpage, 'upload date', default=None) +        if not upload_date: +            upload_date = self._search_regex( +                [r'(?s)id="eow-date.*?>(.*?)</span>', +                 r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'], +                video_webpage, 'upload date', default=None) +            if upload_date: +                upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) +        upload_date = unified_strdate(upload_date)          m_cat_container = self._search_regex(              r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', @@ -996,12 +1042,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  video_description = ''          def _extract_count(count_name): -            count = self._search_regex( -                r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name), -                video_webpage, count_name, default=None) -            if count is not None: -                return int(count.replace(',', '')) -            return None +            return str_to_int(self._search_regex( +                r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' +                % re.escape(count_name), +                video_webpage, count_name, default=None)) +          like_count = _extract_count('like')          dislike_count = _extract_count('dislike') @@ -1116,24 +1161,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          # Look for the DASH manifest          if self._downloader.params.get('youtube_include_dash_manifest', True): -            dash_mpd = video_info.get('dashmpd') -            if dash_mpd: -                dash_manifest_url = dash_mpd[0] +            for dash_manifest_url in dash_mpds: +                dash_formats = {}                  try: -                    dash_formats = self._parse_dash_manifest( -                        video_id, dash_manifest_url, player_url, age_gate) +                    for df in self._parse_dash_manifest( +                            video_id, dash_manifest_url, player_url, age_gate): +                        # Do not overwrite DASH format found in some previous DASH manifest +                        if df['format_id'] not in dash_formats: +                            dash_formats[df['format_id']] = df                  except (ExtractorError, KeyError) as e:                      self.report_warning(                          'Skipping DASH manifest: %r' % e, video_id) -                else: +                if dash_formats:                      # Remove the formats we found through non-DASH, they                      # contain less info and it can be wrong, because we use                      # fixed values (for example the resolution). See                      # https://github.com/rg3/youtube-dl/issues/5774 for an                      # example. -                    dash_keys = set(df['format_id'] for df in dash_formats) -                    formats = [f for f in formats if f['format_id'] not in dash_keys] -                    formats.extend(dash_formats) +                    formats = [f for f in formats if f['format_id'] not in dash_formats.keys()] +                    formats.extend(dash_formats.values())          # Check for malformed aspect ratio          stretched_m = re.search( diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 740458e51..4762e1e3c 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -151,6 +151,10 @@ def parseOpts(overrideArguments=None):          action='store_true', dest='list_extractor_descriptions', default=False,          help='Output descriptions of all supported extractors')      general.add_option( +        '--force-generic-extractor', +        action='store_true', dest='force_generic_extractor', default=False, +        help='Force extraction to use the generic extractor') +    general.add_option(          '--default-search',          dest='default_search', metavar='PREFIX',          help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". Use the value "auto" to let youtube-dl guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The default value "fixup_error" repairs broken URLs, but emits an error if this is not possible instead of searching.') @@ -342,12 +346,13 @@ def parseOpts(overrideArguments=None):      video_format.add_option(          '--youtube-skip-dash-manifest',          action='store_false', dest='youtube_include_dash_manifest', -        help='Do not download the DASH manifest on YouTube videos') +        help='Do not download the DASH manifests and related data on YouTube videos')      video_format.add_option(          '--merge-output-format',          action='store', dest='merge_output_format', metavar='FORMAT', default=None,          help=( -            'If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, webm, flv.' +            'If a merge is required (e.g. bestvideo+bestaudio), ' +            'output to given container format. One of mkv, mp4, ogg, webm, flv. '              'Ignored if no merge is required'))      subtitles = optparse.OptionGroup(parser, 'Subtitle Options') diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py index 774494efd..e19dbf73d 100644 --- a/youtube_dl/postprocessor/embedthumbnail.py +++ b/youtube_dl/postprocessor/embedthumbnail.py @@ -35,6 +35,11 @@ class EmbedThumbnailPP(FFmpegPostProcessor):          thumbnail_filename = info['thumbnails'][-1]['filename'] +        if not os.path.exists(encodeFilename(thumbnail_filename)): +            self._downloader.report_warning( +                'Skipping embedding the thumbnail because the file is missing.') +            return [], info +          if info['ext'] == 'mp3':              options = [                  '-c', 'copy', '-map', '0', '-map', '1', diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index cc65b34e7..fe7e0a8ee 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -21,6 +21,7 @@ from ..utils import (      shell_quote,      subtitles_filename,      dfxp2srt, +    ISO639Utils,  ) @@ -307,199 +308,6 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor):  class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): -    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt -    _lang_map = { -        'aa': 'aar', -        'ab': 'abk', -        'ae': 'ave', -        'af': 'afr', -        'ak': 'aka', -        'am': 'amh', -        'an': 'arg', -        'ar': 'ara', -        'as': 'asm', -        'av': 'ava', -        'ay': 'aym', -        'az': 'aze', -        'ba': 'bak', -        'be': 'bel', -        'bg': 'bul', -        'bh': 'bih', -        'bi': 'bis', -        'bm': 'bam', -        'bn': 'ben', -        'bo': 'bod', -        'br': 'bre', -        'bs': 'bos', -        'ca': 'cat', -        'ce': 'che', -        'ch': 'cha', -        'co': 'cos', -        'cr': 'cre', -        'cs': 'ces', -        'cu': 'chu', -        'cv': 'chv', -        'cy': 'cym', -        'da': 'dan', -        'de': 'deu', -        'dv': 'div', -        'dz': 'dzo', -        'ee': 'ewe', -        'el': 'ell', -        'en': 'eng', -        'eo': 'epo', -        'es': 'spa', -        'et': 'est', -        'eu': 'eus', -        'fa': 'fas', -        'ff': 'ful', -        'fi': 'fin', -        'fj': 'fij', -        'fo': 'fao', -        'fr': 'fra', -        'fy': 'fry', -        'ga': 'gle', -        'gd': 'gla', -        'gl': 'glg', -        'gn': 'grn', -        'gu': 'guj', -        'gv': 'glv', -        'ha': 'hau', -        'he': 'heb', -        'hi': 'hin', -        'ho': 'hmo', -        'hr': 'hrv', -        'ht': 'hat', -        'hu': 'hun', -        'hy': 'hye', -        'hz': 'her', -        'ia': 'ina', -        'id': 'ind', -        'ie': 'ile', -        'ig': 'ibo', -        'ii': 'iii', -        'ik': 'ipk', -        'io': 'ido', -        'is': 'isl', -        'it': 'ita', -        'iu': 'iku', -        'ja': 'jpn', -        'jv': 'jav', -        'ka': 'kat', -        'kg': 'kon', -        'ki': 'kik', -        'kj': 'kua', -        'kk': 'kaz', -        'kl': 'kal', -        'km': 'khm', -        'kn': 'kan', -        'ko': 'kor', -        'kr': 'kau', -        'ks': 'kas', -        'ku': 'kur', -        'kv': 'kom', -        'kw': 'cor', -        'ky': 'kir', -        'la': 'lat', -        'lb': 'ltz', -        'lg': 'lug', -        'li': 'lim', -        'ln': 'lin', -        'lo': 'lao', -        'lt': 'lit', -        'lu': 'lub', -        'lv': 'lav', -        'mg': 'mlg', -        'mh': 'mah', -        'mi': 'mri', -        'mk': 'mkd', -        'ml': 'mal', -        'mn': 'mon', -        'mr': 'mar', -        'ms': 'msa', -        'mt': 'mlt', -        'my': 'mya', -        'na': 'nau', -        'nb': 'nob', -        'nd': 'nde', -        'ne': 'nep', -        'ng': 'ndo', -        'nl': 'nld', -        'nn': 'nno', -        'no': 'nor', -        'nr': 'nbl', -        'nv': 'nav', -        'ny': 'nya', -        'oc': 'oci', -        'oj': 'oji', -        'om': 'orm', -        'or': 'ori', -        'os': 'oss', -        'pa': 'pan', -        'pi': 'pli', -        'pl': 'pol', -        'ps': 'pus', -        'pt': 'por', -        'qu': 'que', -        'rm': 'roh', -        'rn': 'run', -        'ro': 'ron', -        'ru': 'rus', -        'rw': 'kin', -        'sa': 'san', -        'sc': 'srd', -        'sd': 'snd', -        'se': 'sme', -        'sg': 'sag', -        'si': 'sin', -        'sk': 'slk', -        'sl': 'slv', -        'sm': 'smo', -        'sn': 'sna', -        'so': 'som', -        'sq': 'sqi', -        'sr': 'srp', -        'ss': 'ssw', -        'st': 'sot', -        'su': 'sun', -        'sv': 'swe', -        'sw': 'swa', -        'ta': 'tam', -        'te': 'tel', -        'tg': 'tgk', -        'th': 'tha', -        'ti': 'tir', -        'tk': 'tuk', -        'tl': 'tgl', -        'tn': 'tsn', -        'to': 'ton', -        'tr': 'tur', -        'ts': 'tso', -        'tt': 'tat', -        'tw': 'twi', -        'ty': 'tah', -        'ug': 'uig', -        'uk': 'ukr', -        'ur': 'urd', -        'uz': 'uzb', -        've': 'ven', -        'vi': 'vie', -        'vo': 'vol', -        'wa': 'wln', -        'wo': 'wol', -        'xh': 'xho', -        'yi': 'yid', -        'yo': 'yor', -        'za': 'zha', -        'zh': 'zho', -        'zu': 'zul', -    } - -    @classmethod -    def _conver_lang_code(cls, code): -        """Convert language code from ISO 639-1 to ISO 639-2/T""" -        return cls._lang_map.get(code[:2]) -      def run(self, information):          if information['ext'] not in ['mp4', 'mkv']:              self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 or mkv files') @@ -525,7 +333,7 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):              opts += ['-c:s', 'mov_text']          for (i, lang) in enumerate(sub_langs):              opts.extend(['-map', '%d:0' % (i + 1)]) -            lang_code = self._conver_lang_code(lang) +            lang_code = ISO639Utils.short2long(lang)              if lang_code is not None:                  opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code]) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 52d198fa3..942f76d24 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -62,6 +62,8 @@ std_headers = {  } +NO_DEFAULT = object() +  ENGLISH_MONTH_NAMES = [      'January', 'February', 'March', 'April', 'May', 'June',      'July', 'August', 'September', 'October', 'November', 'December'] @@ -171,13 +173,15 @@ def xpath_with_ns(path, ns_map):      return '/'.join(replaced) -def xpath_text(node, xpath, name=None, fatal=False): +def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):      if sys.version_info < (2, 7):  # Crazy 2.6          xpath = xpath.encode('ascii')      n = node.find(xpath)      if n is None or n.text is None: -        if fatal: +        if default is not NO_DEFAULT: +            return default +        elif fatal:              name = xpath if name is None else name              raise ExtractorError('Could not find XML element %s' % name)          else: @@ -1841,7 +1845,10 @@ def srt_subtitles_timecode(seconds):  def dfxp2srt(dfxp_data): -    _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'}) +    _x = functools.partial(xpath_with_ns, ns_map={ +        'ttml': 'http://www.w3.org/ns/ttml', +        'ttaf1': 'http://www.w3.org/2006/10/ttaf1', +    })      def parse_node(node):          str_or_empty = functools.partial(str_or_none, default='') @@ -1849,9 +1856,9 @@ def dfxp2srt(dfxp_data):          out = str_or_empty(node.text)          for child in node: -            if child.tag in (_x('ttml:br'), 'br'): +            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):                  out += '\n' + str_or_empty(child.tail) -            elif child.tag in (_x('ttml:span'), 'span'): +            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):                  out += str_or_empty(parse_node(child))              else:                  out += str_or_empty(xml.etree.ElementTree.tostring(child)) @@ -1860,7 +1867,7 @@ def dfxp2srt(dfxp_data):      dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))      out = [] -    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p') +    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')      if not paras:          raise ValueError('Invalid dfxp/TTML subtitle') @@ -1879,6 +1886,468 @@ def dfxp2srt(dfxp_data):      return ''.join(out) +class ISO639Utils(object): +    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt +    _lang_map = { +        'aa': 'aar', +        'ab': 'abk', +        'ae': 'ave', +        'af': 'afr', +        'ak': 'aka', +        'am': 'amh', +        'an': 'arg', +        'ar': 'ara', +        'as': 'asm', +        'av': 'ava', +        'ay': 'aym', +        'az': 'aze', +        'ba': 'bak', +        'be': 'bel', +        'bg': 'bul', +        'bh': 'bih', +        'bi': 'bis', +        'bm': 'bam', +        'bn': 'ben', +        'bo': 'bod', +        'br': 'bre', +        'bs': 'bos', +        'ca': 'cat', +        'ce': 'che', +        'ch': 'cha', +        'co': 'cos', +        'cr': 'cre', +        'cs': 'ces', +        'cu': 'chu', +        'cv': 'chv', +        'cy': 'cym', +        'da': 'dan', +        'de': 'deu', +        'dv': 'div', +        'dz': 'dzo', +        'ee': 'ewe', +        'el': 'ell', +        'en': 'eng', +        'eo': 'epo', +        'es': 'spa', +        'et': 'est', +        'eu': 'eus', +        'fa': 'fas', +        'ff': 'ful', +        'fi': 'fin', +        'fj': 'fij', +        'fo': 'fao', +        'fr': 'fra', +        'fy': 'fry', +        'ga': 'gle', +        'gd': 'gla', +        'gl': 'glg', +        'gn': 'grn', +        'gu': 'guj', +        'gv': 'glv', +        'ha': 'hau', +        'he': 'heb', +        'hi': 'hin', +        'ho': 'hmo', +        'hr': 'hrv', +        'ht': 'hat', +        'hu': 'hun', +        'hy': 'hye', +        'hz': 'her', +        'ia': 'ina', +        'id': 'ind', +        'ie': 'ile', +        'ig': 'ibo', +        'ii': 'iii', +        'ik': 'ipk', +        'io': 'ido', +        'is': 'isl', +        'it': 'ita', +        'iu': 'iku', +        'ja': 'jpn', +        'jv': 'jav', +        'ka': 'kat', +        'kg': 'kon', +        'ki': 'kik', +        'kj': 'kua', +        'kk': 'kaz', +        'kl': 'kal', +        'km': 'khm', +        'kn': 'kan', +        'ko': 'kor', +        'kr': 'kau', +        'ks': 'kas', +        'ku': 'kur', +        'kv': 'kom', +        'kw': 'cor', +        'ky': 'kir', +        'la': 'lat', +        'lb': 'ltz', +        'lg': 'lug', +        'li': 'lim', +        'ln': 'lin', +        'lo': 'lao', +        'lt': 'lit', +        'lu': 'lub', +        'lv': 'lav', +        'mg': 'mlg', +        'mh': 'mah', +        'mi': 'mri', +        'mk': 'mkd', +        'ml': 'mal', +        'mn': 'mon', +        'mr': 'mar', +        'ms': 'msa', +        'mt': 'mlt', +        'my': 'mya', +        'na': 'nau', +        'nb': 'nob', +        'nd': 'nde', +        'ne': 'nep', +        'ng': 'ndo', +        'nl': 'nld', +        'nn': 'nno', +        'no': 'nor', +        'nr': 'nbl', +        'nv': 'nav', +        'ny': 'nya', +        'oc': 'oci', +        'oj': 'oji', +        'om': 'orm', +        'or': 'ori', +        'os': 'oss', +        'pa': 'pan', +        'pi': 'pli', +        'pl': 'pol', +        'ps': 'pus', +        'pt': 'por', +        'qu': 'que', +        'rm': 'roh', +        'rn': 'run', +        'ro': 'ron', +        'ru': 'rus', +        'rw': 'kin', +        'sa': 'san', +        'sc': 'srd', +        'sd': 'snd', +        'se': 'sme', +        'sg': 'sag', +        'si': 'sin', +        'sk': 'slk', +        'sl': 'slv', +        'sm': 'smo', +        'sn': 'sna', +        'so': 'som', +        'sq': 'sqi', +        'sr': 'srp', +        'ss': 'ssw', +        'st': 'sot', +        'su': 'sun', +        'sv': 'swe', +        'sw': 'swa', +        'ta': 'tam', +        'te': 'tel', +        'tg': 'tgk', +        'th': 'tha', +        'ti': 'tir', +        'tk': 'tuk', +        'tl': 'tgl', +        'tn': 'tsn', +        'to': 'ton', +        'tr': 'tur', +        'ts': 'tso', +        'tt': 'tat', +        'tw': 'twi', +        'ty': 'tah', +        'ug': 'uig', +        'uk': 'ukr', +        'ur': 'urd', +        'uz': 'uzb', +        've': 'ven', +        'vi': 'vie', +        'vo': 'vol', +        'wa': 'wln', +        'wo': 'wol', +        'xh': 'xho', +        'yi': 'yid', +        'yo': 'yor', +        'za': 'zha', +        'zh': 'zho', +        'zu': 'zul', +    } + +    @classmethod +    def short2long(cls, code): +        """Convert language code from ISO 639-1 to ISO 639-2/T""" +        return cls._lang_map.get(code[:2]) + +    @classmethod +    def long2short(cls, code): +        """Convert language code from ISO 639-2/T to ISO 639-1""" +        for short_name, long_name in cls._lang_map.items(): +            if long_name == code: +                return short_name + + +class ISO3166Utils(object): +    # From http://data.okfn.org/data/core/country-list +    _country_map = { +        'AF': 'Afghanistan', +        'AX': 'Åland Islands', +        'AL': 'Albania', +        'DZ': 'Algeria', +        'AS': 'American Samoa', +        'AD': 'Andorra', +        'AO': 'Angola', +        'AI': 'Anguilla', +        'AQ': 'Antarctica', +        'AG': 'Antigua and Barbuda', +        'AR': 'Argentina', +        'AM': 'Armenia', +        'AW': 'Aruba', +        'AU': 'Australia', +        'AT': 'Austria', +        'AZ': 'Azerbaijan', +        'BS': 'Bahamas', +        'BH': 'Bahrain', +        'BD': 'Bangladesh', +        'BB': 'Barbados', +        'BY': 'Belarus', +        'BE': 'Belgium', +        'BZ': 'Belize', +        'BJ': 'Benin', +        'BM': 'Bermuda', +        'BT': 'Bhutan', +        'BO': 'Bolivia, Plurinational State of', +        'BQ': 'Bonaire, Sint Eustatius and Saba', +        'BA': 'Bosnia and Herzegovina', +        'BW': 'Botswana', +        'BV': 'Bouvet Island', +        'BR': 'Brazil', +        'IO': 'British Indian Ocean Territory', +        'BN': 'Brunei Darussalam', +        'BG': 'Bulgaria', +        'BF': 'Burkina Faso', +        'BI': 'Burundi', +        'KH': 'Cambodia', +        'CM': 'Cameroon', +        'CA': 'Canada', +        'CV': 'Cape Verde', +        'KY': 'Cayman Islands', +        'CF': 'Central African Republic', +        'TD': 'Chad', +        'CL': 'Chile', +        'CN': 'China', +        'CX': 'Christmas Island', +        'CC': 'Cocos (Keeling) Islands', +        'CO': 'Colombia', +        'KM': 'Comoros', +        'CG': 'Congo', +        'CD': 'Congo, the Democratic Republic of the', +        'CK': 'Cook Islands', +        'CR': 'Costa Rica', +        'CI': 'Côte d\'Ivoire', +        'HR': 'Croatia', +        'CU': 'Cuba', +        'CW': 'Curaçao', +        'CY': 'Cyprus', +        'CZ': 'Czech Republic', +        'DK': 'Denmark', +        'DJ': 'Djibouti', +        'DM': 'Dominica', +        'DO': 'Dominican Republic', +        'EC': 'Ecuador', +        'EG': 'Egypt', +        'SV': 'El Salvador', +        'GQ': 'Equatorial Guinea', +        'ER': 'Eritrea', +        'EE': 'Estonia', +        'ET': 'Ethiopia', +        'FK': 'Falkland Islands (Malvinas)', +        'FO': 'Faroe Islands', +        'FJ': 'Fiji', +        'FI': 'Finland', +        'FR': 'France', +        'GF': 'French Guiana', +        'PF': 'French Polynesia', +        'TF': 'French Southern Territories', +        'GA': 'Gabon', +        'GM': 'Gambia', +        'GE': 'Georgia', +        'DE': 'Germany', +        'GH': 'Ghana', +        'GI': 'Gibraltar', +        'GR': 'Greece', +        'GL': 'Greenland', +        'GD': 'Grenada', +        'GP': 'Guadeloupe', +        'GU': 'Guam', +        'GT': 'Guatemala', +        'GG': 'Guernsey', +        'GN': 'Guinea', +        'GW': 'Guinea-Bissau', +        'GY': 'Guyana', +        'HT': 'Haiti', +        'HM': 'Heard Island and McDonald Islands', +        'VA': 'Holy See (Vatican City State)', +        'HN': 'Honduras', +        'HK': 'Hong Kong', +        'HU': 'Hungary', +        'IS': 'Iceland', +        'IN': 'India', +        'ID': 'Indonesia', +        'IR': 'Iran, Islamic Republic of', +        'IQ': 'Iraq', +        'IE': 'Ireland', +        'IM': 'Isle of Man', +        'IL': 'Israel', +        'IT': 'Italy', +        'JM': 'Jamaica', +        'JP': 'Japan', +        'JE': 'Jersey', +        'JO': 'Jordan', +        'KZ': 'Kazakhstan', +        'KE': 'Kenya', +        'KI': 'Kiribati', +        'KP': 'Korea, Democratic People\'s Republic of', +        'KR': 'Korea, Republic of', +        'KW': 'Kuwait', +        'KG': 'Kyrgyzstan', +        'LA': 'Lao People\'s Democratic Republic', +        'LV': 'Latvia', +        'LB': 'Lebanon', +        'LS': 'Lesotho', +        'LR': 'Liberia', +        'LY': 'Libya', +        'LI': 'Liechtenstein', +        'LT': 'Lithuania', +        'LU': 'Luxembourg', +        'MO': 'Macao', +        'MK': 'Macedonia, the Former Yugoslav Republic of', +        'MG': 'Madagascar', +        'MW': 'Malawi', +        'MY': 'Malaysia', +        'MV': 'Maldives', +        'ML': 'Mali', +        'MT': 'Malta', +        'MH': 'Marshall Islands', +        'MQ': 'Martinique', +        'MR': 'Mauritania', +        'MU': 'Mauritius', +        'YT': 'Mayotte', +        'MX': 'Mexico', +        'FM': 'Micronesia, Federated States of', +        'MD': 'Moldova, Republic of', +        'MC': 'Monaco', +        'MN': 'Mongolia', +        'ME': 'Montenegro', +        'MS': 'Montserrat', +        'MA': 'Morocco', +        'MZ': 'Mozambique', +        'MM': 'Myanmar', +        'NA': 'Namibia', +        'NR': 'Nauru', +        'NP': 'Nepal', +        'NL': 'Netherlands', +        'NC': 'New Caledonia', +        'NZ': 'New Zealand', +        'NI': 'Nicaragua', +        'NE': 'Niger', +        'NG': 'Nigeria', +        'NU': 'Niue', +        'NF': 'Norfolk Island', +        'MP': 'Northern Mariana Islands', +        'NO': 'Norway', +        'OM': 'Oman', +        'PK': 'Pakistan', +        'PW': 'Palau', +        'PS': 'Palestine, State of', +        'PA': 'Panama', +        'PG': 'Papua New Guinea', +        'PY': 'Paraguay', +        'PE': 'Peru', +        'PH': 'Philippines', +        'PN': 'Pitcairn', +        'PL': 'Poland', +        'PT': 'Portugal', +        'PR': 'Puerto Rico', +        'QA': 'Qatar', +        'RE': 'Réunion', +        'RO': 'Romania', +        'RU': 'Russian Federation', +        'RW': 'Rwanda', +        'BL': 'Saint Barthélemy', +        'SH': 'Saint Helena, Ascension and Tristan da Cunha', +        'KN': 'Saint Kitts and Nevis', +        'LC': 'Saint Lucia', +        'MF': 'Saint Martin (French part)', +        'PM': 'Saint Pierre and Miquelon', +        'VC': 'Saint Vincent and the Grenadines', +        'WS': 'Samoa', +        'SM': 'San Marino', +        'ST': 'Sao Tome and Principe', +        'SA': 'Saudi Arabia', +        'SN': 'Senegal', +        'RS': 'Serbia', +        'SC': 'Seychelles', +        'SL': 'Sierra Leone', +        'SG': 'Singapore', +        'SX': 'Sint Maarten (Dutch part)', +        'SK': 'Slovakia', +        'SI': 'Slovenia', +        'SB': 'Solomon Islands', +        'SO': 'Somalia', +        'ZA': 'South Africa', +        'GS': 'South Georgia and the South Sandwich Islands', +        'SS': 'South Sudan', +        'ES': 'Spain', +        'LK': 'Sri Lanka', +        'SD': 'Sudan', +        'SR': 'Suriname', +        'SJ': 'Svalbard and Jan Mayen', +        'SZ': 'Swaziland', +        'SE': 'Sweden', +        'CH': 'Switzerland', +        'SY': 'Syrian Arab Republic', +        'TW': 'Taiwan, Province of China', +        'TJ': 'Tajikistan', +        'TZ': 'Tanzania, United Republic of', +        'TH': 'Thailand', +        'TL': 'Timor-Leste', +        'TG': 'Togo', +        'TK': 'Tokelau', +        'TO': 'Tonga', +        'TT': 'Trinidad and Tobago', +        'TN': 'Tunisia', +        'TR': 'Turkey', +        'TM': 'Turkmenistan', +        'TC': 'Turks and Caicos Islands', +        'TV': 'Tuvalu', +        'UG': 'Uganda', +        'UA': 'Ukraine', +        'AE': 'United Arab Emirates', +        'GB': 'United Kingdom', +        'US': 'United States', +        'UM': 'United States Minor Outlying Islands', +        'UY': 'Uruguay', +        'UZ': 'Uzbekistan', +        'VU': 'Vanuatu', +        'VE': 'Venezuela, Bolivarian Republic of', +        'VN': 'Viet Nam', +        'VG': 'Virgin Islands, British', +        'VI': 'Virgin Islands, U.S.', +        'WF': 'Wallis and Futuna', +        'EH': 'Western Sahara', +        'YE': 'Yemen', +        'ZM': 'Zambia', +        'ZW': 'Zimbabwe', +    } + +    @classmethod +    def short2full(cls, code): +        """Convert an ISO 3166-2 country code to the corresponding full name""" +        return cls._country_map.get(code.upper()) + +  class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):      def __init__(self, proxies=None):          # Set default handlers diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 34a13cb81..eff4aebeb 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@  from __future__ import unicode_literals -__version__ = '2015.06.15' +__version__ = '2015.07.04' | 
