diff options
Diffstat (limited to 'youtube_dl')
65 files changed, 2685 insertions, 890 deletions
| diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 34a1e3b5c..242affb5b 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -108,6 +108,8 @@ class YoutubeDL(object):      forcefilename:     Force printing final filename.      forceduration:     Force printing duration.      forcejson:         Force printing info_dict as JSON. +    dump_single_json:  Force printing the info_dict of the whole playlist +                       (or video) as a single JSON line.      simulate:          Do not download the video files.      format:            Video format code.      format_limit:      Highest quality format to try. @@ -166,6 +168,8 @@ class YoutubeDL(object):                         'auto' for elaborate guessing      encoding:          Use this encoding instead of the system-specified.      extract_flat:      Do not resolve URLs, return the immediate result. +                       Pass in 'in_playlist' to only show this behavior for +                       playlist items.      The following parameters are not used by YoutubeDL itself, they are used by      the FileDownloader: @@ -229,11 +233,11 @@ class YoutubeDL(object):          if (sys.version_info >= (3,) and sys.platform != 'win32' and                  sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] -                and not params['restrictfilenames']): +                and not params.get('restrictfilenames', False)):              # On Python 3, the Unicode filesystem API will throw errors (#1474)              self.report_warning(                  'Assuming --restrict-filenames since file system encoding ' -                'cannot encode all charactes. ' +                'cannot encode all characters. 
'                  'Set the LC_ALL environment variable to fix this.')              self.params['restrictfilenames'] = True @@ -569,8 +573,12 @@ class YoutubeDL(object):          result_type = ie_result.get('_type', 'video') -        if self.params.get('extract_flat', False): -            if result_type in ('url', 'url_transparent'): +        if result_type in ('url', 'url_transparent'): +            extract_flat = self.params.get('extract_flat', False) +            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or +                    extract_flat is True): +                if self.params.get('forcejson', False): +                    self.to_stdout(json.dumps(ie_result))                  return ie_result          if result_type == 'video': @@ -898,6 +906,8 @@ class YoutubeDL(object):          if self.params.get('forcejson', False):              info_dict['_filename'] = filename              self.to_stdout(json.dumps(info_dict)) +        if self.params.get('dump_single_json', False): +            info_dict['_filename'] = filename          # Do nothing else if in simulate mode          if self.params.get('simulate', False): @@ -1065,12 +1075,15 @@ class YoutubeDL(object):          for url in url_list:              try:                  #It also downloads the videos -                self.extract_info(url) +                res = self.extract_info(url)              except UnavailableVideoError:                  self.report_error('unable to download video')              except MaxDownloadsReached:                  self.to_screen('[info] Maximum number of downloaded files reached.')                  raise +            else: +                if self.params.get('dump_single_json', False): +                    self.to_stdout(json.dumps(res))          return self._download_retcode diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index e73bc5c37..cb4f2e41c 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -79,6 +79,10 @@ 
__authors__  = (      'Carlos Ramos',      '5moufl',      'lenaten', +    'Dennis Scheiba', +    'Damon Timm', +    'winwon', +    'Xavier Beynon'  )  __license__ = 'Public Domain' @@ -256,8 +260,6 @@ def _real_main(argv=None):          date = DateRange.day(opts.date)      else:          date = DateRange(opts.dateafter, opts.datebefore) -    if opts.default_search not in ('auto', 'auto_warning', 'error', 'fixup_error', None) and ':' not in opts.default_search: -        parser.error(u'--default-search invalid; did you forget a colon (:) at the end?')      # Do not download videos when there are audio-only formats      if opts.extractaudio and not opts.keepvideo and opts.format is None: @@ -285,7 +287,7 @@ def _real_main(argv=None):                       u' file! Use "{0}.%(ext)s" instead of "{0}" as the output'                       u' template'.format(outtmpl)) -    any_printing = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson +    any_printing = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json      download_archive_fn = compat_expanduser(opts.download_archive) if opts.download_archive is not None else opts.download_archive      ydl_opts = { @@ -305,8 +307,9 @@ def _real_main(argv=None):          'forcefilename': opts.getfilename,          'forceformat': opts.getformat,          'forcejson': opts.dumpjson, -        'simulate': opts.simulate, -        'skip_download': (opts.skip_download or opts.simulate or any_printing), +        'dump_single_json': opts.dump_single_json, +        'simulate': opts.simulate or any_printing, +        'skip_download': opts.skip_download,          'format': opts.format,          'format_limit': opts.format_limit,          'listformats': opts.listformats, @@ -370,6 +373,7 @@ def 
_real_main(argv=None):          'youtube_include_dash_manifest': opts.youtube_include_dash_manifest,          'encoding': opts.encoding,          'exec_cmd': opts.exec_cmd, +        'extract_flat': opts.extract_flat,      }      with YoutubeDL(ydl_opts) as ydl: diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index ce1a2b32b..5ec9b4745 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -20,12 +20,14 @@ from .arte import (      ArteTVDDCIE,      ArteTVEmbedIE,  ) +from .audiomack import AudiomackIE  from .auengine import AUEngineIE  from .bambuser import BambuserIE, BambuserChannelIE  from .bandcamp import BandcampIE, BandcampAlbumIE  from .bbccouk import BBCCoUkIE  from .beeg import BeegIE  from .behindkink import BehindKinkIE +from .bild import BildIE  from .bilibili import BiliBiliIE  from .blinkx import BlinkxIE  from .bliptv import BlipTVIE, BlipTVUserIE @@ -134,6 +136,8 @@ from .gamestar import GameStarIE  from .gametrailers import GametrailersIE  from .gdcvault import GDCVaultIE  from .generic import GenericIE +from .glide import GlideIE +from .globo import GloboIE  from .godtube import GodTubeIE  from .golem import GolemIE  from .googleplus import GooglePlusIE @@ -172,7 +176,6 @@ from .jadorecettepub import JadoreCettePubIE  from .jeuxvideo import JeuxVideoIE  from .jove import JoveIE  from .jukebox import JukeboxIE -from .justintv import JustinTVIE  from .jpopsukitv import JpopsukiIE  from .kankan import KankanIE  from .keezmovies import KeezMoviesIE @@ -275,6 +278,7 @@ from .parliamentliveuk import ParliamentLiveUKIE  from .patreon import PatreonIE  from .pbs import PBSIE  from .photobucket import PhotobucketIE +from .planetaplay import PlanetaPlayIE  from .played import PlayedIE  from .playfm import PlayFMIE  from .playvid import PlayvidIE @@ -314,6 +318,7 @@ from .sbs import SBSIE  from .scivee import SciVeeIE  from .screencast import ScreencastIE  from .servingsys import ServingSysIE 
+from .sexykarma import SexyKarmaIE  from .shared import SharedIE  from .sharesix import ShareSixIE  from .sina import SinaIE @@ -345,6 +350,7 @@ from .spiegel import SpiegelIE, SpiegelArticleIE  from .spiegeltv import SpiegeltvIE  from .spike import SpikeIE  from .sport5 import Sport5IE +from .sportbox import SportBoxIE  from .sportdeutschland import SportDeutschlandIE  from .stanfordoc import StanfordOpenClassroomIE  from .steam import SteamIE @@ -355,6 +361,7 @@ from .swrmediathek import SWRMediathekIE  from .syfy import SyfyIE  from .sztvhu import SztvHuIE  from .tagesschau import TagesschauIE +from .tapely import TapelyIE  from .teachertube import (      TeacherTubeIE,      TeacherTubeUserIE, @@ -363,11 +370,14 @@ from .teachingchannel import TeachingChannelIE  from .teamcoco import TeamcocoIE  from .techtalks import TechTalksIE  from .ted import TEDIE +from .telecinco import TelecincoIE  from .telemb import TeleMBIE  from .tenplay import TenPlayIE  from .testurl import TestURLIE  from .tf1 import TF1IE +from .theonion import TheOnionIE  from .theplatform import ThePlatformIE +from .thesixtyone import TheSixtyOneIE  from .thisav import ThisAVIE  from .tinypic import TinyPicIE  from .tlc import TlcIE, TlcDeIE @@ -389,6 +399,7 @@ from .tutv import TutvIE  from .tvigle import TvigleIE  from .tvp import TvpIE  from .tvplay import TVPlayIE +from .twitch import TwitchIE  from .ubu import UbuIE  from .udemy import (      UdemyIE, @@ -414,6 +425,7 @@ from .videopremium import VideoPremiumIE  from .videott import VideoTtIE  from .videoweed import VideoWeedIE  from .vidme import VidmeIE +from .vidzi import VidziIE  from .vimeo import (      VimeoIE,      VimeoAlbumIE, @@ -433,9 +445,11 @@ from .viki import VikiIE  from .vk import VKIE  from .vodlocker import VodlockerIE  from .vporn import VpornIE +from .vrt import VRTIE  from .vube import VubeIE  from .vuclip import VuClipIE  from .vulture import VultureIE +from .walla import WallaIE  from .washingtonpost import 
WashingtonPostIE  from .wat import WatIE  from .wayofthemaster import WayOfTheMasterIE @@ -457,7 +471,6 @@ from .xvideos import XVideosIE  from .xtube import XTubeUserIE, XTubeIE  from .yahoo import (      YahooIE, -    YahooNewsIE,      YahooSearchIE,  )  from .ynet import YnetIE @@ -482,10 +495,8 @@ from .youtube import (      YoutubeUserIE,      YoutubeWatchLaterIE,  ) -  from .zdf import ZDFIE -  _ALL_CLASSES = [      klass      for name, klass in globals().items() diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index c3d02f85e..b9a9440c0 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -10,8 +10,8 @@ from ..utils import (      unified_strdate,      determine_ext,      get_element_by_id, -    compat_str,      get_element_by_attribute, +    int_or_none,  )  # There are different sources of video in arte.tv, the extraction process  @@ -90,15 +90,24 @@ class ArteTVPlus7IE(InfoExtractor):          if not upload_date_str:              upload_date_str = player_info.get('VDA', '').split(' ')[0] +        title = player_info['VTI'].strip() +        subtitle = player_info.get('VSU', '').strip() +        if subtitle: +            title += ' - %s' % subtitle +          info_dict = {              'id': player_info['VID'], -            'title': player_info['VTI'], +            'title': title,              'description': player_info.get('VDE'),              'upload_date': unified_strdate(upload_date_str),              'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),          } -        all_formats = player_info['VSR'].values() +        all_formats = [] +        for format_id, format_dict in player_info['VSR'].items(): +            fmt = dict(format_dict) +            fmt['format_id'] = format_id +            all_formats.append(fmt)          # Some formats use the m3u8 protocol          all_formats = list(filter(lambda f: f.get('videoFormat') != 'M3U8', all_formats))          def 
_match_lang(f): @@ -149,22 +158,12 @@ class ArteTVPlus7IE(InfoExtractor):                  )          formats = sorted(formats, key=sort_key)          def _format(format_info): -            quality = '' -            height = format_info.get('height') -            if height is not None: -                quality = compat_str(height) -            bitrate = format_info.get('bitrate') -            if bitrate is not None: -                quality += '-%d' % bitrate -            if format_info.get('versionCode') is not None: -                format_id = '%s-%s' % (quality, format_info['versionCode']) -            else: -                format_id = quality              info = { -                'format_id': format_id, -                'format_note': format_info.get('versionLibelle'), -                'width': format_info.get('width'), -                'height': height, +                'format_id': format_info['format_id'], +                'format_note': '%s, %s' % (format_info.get('versionCode'), format_info.get('versionLibelle')), +                'width': int_or_none(format_info.get('width')), +                'height': int_or_none(format_info.get('height')), +                'tbr': int_or_none(format_info.get('bitrate')),              }              if format_info['mediaType'] == 'rtmp':                  info['url'] = format_info['streamer'] diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py new file mode 100644 index 000000000..57446fddd --- /dev/null +++ b/youtube_dl/extractor/audiomack.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .soundcloud import SoundcloudIE +from ..utils import ExtractorError +import datetime +import time + + +class AudiomackIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?audiomack\.com/song/(?P<id>[\w/-]+)' +    IE_NAME = 'audiomack' +    _TESTS = [ +        #hosted on audiomack +        { +            'url': 
'http://www.audiomack.com/song/roosh-williams/extraordinary', +            'info_dict': +            { +                'id' : 'roosh-williams/extraordinary', +                'ext': 'mp3', +                'title': 'Roosh Williams - Extraordinary' +            } +        }, +        #hosted on soundcloud via audiomack +        { +            'url': 'http://www.audiomack.com/song/xclusiveszone/take-kare', +            'file': '172419696.mp3', +            'info_dict': +            { +                'ext': 'mp3', +                'title': 'Young Thug ft Lil Wayne - Take Kare', +                "upload_date": "20141016", +                "description": "New track produced by London On Da Track called “Take Kare\"\n\nhttp://instagram.com/theyoungthugworld\nhttps://www.facebook.com/ThuggerThuggerCashMoney\n", +                "uploader": "Young Thug World" +            } +        } +    ] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        api_response = self._download_json( +            "http://www.audiomack.com/api/music/url/song/%s?_=%d" % ( +                video_id, time.time()), +            video_id) + +        if "url" not in api_response: +            raise ExtractorError("Unable to deduce api url of song") +        realurl = api_response["url"] + +        #Audiomack wraps a lot of soundcloud tracks in their branded wrapper +        # - if so, pass the work off to the soundcloud extractor +        if SoundcloudIE.suitable(realurl): +            return {'_type': 'url', 'url': realurl, 'ie_key': 'Soundcloud'} + +        webpage = self._download_webpage(url, video_id) +        artist = self._html_search_regex( +            r'<span class="artist">(.*?)</span>', webpage, "artist") +        songtitle = self._html_search_regex( +            r'<h1 class="profile-title song-title"><span class="artist">.*?</span>(.*?)</h1>', +            webpage, "title") +        title = artist + " - " + songtitle + +        return { +            
'id': video_id, +            'title': title, +            'url': realurl, +        } diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index c569aa4d2..c13446665 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -15,13 +15,23 @@ class BandcampIE(InfoExtractor):      _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>.*)'      _TESTS = [{          'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', -        'file': '1812978515.mp3',          'md5': 'c557841d5e50261777a6585648adf439',          'info_dict': { -            "title": "youtube-dl  \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", -            "duration": 9.8485, +            'id': '1812978515', +            'ext': 'mp3', +            'title': "youtube-dl  \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", +            'duration': 9.8485,          },          '_skip': 'There is a limit of 200 free downloads / month for the test song' +    }, { +        'url': 'http://benprunty.bandcamp.com/track/lanius-battle', +        'md5': '2b68e5851514c20efdff2afc5603b8b4', +        'info_dict': { +            'id': '2650410135', +            'ext': 'mp3', +            'title': 'Lanius (Battle)', +            'uploader': 'Ben Prunty Music', +        },      }]      def _real_extract(self, url): @@ -59,9 +69,9 @@ class BandcampIE(InfoExtractor):                  raise ExtractorError('No free songs found')          download_link = m_download.group(1) -        video_id = re.search( -            r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', -            webpage, re.MULTILINE | re.DOTALL).group('id') +        video_id = self._search_regex( +            r'var TralbumData = {.*?id: (?P<id>\d+),?$', +            webpage, 'video id', flags=re.MULTILINE | re.DOTALL)          download_webpage = self._download_webpage(download_link, video_id, 'Downloading free downloads page')          # We get the dictionary of 
the track from some javascript code diff --git a/youtube_dl/extractor/bild.py b/youtube_dl/extractor/bild.py new file mode 100644 index 000000000..0269d1174 --- /dev/null +++ b/youtube_dl/extractor/bild.py @@ -0,0 +1,39 @@ +#coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class BildIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?bild\.de/(?:[^/]+/)+(?P<display_id>[^/]+)-(?P<id>\d+)(?:,auto=true)?\.bild\.html' +    IE_DESC = 'Bild.de' +    _TEST = { +        'url': 'http://www.bild.de/video/clip/apple-ipad-air/das-koennen-die-neuen-ipads-38184146.bild.html', +        'md5': 'dd495cbd99f2413502a1713a1156ac8a', +        'info_dict': { +            'id': '38184146', +            'ext': 'mp4', +            'title': 'BILD hat sie getestet', +            'thumbnail': 'http://bilder.bild.de/fotos/stand-das-koennen-die-neuen-ipads-38184138/Bild/1.bild.jpg', +            'duration': 196, +            'description': 'Mit dem iPad Air 2 und dem iPad Mini 3 hat Apple zwei neue Tablet-Modelle präsentiert. BILD-Reporter Sven Stein durfte die Geräte bereits testen. 
', +        } +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        xml_url = url.split(".bild.html")[0] + ",view=xml.bild.xml" +        doc = self._download_xml(xml_url, video_id) + +        duration = int_or_none(doc.attrib.get('duration'), scale=1000) + +        return { +            'id': video_id, +            'title': doc.attrib['ueberschrift'], +            'description': doc.attrib.get('text'), +            'url': doc.attrib['src'], +            'thumbnail': doc.attrib.get('img'), +            'duration': duration, +        } diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 1bfc9f35b..2c0e5eea2 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -4,37 +4,61 @@ import re  import json  from .common import InfoExtractor +from ..utils import ( +    int_or_none, +    parse_age_limit, +)  class BreakIE(InfoExtractor): -    _VALID_URL = r'http://(?:www\.)?break\.com/video/([^/]+)' -    _TEST = { +    _VALID_URL = r'http://(?:www\.)?break\.com/video/(?:[^/]+/)*.+-(?P<id>\d+)' +    _TESTS = [{          'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056', -        'md5': 'a3513fb1547fba4fb6cfac1bffc6c46b', +        'md5': '33aa4ff477ecd124d18d7b5d23b87ce5',          'info_dict': {              'id': '2468056',              'ext': 'mp4',              'title': 'When Girls Act Like D-Bags',          } -    } +    }, { +        'url': 'http://www.break.com/video/ugc/baby-flex-2773063', +        'only_matching': True, +    }]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group(1).split("-")[-1] -        embed_url = 'http://www.break.com/embed/%s' % video_id -        webpage = self._download_webpage(embed_url, video_id) -        info_json = self._search_regex(r'var embedVars = ({.*})\s*?</script>', -            webpage, 'info json', flags=re.DOTALL) -        info = 
json.loads(info_json) -        video_url = info['videoUri'] +        video_id = self._match_id(url) +        webpage = self._download_webpage( +            'http://www.break.com/embed/%s' % video_id, video_id) +        info = json.loads(self._search_regex( +            r'var embedVars = ({.*})\s*?</script>', +            webpage, 'info json', flags=re.DOTALL)) +          youtube_id = info.get('youtubeId')          if youtube_id:              return self.url_result(youtube_id, 'Youtube') -        final_url = video_url + '?' + info['AuthToken'] +        formats = [{ +            'url': media['uri'] + '?' + info['AuthToken'], +            'tbr': media['bitRate'], +            'width': media['width'], +            'height': media['height'], +        } for media in info['media']] + +        if not formats: +            formats.append({ +                'url': info['videoUri'] +            }) + +        self._sort_formats(formats) + +        duration = int_or_none(info.get('videoLengthInSeconds')) +        age_limit = parse_age_limit(info.get('audienceRating')) +          return {              'id': video_id, -            'url': final_url,              'title': info['contentName'],              'thumbnail': info['thumbUri'], +            'duration': duration, +            'age_limit': age_limit, +            'formats': formats,          } diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 294670386..ad22cbafd 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -87,6 +87,15 @@ class BrightcoveIE(InfoExtractor):                  'description': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals',              },          }, +        { +            # playlist test +            # from http://support.brightcove.com/en/video-cloud/docs/playlist-support-single-video-players +            'url': 
'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL', +            'info_dict': { +                'title': 'Sealife', +            }, +            'playlist_mincount': 7, +        },      ]      @classmethod diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index 496271be4..d064a28f9 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -42,7 +42,7 @@ class CinemassacreIE(InfoExtractor):          webpage = self._download_webpage(url, display_id)          video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d') -        mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage) +        mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)          if not mobj:              raise ExtractorError('Can\'t extract embed url and video id')          playerdata_url = mobj.group('embed_url') @@ -53,17 +53,22 @@ class CinemassacreIE(InfoExtractor):          video_description = self._html_search_regex(              r'<div class="entry-content">(?P<description>.+?)</div>',              webpage, 'description', flags=re.DOTALL, fatal=False) +        video_thumbnail = self._og_search_thumbnail(webpage)          playerdata = self._download_webpage(playerdata_url, video_id, 'Downloading player webpage') -        video_thumbnail = self._search_regex( -            r'image: \'(?P<thumbnail>[^\']+)\'', playerdata, 'thumbnail', fatal=False) -        sd_url = self._search_regex(r'file: \'([^\']+)\', label: \'SD\'', playerdata, 'sd_file') -        videolist_url = self._search_regex(r'file: \'([^\']+\.smil)\'}', playerdata, 'videolist_url') +        vidurl = self._search_regex( +            
r'\'vidurl\'\s*:\s*"([^\']+)"', playerdata, 'vidurl').replace('\\/', '/') +        vidid = self._search_regex( +            r'\'vidid\'\s*:\s*"([^\']+)"', playerdata, 'vidid') +        videoserver = self._html_search_regex( +            r"'videoserver'\s*:\s*'([^']+)'", playerdata, 'videoserver') + +        videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid)          videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML')          formats = [] -        baseurl = sd_url[:sd_url.rfind('/')+1] +        baseurl = vidurl[:vidurl.rfind('/')+1]          for video in videolist.findall('.//video'):              src = video.get('src')              if not src: diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index dae40c136..78877b1cf 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -12,7 +12,7 @@ from ..utils import (  class CNNIE(InfoExtractor):      _VALID_URL = r'''(?x)https?://((edition|www)\.)?cnn\.com/video/(data/.+?|\?)/ -        (?P<path>.+?/(?P<title>[^/]+?)(?:\.cnn|(?=&)))''' +        (?P<path>.+?/(?P<title>[^/]+?)(?:\.cnn(-ap)?|(?=&)))'''      _TESTS = [{          'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index f43a0a569..cf3781cd6 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -89,6 +89,10 @@ class InfoExtractor(object):                                   format, irrespective of the file format.                                   -1 for default (order by other properties),                                   -2 or smaller for less than default. 
+                    * source_preference  Order number for this video source +                                  (quality takes higher priority) +                                 -1 for default (order by other properties), +                                 -2 or smaller for less than default.                      * http_referer  HTTP Referer header value to set.                      * http_method  HTTP method to use for the download.                      * http_headers  A dictionary of additional HTTP headers @@ -138,6 +142,8 @@ class InfoExtractor(object):      Unless mentioned otherwise, the fields should be Unicode strings. +    Unless mentioned otherwise, None is equivalent to absence of information. +      Subclasses of this one should re-define the _real_initialize() and      _real_extract() methods and define a _VALID_URL regexp.      Probably, they should also be added to the list of extractors. @@ -279,6 +285,12 @@ class InfoExtractor(object):              raw_filename = basen + '.dump'              filename = sanitize_filename(raw_filename, restricted=True)              self.to_screen('Saving request to ' + filename) +            # Working around MAX_PATH limitation on Windows (see +            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) +            if os.name == 'nt': +                absfilepath = os.path.abspath(filename) +                if len(absfilepath) > 259: +                    filename = '\\\\?\\' + absfilepath              with open(filename, 'wb') as outf:                  outf.write(webpage_bytes) @@ -334,7 +346,11 @@ class InfoExtractor(object):          try:              return json.loads(json_string)          except ValueError as ve: -            raise ExtractorError('Failed to download JSON', cause=ve) +            errmsg = '%s: Failed to parse JSON ' % video_id +            if fatal: +                raise ExtractorError(errmsg, cause=ve) +            else: +                
self.report_warning(errmsg + str(ve))      def report_warning(self, msg, video_id=None):          idstr = '' if video_id is None else '%s: ' % video_id @@ -601,12 +617,13 @@ class InfoExtractor(object):                  audio_ext_preference,                  f.get('filesize') if f.get('filesize') is not None else -1,                  f.get('filesize_approx') if f.get('filesize_approx') is not None else -1, +                f.get('source_preference') if f.get('source_preference') is not None else -1,                  f.get('format_id'),              )          formats.sort(key=_formats_key)      def http_scheme(self): -        """ Either "https:" or "https:", depending on the user's preferences """ +        """ Either "http:" or "https:", depending on the user's preferences """          return (              'http:'              if self._downloader.params.get('prefer_insecure', False) diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index ffbe4903b..7a7e79360 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -34,6 +34,8 @@ class CondeNastIE(InfoExtractor):      _VALID_URL = r'http://(video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys())      IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) +    EMBED_URL = r'(?:https?:)?//player\.(?P<site>%s)\.com/(?P<type>embed)/.+?' 
% '|'.join(_SITES.keys()) +      _TEST = {          'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led',          'md5': '1921f713ed48aabd715691f774c451f7', diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index f99888ecc..e3057d900 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -39,6 +39,7 @@ class CrunchyrollIE(SubtitlesInfoExtractor):              'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg',              'uploader': 'Yomiuri Telecasting Corporation (YTV)',              'upload_date': '20131013', +            'url': 're:(?!.*&)',          },          'params': {              # rtmp @@ -237,12 +238,14 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text              streamdata_req.data = 'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality='+stream_quality+'&media%5Fid='+stream_id+'&video%5Fformat='+stream_format              streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')              streamdata_req.add_header('Content-Length', str(len(streamdata_req.data))) -            streamdata = self._download_webpage(streamdata_req, video_id, note='Downloading media info for '+video_format) -            video_url = self._search_regex(r'<host>([^<]+)', streamdata, 'video_url') -            video_play_path = self._search_regex(r'<file>([^<]+)', streamdata, 'video_play_path') +            streamdata = self._download_xml( +                streamdata_req, video_id, +                note='Downloading media info for %s' % video_format) +            video_url = streamdata.find('.//host').text +            video_play_path = streamdata.find('.//file').text              formats.append({                  'url': video_url, -                'play_path':   video_play_path, +                'play_path': video_play_path,                  'ext': 'flv',  
                'format': video_format,                  'format_id': video_format, diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 66a8f16d9..dbcf5d6a7 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -82,11 +82,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):      ]      def _real_extract(self, url): -        # Extract id and simplified title from URL -        mobj = re.match(self._VALID_URL, url) - -        video_id = mobj.group('id') - +        video_id = self._match_id(url)          url = 'http://www.dailymotion.com/video/%s' % video_id          # Retrieve video webpage to extract further information @@ -147,18 +143,23 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):              self._list_available_subtitles(video_id, webpage)              return -        view_count = self._search_regex( -            r'video_views_count[^>]+>\s+([\d\.,]+)', webpage, 'view count', fatal=False) -        if view_count is not None: -            view_count = str_to_int(view_count) +        view_count = str_to_int(self._search_regex( +            r'video_views_count[^>]+>\s+([\d\.,]+)', +            webpage, 'view count', fatal=False)) + +        title = self._og_search_title(webpage, default=None) +        if title is None: +            title = self._html_search_regex( +                r'(?s)<span\s+id="video_title"[^>]*>(.*?)</span>', webpage, +                'title')          return { -            'id':       video_id, +            'id': video_id,              'formats': formats,              'uploader': info['owner.screenname'], -            'upload_date':  video_upload_date, -            'title':    self._og_search_title(webpage), -            'subtitles':    video_subtitles, +            'upload_date': video_upload_date, +            'title': title, +            'subtitles': video_subtitles,              'thumbnail': 
info['thumbnail_url'],              'age_limit': age_limit,              'view_count': view_count, diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py index 817a9bd61..5f24ac721 100644 --- a/youtube_dl/extractor/dropbox.py +++ b/youtube_dl/extractor/dropbox.py @@ -29,9 +29,8 @@ class DropboxIE(InfoExtractor):          video_id = mobj.group('id')          fn = compat_urllib_parse_unquote(url_basename(url))          title = os.path.splitext(fn)[0] -        video_url = ( -            re.sub(r'[?&]dl=0', '', url) + -            ('?' if '?' in url else '&') + 'dl=1') +        video_url = re.sub(r'[?&]dl=0', '', url) +        video_url += ('?' if '?' not in video_url else '&') + 'dl=1'          return {              'id': video_id, diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 0b3374d97..566e20d76 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -46,7 +46,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor):                          f4m_format['preference'] = 1                      formats.extend(f4m_formats)              elif video_url.endswith('.m3u8'): -                formats.extend(self._extract_m3u8_formats(video_url, video_id)) +                formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4'))              elif video_url.startswith('rtmp'):                  formats.append({                      'url': video_url, @@ -58,7 +58,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor):                  formats.append({                      'url': video_url,                      'format_id': format_id, -                    'preference': 2, +                    'preference': -1,                  })          self._sort_formats(formats) diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index d966e8403..ec6d96ada 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -37,7 +37,7 @@ 
class FunnyOrDieIE(InfoExtractor):          video_id = mobj.group('id')          webpage = self._download_webpage(url, video_id) -        links = re.findall(r'<source src="([^"]+/v)\d+\.([^"]+)" type=\'video', webpage) +        links = re.findall(r'<source src="([^"]+/v)[^"]+\.([^"]+)" type=\'video', webpage)          if not links:              raise ExtractorError('No media links available for %s' % video_id) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 742bc2856..9b6498894 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -28,6 +28,7 @@ from .brightcove import BrightcoveIE  from .ooyala import OoyalaIE  from .rutv import RUTVIE  from .smotri import SmotriIE +from .condenast import CondeNastIE  class GenericIE(InfoExtractor): @@ -225,21 +226,6 @@ class GenericIE(InfoExtractor):                  'skip_download': 'Requires rtmpdump'              }          }, -        # smotri embed -        { -            'url': 'http://rbctv.rbc.ru/archive/news/562949990879132.shtml', -            'md5': 'ec40048448e9284c9a1de77bb188108b', -            'info_dict': { -                'id': 'v27008541fad', -                'ext': 'mp4', -                'title': 'Крым и Севастополь вошли в состав России', -                'description': 'md5:fae01b61f68984c7bd2fa741e11c3175', -                'duration': 900, -                'upload_date': '20140318', -                'uploader': 'rbctv_2012_4', -                'uploader_id': 'rbctv_2012_4', -            }, -        },          # Condé Nast embed          {              'url': 'http://www.wired.com/2014/04/honda-asimo/', @@ -394,6 +380,17 @@ class GenericIE(InfoExtractor):                  'uploader': 'education-portal.com',              },          }, +        { +            'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz', +            'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4', +            'info_dict': { +                'id': 'uxjb0lwrcz', +  
              'ext': 'mp4', +                'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks', +                'duration': 1715.0, +                'uploader': 'thoughtworks.wistia.com', +            },    +        },      ]      def report_following_redirect(self, new_url): @@ -490,7 +487,8 @@ class GenericIE(InfoExtractor):                       'Set --default-search "ytsearch" (or run  youtube-dl "ytsearch:%s" ) to search YouTube'                      ) % (url, url), expected=True)              else: -                assert ':' in default_search +                if ':' not in default_search: +                    default_search += ':'                  return self.url_result(default_search + url)          url, smuggled_data = unsmuggle_url(url) @@ -623,13 +621,13 @@ class GenericIE(InfoExtractor):          if mobj:              player_url = unescapeHTML(mobj.group('url'))              surl = smuggle_url(player_url, {'Referer': url}) -            return self.url_result(surl, 'Vimeo') +            return self.url_result(surl)          # Look for embedded (swf embed) Vimeo player          mobj = re.search( -            r'<embed[^>]+?src="(https?://(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage) +            r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)          if mobj: -            return self.url_result(mobj.group(1), 'Vimeo') +            return self.url_result(mobj.group(1))          # Look for embedded YouTube player          matches = re.findall(r'''(?x) @@ -654,19 +652,32 @@ class GenericIE(InfoExtractor):              return _playlist_from_matches(                  matches, lambda m: unescapeHTML(m[1])) +        # Look for embedded Dailymotion playlist player (#3822) +        m = re.search( +            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage) +        if m: +            playlists = re.findall( +                
r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url'))) +            if playlists: +                return _playlist_from_matches( +                    playlists, lambda p: '//dailymotion.com/playlist/%s' % p) +          # Look for embedded Wistia player          match = re.search( -            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) +            r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)          if match: +            embed_url = self._proto_relative_url( +                unescapeHTML(match.group('url')))              return {                  '_type': 'url_transparent', -                'url': unescapeHTML(match.group('url')), +                'url': embed_url,                  'ie_key': 'Wistia',                  'uploader': video_uploader,                  'title': video_title,                  'id': video_id,              } -        match = re.search(r'(?:id=["\']wistia_|data-wistiaid=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage) +             +        match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)          if match:              return {                  '_type': 'url_transparent', @@ -852,47 +863,57 @@ class GenericIE(InfoExtractor):          if mobj is not None:              return self.url_result(mobj.group('url'), 'MLB') +        mobj = re.search( +            r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL, +            webpage) +        if mobj is not None: +            return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast') + +        def check_video(vurl): +            vpath = compat_urlparse.urlparse(vurl).path +            vext = determine_ext(vpath) +            return '.' 
in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml') + +        def filter_video(urls): +            return list(filter(check_video, urls)) +          # Start with something easy: JW Player in SWFObject -        found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) +        found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))          if not found:              # Look for gorilla-vid style embedding -            found = re.findall(r'''(?sx) +            found = filter_video(re.findall(r'''(?sx)                  (?:                      jw_plugins|                      JWPlayerOptions|                      jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup                  ) -                .*?file\s*:\s*["\'](.*?)["\']''', webpage) +                .*?file\s*:\s*["\'](.*?)["\']''', webpage))          if not found:              # Broaden the search a little bit -            found = re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) +            found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))          if not found:              # Broaden the findall a little bit: JWPlayer JS loader -            found = re.findall(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage) +            found = filter_video(re.findall( +                r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))          if not found:              # Flow player -            found = re.findall(r'''(?xs) +            found = filter_video(re.findall(r'''(?xs)                  flowplayer\("[^"]+",\s*                      \{[^}]+?\}\s*,                      \s*{[^}]+? 
["']?clip["']?\s*:\s*\{\s*                          ["']?url["']?\s*:\s*["']([^"']+)["'] -            ''', webpage) +            ''', webpage))          if not found:              # Try to find twitter cards info -            found = re.findall(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage) +            found = filter_video(re.findall( +                r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))          if not found:              # We look for Open Graph info:              # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)              m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)              # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:              if m_video_type is not None: -                def check_video(vurl): -                    vpath = compat_urlparse.urlparse(vurl).path -                    vext = determine_ext(vpath) -                    return '.' in vpath and vext not in ('swf', 'png', 'jpg') -                found = list(filter( -                    check_video, -                    re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))) +                found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))          if not found:              # HTML5 video              found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]+)? 
src="([^"]+)"', webpage) diff --git a/youtube_dl/extractor/glide.py b/youtube_dl/extractor/glide.py new file mode 100644 index 000000000..9561ed5fb --- /dev/null +++ b/youtube_dl/extractor/glide.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class GlideIE(InfoExtractor): +    IE_DESC = 'Glide mobile video messages (glide.me)' +    _VALID_URL = r'https?://share\.glide\.me/(?P<id>[A-Za-z0-9\-=_+]+)' +    _TEST = { +        'url': 'http://share.glide.me/UZF8zlmuQbe4mr+7dCiQ0w==', +        'md5': '4466372687352851af2d131cfaa8a4c7', +        'info_dict': { +            'id': 'UZF8zlmuQbe4mr+7dCiQ0w==', +            'ext': 'mp4', +            'title': 'Damon Timm\'s Glide message', +            'thumbnail': 're:^https?://.*?\.cloudfront\.net/.*\.jpg$', +        } +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) +        title = self._html_search_regex( +            r'<title>(.*?)</title>', webpage, 'title') +        video_url = self.http_scheme() + self._search_regex( +            r'<source src="(.*?)" type="video/mp4">', webpage, 'video URL') +        thumbnail_url = self._search_regex( +            r'<img id="video-thumbnail" src="(.*?)"', +            webpage, 'thumbnail url', fatal=False) +        thumbnail = ( +            thumbnail_url if thumbnail_url is None +            else self.http_scheme() + thumbnail_url) + +        return { +            'id': video_id, +            'title': title, +            'url': video_url, +            'thumbnail': thumbnail, +        } diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py new file mode 100644 index 000000000..77c3ad4fc --- /dev/null +++ b/youtube_dl/extractor/globo.py @@ -0,0 +1,398 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import random +import math + +from .common import InfoExtractor +from ..utils import ( +   
 ExtractorError, +    float_or_none, +    compat_str, +    compat_chr, +    compat_ord, +) + + +class GloboIE(InfoExtractor): +    _VALID_URL = 'https?://.+?\.globo\.com/(?P<id>.+)' + +    _API_URL_TEMPLATE = 'http://api.globovideos.com/videos/%s/playlist' +    _SECURITY_URL_TEMPLATE = 'http://security.video.globo.com/videos/%s/hash?player=flash&version=2.9.9.50&resource_id=%s' + +    _VIDEOID_REGEXES = [ +        r'\bdata-video-id="(\d+)"', +        r'\bdata-player-videosids="(\d+)"', +        r'<div[^>]+\bid="(\d+)"', +    ] + +    _RESIGN_EXPIRATION = 86400 + +    _TESTS = [ +        { +            'url': 'http://globotv.globo.com/sportv/futebol-nacional/v/os-gols-de-atletico-mg-3-x-2-santos-pela-24a-rodada-do-brasileirao/3654973/', +            'md5': '03ebf41cb7ade43581608b7d9b71fab0', +            'info_dict': { +                'id': '3654973', +                'ext': 'mp4', +                'title': 'Os gols de Atlético-MG 3 x 2 Santos pela 24ª rodada do Brasileirão', +                'duration': 251.585, +                'uploader': 'SporTV', +                'uploader_id': 698, +                'like_count': int, +            } +        }, +        { +            'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', +            'md5': 'b3ccc801f75cd04a914d51dadb83a78d', +            'info_dict': { +                'id': '3607726', +                'ext': 'mp4', +                'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa', +                'duration': 103.204, +                'uploader': 'Globo.com', +                'uploader_id': 265, +                'like_count': int, +            } +        }, +        { +            'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html', +            'md5': '307fdeae4390ccfe6ba1aa198cf6e72b', +            'info_dict': { +          
      'id': '3652183', +                'ext': 'mp4', +                'title': 'Receita Federal explica como vai fiscalizar bagagens de quem retorna ao Brasil de avião', +                'duration': 110.711, +                'uploader': 'Rede Globo', +                'uploader_id': 196, +                'like_count': int, +            } +        }, +    ] + +    class MD5(): +        HEX_FORMAT_LOWERCASE = 0 +        HEX_FORMAT_UPPERCASE = 1 +        BASE64_PAD_CHARACTER_DEFAULT_COMPLIANCE = '' +        BASE64_PAD_CHARACTER_RFC_COMPLIANCE = '=' +        PADDING = '=0xFF01DD' +        hexcase = 0 +        b64pad = '' + +        def __init__(self): +            pass + +        class JSArray(list): +            def __getitem__(self, y): +                try: +                    return list.__getitem__(self, y) +                except IndexError: +                    return 0 + +            def __setitem__(self, i, y): +                try: +                    return list.__setitem__(self, i, y) +                except IndexError: +                    self.extend([0] * (i - len(self) + 1)) +                    self[-1] = y + +        @classmethod +        def hex_md5(cls, param1): +            return cls.rstr2hex(cls.rstr_md5(cls.str2rstr_utf8(param1))) + +        @classmethod +        def b64_md5(cls, param1, param2=None): +            return cls.rstr2b64(cls.rstr_md5(cls.str2rstr_utf8(param1, param2))) + +        @classmethod +        def any_md5(cls, param1, param2): +            return cls.rstr2any(cls.rstr_md5(cls.str2rstr_utf8(param1)), param2) + +        @classmethod +        def rstr_md5(cls, param1): +            return cls.binl2rstr(cls.binl_md5(cls.rstr2binl(param1), len(param1) * 8)) + +        @classmethod +        def rstr2hex(cls, param1): +            _loc_2 = '0123456789ABCDEF' if cls.hexcase else '0123456789abcdef' +            _loc_3 = '' +            for _loc_5 in range(0, len(param1)): +                _loc_4 = compat_ord(param1[_loc_5]) +       
         _loc_3 += _loc_2[_loc_4 >> 4 & 15] + _loc_2[_loc_4 & 15] +            return _loc_3 + +        @classmethod +        def rstr2b64(cls, param1): +            _loc_2 = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_' +            _loc_3 = '' +            _loc_4 = len(param1) +            for _loc_5 in range(0, _loc_4, 3): +                _loc_6_1 = compat_ord(param1[_loc_5]) << 16 +                _loc_6_2 = compat_ord(param1[_loc_5 + 1]) << 8 if _loc_5 + 1 < _loc_4 else 0 +                _loc_6_3 = compat_ord(param1[_loc_5 + 2]) if _loc_5 + 2 < _loc_4 else 0 +                _loc_6 = _loc_6_1 | _loc_6_2 | _loc_6_3 +                for _loc_7 in range(0, 4): +                    if _loc_5 * 8 + _loc_7 * 6 > len(param1) * 8: +                        _loc_3 += cls.b64pad +                    else: +                        _loc_3 += _loc_2[_loc_6 >> 6 * (3 - _loc_7) & 63] +            return _loc_3 + +        @staticmethod +        def rstr2any(param1, param2): +            _loc_3 = len(param2) +            _loc_4 = [] +            _loc_9 = [0] * ((len(param1) >> 2) + 1) +            for _loc_5 in range(0, len(_loc_9)): +                _loc_9[_loc_5] = compat_ord(param1[_loc_5 * 2]) << 8 | compat_ord(param1[_loc_5 * 2 + 1]) + +            while len(_loc_9) > 0: +                _loc_8 = [] +                _loc_7 = 0 +                for _loc_5 in range(0, len(_loc_9)): +                    _loc_7 = (_loc_7 << 16) + _loc_9[_loc_5] +                    _loc_6 = math.floor(_loc_7 / _loc_3) +                    _loc_7 -= _loc_6 * _loc_3 +                    if len(_loc_8) > 0 or _loc_6 > 0: +                        _loc_8[len(_loc_8)] = _loc_6 + +                _loc_4[len(_loc_4)] = _loc_7 +                _loc_9 = _loc_8 + +            _loc_10 = '' +            _loc_5 = len(_loc_4) - 1 +            while _loc_5 >= 0: +                _loc_10 += param2[_loc_4[_loc_5]] +                _loc_5 -= 1 + +            return _loc_10 + +        
@classmethod +        def str2rstr_utf8(cls, param1, param2=None): +            _loc_3 = '' +            _loc_4 = -1 +            if not param2: +                param2 = cls.PADDING +            param1 = param1 + param2[1:9] +            while True: +                _loc_4 += 1 +                if _loc_4 >= len(param1): +                    break +                _loc_5 = compat_ord(param1[_loc_4]) +                _loc_6 = compat_ord(param1[_loc_4 + 1]) if _loc_4 + 1 < len(param1) else 0 +                if 55296 <= _loc_5 <= 56319 and 56320 <= _loc_6 <= 57343: +                    _loc_5 = 65536 + ((_loc_5 & 1023) << 10) + (_loc_6 & 1023) +                    _loc_4 += 1 +                if _loc_5 <= 127: +                    _loc_3 += compat_chr(_loc_5) +                    continue +                if _loc_5 <= 2047: +                    _loc_3 += compat_chr(192 | _loc_5 >> 6 & 31) + compat_chr(128 | _loc_5 & 63) +                    continue +                if _loc_5 <= 65535: +                    _loc_3 += compat_chr(224 | _loc_5 >> 12 & 15) + compat_chr(128 | _loc_5 >> 6 & 63) + compat_chr( +                        128 | _loc_5 & 63) +                    continue +                if _loc_5 <= 2097151: +                    _loc_3 += compat_chr(240 | _loc_5 >> 18 & 7) + compat_chr(128 | _loc_5 >> 12 & 63) + compat_chr( +                        128 | _loc_5 >> 6 & 63) + compat_chr(128 | _loc_5 & 63) +            return _loc_3 + +        @staticmethod +        def rstr2binl(param1): +            _loc_2 = [0] * ((len(param1) >> 2) + 1) +            for _loc_3 in range(0, len(_loc_2)): +                _loc_2[_loc_3] = 0 +            for _loc_3 in range(0, len(param1) * 8, 8): +                _loc_2[_loc_3 >> 5] |= (compat_ord(param1[_loc_3 // 8]) & 255) << _loc_3 % 32 +            return _loc_2 + +        @staticmethod +        def binl2rstr(param1): +            _loc_2 = '' +            for _loc_3 in range(0, len(param1) * 32, 8): +                _loc_2 += 
compat_chr(param1[_loc_3 >> 5] >> _loc_3 % 32 & 255) +            return _loc_2 + +        @classmethod +        def binl_md5(cls, param1, param2): +            param1 = cls.JSArray(param1) +            param1[param2 >> 5] |= 128 << param2 % 32 +            param1[(param2 + 64 >> 9 << 4) + 14] = param2 +            _loc_3 = 1732584193 +            _loc_4 = -271733879 +            _loc_5 = -1732584194 +            _loc_6 = 271733878 +            for _loc_7 in range(0, len(param1), 16): +                _loc_8 = _loc_3 +                _loc_9 = _loc_4 +                _loc_10 = _loc_5 +                _loc_11 = _loc_6 +                _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 0], 7, -680876936) +                _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 1], 12, -389564586) +                _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 2], 17, 606105819) +                _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 3], 22, -1044525330) +                _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 4], 7, -176418897) +                _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 5], 12, 1200080426) +                _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 6], 17, -1473231341) +                _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 7], 22, -45705983) +                _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 8], 7, 1770035416) +                _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 9], 12, -1958414417) +                _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 10], 17, -42063) +                _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 11], 22, -1990404162) +                _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 12], 7, 1804603682) +             
   _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 13], 12, -40341101) +                _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 14], 17, -1502002290) +                _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 15], 22, 1236535329) +                _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 1], 5, -165796510) +                _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 6], 9, -1069501632) +                _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 11], 14, 643717713) +                _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 0], 20, -373897302) +                _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 5], 5, -701558691) +                _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 10], 9, 38016083) +                _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 15], 14, -660478335) +                _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 4], 20, -405537848) +                _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 9], 5, 568446438) +                _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 14], 9, -1019803690) +                _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 3], 14, -187363961) +                _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 8], 20, 1163531501) +                _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 13], 5, -1444681467) +                _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 2], 9, -51403784) +                _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 7], 14, 1735328473) +                _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 12], 20, -1926607734) +                
_loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 5], 4, -378558) +                _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 8], 11, -2022574463) +                _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 11], 16, 1839030562) +                _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 14], 23, -35309556) +                _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 1], 4, -1530992060) +                _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 4], 11, 1272893353) +                _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 7], 16, -155497632) +                _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 10], 23, -1094730640) +                _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 13], 4, 681279174) +                _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 0], 11, -358537222) +                _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 3], 16, -722521979) +                _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 6], 23, 76029189) +                _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 9], 4, -640364487) +                _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 12], 11, -421815835) +                _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 15], 16, 530742520) +                _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 2], 23, -995338651) +                _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 0], 6, -198630844) +                _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 7], 10, 1126891415) +                _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 14], 15, -1416354905) +                _loc_4 
= cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 5], 21, -57434055) +                _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 12], 6, 1700485571) +                _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 3], 10, -1894986606) +                _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 10], 15, -1051523) +                _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 1], 21, -2054922799) +                _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 8], 6, 1873313359) +                _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 15], 10, -30611744) +                _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 6], 15, -1560198380) +                _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 13], 21, 1309151649) +                _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 4], 6, -145523070) +                _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 11], 10, -1120210379) +                _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 2], 15, 718787259) +                _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 9], 21, -343485551) +                _loc_3 = cls.safe_add(_loc_3, _loc_8) +                _loc_4 = cls.safe_add(_loc_4, _loc_9) +                _loc_5 = cls.safe_add(_loc_5, _loc_10) +                _loc_6 = cls.safe_add(_loc_6, _loc_11) +            return [_loc_3, _loc_4, _loc_5, _loc_6] + +        @classmethod +        def md5_cmn(cls, param1, param2, param3, param4, param5, param6): +            return cls.safe_add( +                cls.bit_rol(cls.safe_add(cls.safe_add(param2, param1), cls.safe_add(param4, param6)), param5), param3) + +        @classmethod +        def md5_ff(cls, param1, param2, param3, param4, param5, param6, param7): +            return 
cls.md5_cmn(param2 & param3 | ~param2 & param4, param1, param2, param5, param6, param7) + +        @classmethod +        def md5_gg(cls, param1, param2, param3, param4, param5, param6, param7): +            return cls.md5_cmn(param2 & param4 | param3 & ~param4, param1, param2, param5, param6, param7) + +        @classmethod +        def md5_hh(cls, param1, param2, param3, param4, param5, param6, param7): +            return cls.md5_cmn(param2 ^ param3 ^ param4, param1, param2, param5, param6, param7) + +        @classmethod +        def md5_ii(cls, param1, param2, param3, param4, param5, param6, param7): +            return cls.md5_cmn(param3 ^ (param2 | ~param4), param1, param2, param5, param6, param7) + +        @classmethod +        def safe_add(cls, param1, param2): +            _loc_3 = (param1 & 65535) + (param2 & 65535) +            _loc_4 = (param1 >> 16) + (param2 >> 16) + (_loc_3 >> 16) +            return cls.lshift(_loc_4, 16) | _loc_3 & 65535 + +        @classmethod +        def bit_rol(cls, param1, param2): +            return cls.lshift(param1, param2) | (param1 & 0xFFFFFFFF) >> (32 - param2) + +        @staticmethod +        def lshift(value, count): +            r = (0xFFFFFFFF & value) << count +            return -(~(r - 1) & 0xFFFFFFFF) if r > 0x7FFFFFFF else r + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) +        video_id = self._search_regex(self._VIDEOID_REGEXES, webpage, 'video id') + +        video = self._download_json( +            self._API_URL_TEMPLATE % video_id, video_id)['videos'][0] + +        title = video['title'] +        duration = float_or_none(video['duration'], 1000) +        like_count = video['likes'] +        uploader = video['channel'] +        uploader_id = video['channel_id'] + +        formats = [] + +        for resource in video['resources']: +            resource_id = resource.get('_id') +            if not resource_id: +      
          continue + +            security = self._download_json( +                self._SECURITY_URL_TEMPLATE % (video_id, resource_id), +                video_id, 'Downloading security hash for %s' % resource_id) + +            security_hash = security.get('hash') +            if not security_hash: +                message = security.get('message') +                if message: +                    raise ExtractorError( +                        '%s returned error: %s' % (self.IE_NAME, message), expected=True) +                continue + +            hash_code = security_hash[:2] +            received_time = int(security_hash[2:12]) +            received_random = security_hash[12:22] +            received_md5 = security_hash[22:] + +            sign_time = received_time + self._RESIGN_EXPIRATION +            padding = '%010d' % random.randint(1, 10000000000) + +            signed_md5 = self.MD5.b64_md5(received_md5 + compat_str(sign_time) + padding) +            signed_hash = hash_code + compat_str(received_time) + received_random + compat_str(sign_time) + padding + signed_md5 + +            formats.append({ +                'url': '%s?h=%s&k=%s' % (resource['url'], signed_hash, 'flash'), +                'format_id': resource_id, +                'height': resource['height'] +            }) + +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': title, +            'duration': duration, +            'uploader': uploader, +            'uploader_id': uploader_id, +            'like_count': like_count, +            'formats': formats +        }
\ No newline at end of file diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index 07d994b44..fcefe54cd 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -1,13 +1,11 @@  # coding: utf-8  from __future__ import unicode_literals -import datetime  import re +import codecs  from .common import InfoExtractor -from ..utils import ( -    ExtractorError, -) +from ..utils import unified_strdate  class GooglePlusIE(InfoExtractor): @@ -19,74 +17,57 @@ class GooglePlusIE(InfoExtractor):          'info_dict': {              'id': 'ZButuJc6CtH',              'ext': 'flv', +            'title': '嘆きの天使 降臨',              'upload_date': '20120613',              'uploader': '井上ヨシマサ', -            'title': '嘆きの天使 降臨',          }      }      def _real_extract(self, url): -        # Extract id from URL -        mobj = re.match(self._VALID_URL, url) - -        video_id = mobj.group('id') +        video_id = self._match_id(url)          # Step 1, Retrieve post webpage to extract further information          webpage = self._download_webpage(url, video_id, 'Downloading entry webpage') -        self.report_extraction(video_id) - -        # Extract update date -        upload_date = self._html_search_regex( +        title = self._og_search_description(webpage).splitlines()[0] +        upload_date = unified_strdate(self._html_search_regex(              r'''(?x)<a.+?class="o-U-s\s[^"]+"\s+style="display:\s*none"\s*>                      ([0-9]{4}-[0-9]{2}-[0-9]{2})</a>''', -            webpage, 'upload date', fatal=False, flags=re.VERBOSE) -        if upload_date: -            # Convert timestring to a format suitable for filename -            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d") -            upload_date = upload_date.strftime('%Y%m%d') - -        # Extract uploader -        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>', -            webpage, 'uploader', fatal=False) - - 
       # Extract title -        # Get the first line for title -        video_title = self._og_search_description(webpage).splitlines()[0] +            webpage, 'upload date', fatal=False, flags=re.VERBOSE)) +        uploader = self._html_search_regex( +            r'rel="author".*?>(.*?)</a>', webpage, 'uploader', fatal=False)          # Step 2, Simulate clicking the image box to launch video          DOMAIN = 'https://plus.google.com/' -        video_page = self._search_regex(r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN), +        video_page = self._search_regex( +            r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN),              webpage, 'video page URL')          if not video_page.startswith(DOMAIN):              video_page = DOMAIN + video_page          webpage = self._download_webpage(video_page, video_id, 'Downloading video page') -        # Extract video links all sizes -        pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"' -        mobj = re.findall(pattern, webpage) -        if len(mobj) == 0: -            raise ExtractorError('Unable to extract video links') - -        # Sort in resolution -        links = sorted(mobj) +        def unicode_escape(s): +            decoder = codecs.getdecoder('unicode_escape') +            return re.sub( +                r'\\u[0-9a-fA-F]{4,}', +                lambda m: decoder(m.group(0))[0], +                s) -        # Choose the lowest of the sort, i.e. highest resolution -        video_url = links[-1] -        # Only get the url. 
The resolution part in the tuple has no use anymore -        video_url = video_url[-1] -        # Treat escaped \u0026 style hex -        try: -            video_url = video_url.decode("unicode_escape") -        except AttributeError: # Python 3 -            video_url = bytes(video_url, 'ascii').decode('unicode-escape') +        # Extract video links all sizes +        formats = [{ +            'url': unicode_escape(video_url), +            'ext': 'flv', +            'width': int(width), +            'height': int(height), +        } for width, height, video_url in re.findall( +            r'\d+,(\d+),(\d+),"(https?://redirector\.googlevideo\.com.*?)"', webpage)] +        self._sort_formats(formats)          return {              'id': video_id, -            'url': video_url, +            'title': title,              'uploader': uploader,              'upload_date': upload_date, -            'title': video_title, -            'ext': 'flv', +            'formats': formats,          } diff --git a/youtube_dl/extractor/gorillavid.py b/youtube_dl/extractor/gorillavid.py index ca5f7c417..45cca1d24 100644 --- a/youtube_dl/extractor/gorillavid.py +++ b/youtube_dl/extractor/gorillavid.py @@ -5,6 +5,7 @@ import re  from .common import InfoExtractor  from ..utils import ( +    ExtractorError,      determine_ext,      compat_urllib_parse,      compat_urllib_request, @@ -12,20 +13,22 @@ from ..utils import (  class GorillaVidIE(InfoExtractor): -    IE_DESC = 'GorillaVid.in and daclips.in' +    IE_DESC = 'GorillaVid.in, daclips.in and movpod.in'      _VALID_URL = r'''(?x)          https?://(?P<host>(?:www\.)? -            (?:daclips\.in|gorillavid\.in))/ +            (?:daclips\.in|gorillavid\.in|movpod\.in))/          (?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)?      
''' +    _FILE_NOT_FOUND_REGEX = r'>(?:404 - )?File Not Found<' +      _TESTS = [{          'url': 'http://gorillavid.in/06y9juieqpmi',          'md5': '5ae4a3580620380619678ee4875893ba',          'info_dict': {              'id': '06y9juieqpmi',              'ext': 'flv', -            'title': 'Rebecca Black My Moment Official Music Video Reaction', +            'title': 'Rebecca Black My Moment Official Music Video Reaction-6GK87Rc8bzQ',              'thumbnail': 're:http://.*\.jpg',          },      }, { @@ -46,6 +49,9 @@ class GorillaVidIE(InfoExtractor):              'title': 'Micro Pig piglets ready on 16th July 2009',              'thumbnail': 're:http://.*\.jpg',          }, +    }, { +        'url': 'http://movpod.in/0wguyyxi1yca', +        'only_matching': True,      }]      def _real_extract(self, url): @@ -54,6 +60,9 @@ class GorillaVidIE(InfoExtractor):          webpage = self._download_webpage('http://%s/%s' % (mobj.group('host'), video_id), video_id) +        if re.search(self._FILE_NOT_FOUND_REGEX, webpage) is not None: +            raise ExtractorError('Video %s does not exist' % video_id, expected=True) +          fields = dict(re.findall(r'''(?x)<input\s+              type="hidden"\s+              name="([^"]+)"\s+ @@ -69,14 +78,14 @@ class GorillaVidIE(InfoExtractor):              webpage = self._download_webpage(req, video_id, 'Downloading video page') -        title = self._search_regex(r'style="z-index: [0-9]+;">([0-9a-zA-Z ]+)(?:-.+)?</span>', webpage, 'title') -        thumbnail = self._search_regex(r'image:\'(http[^\']+)\',', webpage, 'thumbnail') -        url = self._search_regex(r'file: \'(http[^\']+)\',', webpage, 'file url') +        title = self._search_regex(r'style="z-index: [0-9]+;">([^<]+)</span>', webpage, 'title') +        video_url = self._search_regex(r'file\s*:\s*\'(http[^\']+)\',', webpage, 'file url') +        thumbnail = self._search_regex(r'image\s*:\s*\'(http[^\']+)\',', webpage, 'thumbnail', fatal=False)          
formats = [{              'format_id': 'sd', -            'url': url, -            'ext': determine_ext(url), +            'url': video_url, +            'ext': determine_ext(video_url),              'quality': 1,          }] diff --git a/youtube_dl/extractor/hark.py b/youtube_dl/extractor/hark.py index 5bdd08afa..b6cc15b6f 100644 --- a/youtube_dl/extractor/hark.py +++ b/youtube_dl/extractor/hark.py @@ -1,37 +1,33 @@  # -*- coding: utf-8 -*- - -import re -import json +from __future__ import unicode_literals  from .common import InfoExtractor -from ..utils import determine_ext +  class HarkIE(InfoExtractor): -    _VALID_URL = r'https?://www\.hark\.com/clips/(.+?)-.+' +    _VALID_URL = r'https?://www\.hark\.com/clips/(?P<id>.+?)-.+'      _TEST = { -        u'url': u'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013', -        u'file': u'mmbzyhkgny.mp3', -        u'md5': u'6783a58491b47b92c7c1af5a77d4cbee', -        u'info_dict': { -            u'title': u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' on May 23, 2013", -            u'description': u'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.', -            u'duration': 11, +        'url': 'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013', +        'md5': '6783a58491b47b92c7c1af5a77d4cbee', +        'info_dict': { +            'id': 'mmbzyhkgny', +            'ext': 'mp3', +            'title': 'Obama: \'Beyond The Afghan Theater, We Only Target Al Qaeda\' on May 23, 2013', +            'description': 'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay 
prison facility, and American citizens who are terrorists.', +            'duration': 11,          }      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group(1) -        json_url = "http://www.hark.com/clips/%s.json" %(video_id) -        info_json = self._download_webpage(json_url, video_id) -        info = json.loads(info_json) -        final_url = info['url'] +        video_id = self._match_id(url) +        data = self._download_json( +            'http://www.hark.com/clips/%s.json' % video_id, video_id) -        return {'id': video_id, -                'url' : final_url, -                'title': info['name'], -                'ext': determine_ext(final_url), -                'description': info['description'], -                'thumbnail': info['image_original'], -                'duration': info['duration'], -                } +        return { +            'id': video_id, +            'url': data['url'], +            'title': data['name'], +            'description': data.get('description'), +            'thumbnail': data.get('image_original'), +            'duration': data.get('duration'), +        } diff --git a/youtube_dl/extractor/howstuffworks.py b/youtube_dl/extractor/howstuffworks.py index 68684b997..fccc23884 100644 --- a/youtube_dl/extractor/howstuffworks.py +++ b/youtube_dl/extractor/howstuffworks.py @@ -28,13 +28,13 @@ class HowStuffWorksIE(InfoExtractor):              }          },          { -            'url': 'http://adventure.howstuffworks.com/39516-deadliest-catch-jakes-farewell-pots-video.htm', +            'url': 'http://adventure.howstuffworks.com/7199-survival-zone-food-and-water-in-the-savanna-video.htm',              'info_dict': { -                'id': '553470', -                'display_id': 'deadliest-catch-jakes-farewell-pots', +                'id': '453464', +                'display_id': 'survival-zone-food-and-water-in-the-savanna',                  'ext': 'mp4', -  
              'title': 'Deadliest Catch: Jake\'s Farewell Pots', -                'description': 'md5:9632c346d5e43ee238028c9cefd8dbbc', +                'title': 'Survival Zone: Food and Water In the Savanna', +                'description': 'md5:7e1c89f6411434970c15fa094170c371',                  'thumbnail': 're:^https?://.*\.jpg$',              },              'params': { diff --git a/youtube_dl/extractor/huffpost.py b/youtube_dl/extractor/huffpost.py index 94e7cf790..4ccf6b9b8 100644 --- a/youtube_dl/extractor/huffpost.py +++ b/youtube_dl/extractor/huffpost.py @@ -33,8 +33,7 @@ class HuffPostIE(InfoExtractor):      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url)          api_url = 'http://embed.live.huffingtonpost.com/api/segments/%s.json' % video_id          data = self._download_json(api_url, video_id)['data'] diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py index a83dd249f..07ef682ee 100644 --- a/youtube_dl/extractor/izlesene.py +++ b/youtube_dl/extractor/izlesene.py @@ -63,7 +63,8 @@ class IzleseneIE(InfoExtractor):          title = self._og_search_title(webpage)          description = self._og_search_description(webpage) -        thumbnail = self._og_search_thumbnail(webpage) +        thumbnail = self._proto_relative_url( +            self._og_search_thumbnail(webpage), scheme='http:')          uploader = self._html_search_regex(              r"adduserUsername\s*=\s*'([^']+)';", diff --git a/youtube_dl/extractor/jpopsukitv.py b/youtube_dl/extractor/jpopsukitv.py index aad782578..122e2dd8c 100644 --- a/youtube_dl/extractor/jpopsukitv.py +++ b/youtube_dl/extractor/jpopsukitv.py @@ -1,8 +1,6 @@  # coding=utf-8  from __future__ import unicode_literals -import re -  from .common import InfoExtractor  from ..utils import (      int_or_none, @@ -12,14 +10,14 @@ from ..utils import (  class JpopsukiIE(InfoExtractor):     
 IE_NAME = 'jpopsuki.tv' -    _VALID_URL = r'https?://(?:www\.)?jpopsuki\.tv/video/(.*?)/(?P<id>\S+)' +    _VALID_URL = r'https?://(?:www\.)?jpopsuki\.tv/(?:category/)?video/[^/]+/(?P<id>\S+)'      _TEST = {          'url': 'http://www.jpopsuki.tv/video/ayumi-hamasaki---evolution/00be659d23b0b40508169cdee4545771',          'md5': '88018c0c1a9b1387940e90ec9e7e198e', -        'file': '00be659d23b0b40508169cdee4545771.mp4',          'info_dict': {              'id': '00be659d23b0b40508169cdee4545771', +            'ext': 'mp4',              'title': 'ayumi hamasaki - evolution',              'description': 'Release date: 2001.01.31\r\n浜崎あゆみ - evolution',              'thumbnail': 'http://www.jpopsuki.tv/cache/89722c74d2a2ebe58bcac65321c115b2.jpg', @@ -30,8 +28,7 @@ class JpopsukiIE(InfoExtractor):      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) @@ -47,11 +44,9 @@ class JpopsukiIE(InfoExtractor):          uploader_id = self._html_search_regex(              r'<li>from: <a href="/user/view/user/\S*?/uid/(\d*)',              webpage, 'video uploader_id', fatal=False) -        upload_date = self._html_search_regex( +        upload_date = unified_strdate(self._html_search_regex(              r'<li>uploaded: (.*?)</li>', webpage, 'video upload_date', -            fatal=False) -        if upload_date is not None: -            upload_date = unified_strdate(upload_date) +            fatal=False))          view_count_str = self._html_search_regex(              r'<li>Hits: ([0-9]+?)</li>', webpage, 'video view_count',              fatal=False) diff --git a/youtube_dl/extractor/justintv.py b/youtube_dl/extractor/justintv.py deleted file mode 100644 index 27017e89f..000000000 --- a/youtube_dl/extractor/justintv.py +++ /dev/null @@ -1,155 +0,0 @@ -from __future__ import unicode_literals - -import itertools 
-import json -import os -import re - -from .common import InfoExtractor -from ..utils import ( -    compat_str, -    ExtractorError, -    formatSeconds, -) - - -class JustinTVIE(InfoExtractor): -    """Information extractor for justin.tv and twitch.tv""" -    # TODO: One broadcast may be split into multiple videos. The key -    # 'broadcast_id' is the same for all parts, and 'broadcast_part' -    # starts at 1 and increases. Can we treat all parts as one video? - -    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/ -        (?: -            (?P<channelid>[^/]+)| -            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))| -            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+)) -        ) -        /?(?:\#.*)?$ -        """ -    _JUSTIN_PAGE_LIMIT = 100 -    IE_NAME = 'justin.tv' -    IE_DESC = 'justin.tv and twitch.tv' -    _TEST = { -        'url': 'http://www.twitch.tv/thegamedevhub/b/296128360', -        'md5': 'ecaa8a790c22a40770901460af191c9a', -        'info_dict': { -            'id': '296128360', -            'ext': 'flv', -            'upload_date': '20110927', -            'uploader_id': 25114803, -            'uploader': 'thegamedevhub', -            'title': 'Beginner Series - Scripting With Python Pt.1' -        } -    } - -    # Return count of items, list of *valid* items -    def _parse_page(self, url, video_id, counter): -        info_json = self._download_webpage( -            url, video_id, -            'Downloading video info JSON on page %d' % counter, -            'Unable to download video info JSON %d' % counter) - -        response = json.loads(info_json) -        if type(response) != list: -            error_text = response.get('error', 'unknown error') -            raise ExtractorError('Justin.tv API: %s' % error_text) -        info = [] -        for clip in response: -            video_url = clip['video_file_url'] -            if video_url: -                video_extension = os.path.splitext(video_url)[1][1:] -                video_date = 
re.sub('-', '', clip['start_time'][:10]) -                video_uploader_id = clip.get('user_id', clip.get('channel_id')) -                video_id = clip['id'] -                video_title = clip.get('title', video_id) -                info.append({ -                    'id': compat_str(video_id), -                    'url': video_url, -                    'title': video_title, -                    'uploader': clip.get('channel_name', video_uploader_id), -                    'uploader_id': video_uploader_id, -                    'upload_date': video_date, -                    'ext': video_extension, -                }) -        return (len(response), info) - -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) - -        api_base = 'http://api.justin.tv' -        paged = False -        if mobj.group('channelid'): -            paged = True -            video_id = mobj.group('channelid') -            api = api_base + '/channel/archives/%s.json' % video_id -        elif mobj.group('chapterid'): -            chapter_id = mobj.group('chapterid') - -            webpage = self._download_webpage(url, chapter_id) -            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage) -            if not m: -                raise ExtractorError('Cannot find archive of a chapter') -            archive_id = m.group(1) - -            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id -            doc = self._download_xml( -                api, chapter_id, -                note='Downloading chapter information', -                errnote='Chapter information download failed') -            for a in doc.findall('.//archive'): -                if archive_id == a.find('./id').text: -                    break -            else: -                raise ExtractorError('Could not find chapter in chapter information') - -            video_url = a.find('./video_file_url').text -            video_ext = video_url.rpartition('.')[2] or 'flv' - -            
chapter_api_url = 'https://api.twitch.tv/kraken/videos/c' + chapter_id -            chapter_info = self._download_json( -                chapter_api_url, 'c' + chapter_id, -                note='Downloading chapter metadata', -                errnote='Download of chapter metadata failed') - -            bracket_start = int(doc.find('.//bracket_start').text) -            bracket_end = int(doc.find('.//bracket_end').text) - -            # TODO determine start (and probably fix up file) -            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457 -            #video_url += '?start=' + TODO:start_timestamp -            # bracket_start is 13290, but we want 51670615 -            self._downloader.report_warning('Chapter detected, but we can just download the whole file. ' -                                            'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end))) - -            info = { -                'id': 'c' + chapter_id, -                'url': video_url, -                'ext': video_ext, -                'title': chapter_info['title'], -                'thumbnail': chapter_info['preview'], -                'description': chapter_info['description'], -                'uploader': chapter_info['channel']['display_name'], -                'uploader_id': chapter_info['channel']['name'], -            } -            return info -        else: -            video_id = mobj.group('videoid') -            api = api_base + '/broadcast/by_archive/%s.json' % video_id - -        entries = [] -        offset = 0 -        limit = self._JUSTIN_PAGE_LIMIT -        for counter in itertools.count(1): -            page_url = api + ('?offset=%d&limit=%d' % (offset, limit)) -            page_count, page_info = self._parse_page( -                page_url, video_id, counter) -            entries.extend(page_info) -            if not paged or page_count != limit: -                break -            offset += limit -        
return { -            '_type': 'playlist', -            'id': video_id, -            'entries': entries, -        } diff --git a/youtube_dl/extractor/kontrtube.py b/youtube_dl/extractor/kontrtube.py index 5341ac773..8a73ecfa0 100644 --- a/youtube_dl/extractor/kontrtube.py +++ b/youtube_dl/extractor/kontrtube.py @@ -34,7 +34,7 @@ class KontrTubeIE(InfoExtractor):          video_url = self._html_search_regex(r"video_url: '(.+?)/?',", webpage, 'video URL')          thumbnail = self._html_search_regex(r"preview_url: '(.+?)/?',", webpage, 'video thumbnail', fatal=False)          title = self._html_search_regex( -            r'<title>(.+?) - Труба зовёт - Интересный видеохостинг</title>', webpage, 'video title') +            r'<title>(.+?)</title>', webpage, 'video title')          description = self._html_search_meta('description', webpage, 'video description')          mobj = re.search( diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py index fca0bfef0..db5df4078 100644 --- a/youtube_dl/extractor/lrt.py +++ b/youtube_dl/extractor/lrt.py @@ -22,7 +22,7 @@ class LRTIE(InfoExtractor):              'id': '54391',              'ext': 'mp4',              'title': 'Septynios Kauno dienos', -            'description': 'Kauno miesto ir apskrities naujienos', +            'description': 'md5:24d84534c7dc76581e59f5689462411a',              'duration': 1783,          },          'params': { diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 979f3d692..6691521e5 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -6,6 +6,7 @@ import json  from .common import InfoExtractor  from ..utils import (      compat_urllib_parse, +    compat_urlparse,      get_element_by_attribute,      parse_duration,      strip_jsonp, @@ -39,13 +40,21 @@ class MiTeleIE(InfoExtractor):          ).replace('\'', '"')          embed_data = json.loads(embed_data_json) -        info_url = embed_data['flashvars']['host'] +        
domain = embed_data['mediaUrl'] +        if not domain.startswith('http'): +            # only happens in telecinco.es videos +            domain = 'http://' + domain +        info_url = compat_urlparse.urljoin( +            domain, +            compat_urllib_parse.unquote(embed_data['flashvars']['host']) +        )          info_el = self._download_xml(info_url, episode).find('./video/info')          video_link = info_el.find('videoUrl/link').text          token_query = compat_urllib_parse.urlencode({'id': video_link})          token_info = self._download_json( -            'http://token.mitele.es/?' + token_query, episode, +            embed_data['flashvars']['ov_tk'] + '?' + token_query, +            episode,              transform_source=strip_jsonp          ) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 520f27fca..bb8937c4d 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -33,22 +33,22 @@ class MixcloudIE(InfoExtractor):          },      } -    def check_urls(self, url_list): -        """Returns 1st active url from list""" -        for url in url_list: +    def _get_url(self, track_id, template_url): +        server_count = 30 +        for i in range(server_count): +            url = template_url % i              try:                  # We only want to know if the request succeed                  # don't download the whole file -                self._request_webpage(HEADRequest(url), None, False) +                self._request_webpage( +                    HEADRequest(url), track_id, +                    'Checking URL %d/%d ...' 
% (i + 1, server_count + 1))                  return url              except ExtractorError: -                url = None +                pass          return None -    def _get_url(self, template_url): -        return self.check_urls(template_url % i for i in range(30)) -      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          uploader = mobj.group(1) @@ -61,16 +61,16 @@ class MixcloudIE(InfoExtractor):              r'\s(?:data-preview-url|m-preview)="(.+?)"', webpage, 'preview url')          song_url = preview_url.replace('/previews/', '/c/originals/')          template_url = re.sub(r'(stream\d*)', 'stream%d', song_url) -        final_song_url = self._get_url(template_url) +        final_song_url = self._get_url(track_id, template_url)          if final_song_url is None:              self.to_screen('Trying with m4a extension')              template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/') -            final_song_url = self._get_url(template_url) +            final_song_url = self._get_url(track_id, template_url)          if final_song_url is None:              raise ExtractorError('Unable to extract track url')          PREFIX = ( -            r'<div class="cloudcast-play-button-container"' +            r'<div class="cloudcast-play-button-container[^"]*?"'              r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+')          title = self._html_search_regex(              PREFIX + r'm-title="([^"]+)"', webpage, 'title') diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py index bfdb462eb..42aa2e227 100644 --- a/youtube_dl/extractor/mlb.py +++ b/youtube_dl/extractor/mlb.py @@ -6,7 +6,6 @@ from .common import InfoExtractor  from ..utils import (      parse_duration,      parse_iso8601, -    find_xpath_attr,  ) @@ -88,8 +87,9 @@ class MLBIE(InfoExtractor):          duration = parse_duration(detail.find('./duration').text)          timestamp = parse_iso8601(detail.attrib['date'][:-5]) -  
      thumbnail = find_xpath_attr( -            detail, './thumbnailScenarios/thumbnailScenario', 'type', '45').text +        thumbnails = [{ +            'url': thumbnail.text, +        } for thumbnail in detail.findall('./thumbnailScenarios/thumbnailScenario')]          formats = []          for media_url in detail.findall('./url'): @@ -116,5 +116,5 @@ class MLBIE(InfoExtractor):              'duration': duration,              'timestamp': timestamp,              'formats': formats, -            'thumbnail': thumbnail, +            'thumbnails': thumbnails,          } diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 6229b2173..3621ff99e 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -5,20 +5,20 @@ import re  from .common import InfoExtractor  from ..utils import ( -    int_or_none, +    str_to_int,      unified_strdate,  )  class MotherlessIE(InfoExtractor): -    _VALID_URL = r'http://(?:www\.)?motherless\.com/(?P<id>[A-Z0-9]+)' +    _VALID_URL = r'http://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)'      _TESTS = [          {              'url': 'http://motherless.com/AC3FFE1', -            'md5': '5527fef81d2e529215dad3c2d744a7d9', +            'md5': '310f62e325a9fafe64f68c0bccb6e75f',              'info_dict': {                  'id': 'AC3FFE1', -                'ext': 'flv', +                'ext': 'mp4',                  'title': 'Fucked in the ass while playing PS3',                  'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'],                  'upload_date': '20100913', @@ -40,33 +40,51 @@ class MotherlessIE(InfoExtractor):                  'thumbnail': 're:http://.*\.jpg',                  'age_limit': 18,              } +        }, +        { +            'url': 'http://motherless.com/g/cosplay/633979F', +            'md5': '0b2a43f447a49c3e649c93ad1fafa4a0', +            'info_dict': { +                'id': '633979F', +            
    'ext': 'mp4', +                'title': 'Turtlette', +                'categories': ['superheroine heroine  superher'], +                'upload_date': '20140827', +                'uploader_id': 'shade0230', +                'thumbnail': 're:http://.*\.jpg', +                'age_limit': 18, +            }          }      ] -    def _real_extract(self,url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') - +    def _real_extract(self, url): +        video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) -        title = self._html_search_regex(r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') -         -        video_url = self._html_search_regex(r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video_url') +        title = self._html_search_regex( +            r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') +        video_url = self._html_search_regex( +            r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video URL')          age_limit = self._rta_search(webpage) - -        view_count = self._html_search_regex(r'<strong>Views</strong>\s+([^<]+)<', webpage, 'view_count') +        view_count = str_to_int(self._html_search_regex( +            r'<strong>Views</strong>\s+([^<]+)<', +            webpage, 'view count', fatal=False)) +        like_count = str_to_int(self._html_search_regex( +            r'<strong>Favorited</strong>\s+([^<]+)<', +            webpage, 'like count', fatal=False)) -        upload_date = self._html_search_regex(r'<strong>Uploaded</strong>\s+([^<]+)<', webpage, 'upload_date') +        upload_date = self._html_search_regex( +            r'<strong>Uploaded</strong>\s+([^<]+)<', webpage, 'upload date')          if 'Ago' in upload_date:              days = int(re.search(r'([0-9]+)', upload_date).group(1))              upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d')          else:              upload_date = 
unified_strdate(upload_date) -        like_count = self._html_search_regex(r'<strong>Favorited</strong>\s+([^<]+)<', webpage, 'like_count') -          comment_count = webpage.count('class="media-comment-contents"') -        uploader_id = self._html_search_regex(r'"thumb-member-username">\s+<a href="/m/([^"]+)"', webpage, 'uploader_id') +        uploader_id = self._html_search_regex( +            r'"thumb-member-username">\s+<a href="/m/([^"]+)"', +            webpage, 'uploader_id')          categories = self._html_search_meta('keywords', webpage)          if categories: @@ -79,8 +97,8 @@ class MotherlessIE(InfoExtractor):              'uploader_id': uploader_id,              'thumbnail': self._og_search_thumbnail(webpage),              'categories': categories, -            'view_count': int_or_none(view_count.replace(',', '')), -            'like_count': int_or_none(like_count.replace(',', '')), +            'view_count': view_count, +            'like_count': like_count,              'comment_count': comment_count,              'age_limit': age_limit,              'url': video_url, diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 94d5ba982..add4b3e5d 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -18,16 +18,16 @@ class NDRIE(InfoExtractor):      _TESTS = [          { -            'url': 'http://www.ndr.de/fernsehen/media/dienordreportage325.html', -            'md5': '4a4eeafd17c3058b65f0c8f091355855', +            'url': 'http://www.ndr.de/fernsehen/sendungen/nordmagazin/Kartoffeltage-in-der-Lewitz,nordmagazin25866.html', +            'md5': '5bc5f5b92c82c0f8b26cddca34f8bb2c',              'note': 'Video file',              'info_dict': { -                'id': '325', +                'id': '25866',                  'ext': 'mp4', -                'title': 'Blaue Bohnen aus Blocken', -                'description': 'md5:190d71ba2ccddc805ed01547718963bc', -                'duration': 1715, -            }, 
+                'title': 'Kartoffeltage in der Lewitz', +                'description': 'md5:48c4c04dde604c8a9971b3d4e3b9eaa8', +                'duration': 166, +            }          },          {              'url': 'http://www.ndr.de/info/audio51535.html', diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py index 4832b3ce4..cc7c921c3 100644 --- a/youtube_dl/extractor/nfl.py +++ b/youtube_dl/extractor/nfl.py @@ -6,7 +6,7 @@ import re  from .common import InfoExtractor  from ..utils import (      ExtractorError, -    compat_urllib_parse, +    compat_urllib_parse_urlparse,      int_or_none,      remove_end,  ) @@ -90,7 +90,7 @@ class NFLIE(InfoExtractor):          cdn_data = video_data.get('cdnData', {})          streams = cdn_data.get('bitrateInfo', [])          if cdn_data.get('format') == 'EXTERNAL_HTTP_STREAM': -            parts = compat_urllib_parse.urlparse(cdn_data.get('uri')) +            parts = compat_urllib_parse_urlparse(cdn_data.get('uri'))              protocol, host = parts.scheme, parts.netloc              for stream in streams:                  formats.append( diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py index 072d9cf8e..d66c2c6f8 100644 --- a/youtube_dl/extractor/nhl.py +++ b/youtube_dl/extractor/nhl.py @@ -22,21 +22,23 @@ class NHLBaseInfoExtractor(InfoExtractor):          self.report_extraction(video_id)          initial_video_url = info['publishPoint'] -        data = compat_urllib_parse.urlencode({ -            'type': 'fvod', -            'path': initial_video_url.replace('.mp4', '_sd.mp4'), -        }) -        path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' 
+ data -        path_doc = self._download_xml( -            path_url, video_id, 'Downloading final video url') -        video_url = path_doc.find('path').text +        if info['formats'] == '1': +            data = compat_urllib_parse.urlencode({ +                'type': 'fvod', +                'path': initial_video_url.replace('.mp4', '_sd.mp4'), +            }) +            path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' + data +            path_doc = self._download_xml( +                path_url, video_id, 'Downloading final video url') +            video_url = path_doc.find('path').text +        else: +           video_url = initial_video_url          join = compat_urlparse.urljoin          return {              'id': video_id,              'title': info['name'],              'url': video_url, -            'ext': determine_ext(video_url),              'description': info['description'],              'duration': int(info['duration']),              'thumbnail': join(join(video_url, '/u/'), info['bigImage']), @@ -46,10 +48,11 @@ class NHLBaseInfoExtractor(InfoExtractor):  class NHLIE(NHLBaseInfoExtractor):      IE_NAME = 'nhl.com' -    _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/console(?:\?(?:.*?[?&])?)id=(?P<id>[0-9]+)' +    _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/console(?:\?(?:.*?[?&])?)id=(?P<id>[0-9a-z-]+)'      _TESTS = [{          'url': 'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614', +        'md5': 'db704a4ea09e8d3988c85e36cc892d09',          'info_dict': {              'id': '453614',              'ext': 'mp4', @@ -59,6 +62,17 @@ class NHLIE(NHLBaseInfoExtractor):              'upload_date': '20131006',          },      }, { +        'url': 'http://video.nhl.com/videocenter/console?id=2014020024-628-h', +        'md5': 'd22e82bc592f52d37d24b03531ee9696', +        'info_dict': { +            'id': '2014020024-628-h', +            'ext': 'mp4', +        
    'title': 'Alex Galchenyuk Goal on Ray Emery (14:40/3rd)', +            'description': 'Home broadcast - Montreal Canadiens at Philadelphia Flyers - October 11, 2014', +            'duration': 0, +            'upload_date': '20141011', +        }, +    }, {          'url': 'http://video.flames.nhl.com/videocenter/console?id=630616',          'only_matching': True,      }] diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index c0c139b5d..7b85589b7 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -39,18 +39,17 @@ class NiconicoIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/((?:[a-z]{2})?[0-9]+)'      _NETRC_MACHINE = 'niconico' -    # Determine whether the downloader uses authentication to download video -    _AUTHENTICATE = False +    # Determine whether the downloader used authentication to download video +    _AUTHENTICATED = False      def _real_initialize(self): -        if self._downloader.params.get('username', None) is not None: -            self._AUTHENTICATE = True - -        if self._AUTHENTICATE: -            self._login() +        self._login()      def _login(self):          (username, password) = self._get_login_info() +        # No authentication to be performed +        if not username: +            return True          # Log in          login_form_strs = { @@ -68,6 +67,8 @@ class NiconicoIE(InfoExtractor):          if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None:              self._downloader.report_warning('unable to log in: bad username or password')              return False +        # Successful login +        self._AUTHENTICATED = True          return True      def _real_extract(self, url): @@ -82,7 +83,7 @@ class NiconicoIE(InfoExtractor):              'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,              note='Downloading video info page') -        if self._AUTHENTICATE: 
+        if self._AUTHENTICATED:              # Get flv info              flv_info_webpage = self._download_webpage(                  'http://flapi.nicovideo.jp/api/getflv?v=' + video_id, diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 8f140d626..6118ed5c2 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -80,8 +80,14 @@ class PBSIE(InfoExtractor):                  'thumbnail': 're:^https?://.*\.jpg$',                  'upload_date': '20140122',              } +        }, +        { +            'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/', +            'info_dict': { +                'id': 'united-states-of-secrets', +            }, +            'playlist_count': 2,          } -      ]      def _extract_webpage(self, url): @@ -96,6 +102,12 @@ class PBSIE(InfoExtractor):                  r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"',                  webpage, 'upload date', default=None)) +            # tabbed frontline videos +            tabbed_videos = re.findall( +                r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"', webpage) +            if tabbed_videos: +                return tabbed_videos, presumptive_id, upload_date +              MEDIA_ID_REGEXES = [                  r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'",  # frontline video embed                  r'class="coveplayerid">([^<]+)<',                       # coveplayer @@ -130,6 +142,12 @@ class PBSIE(InfoExtractor):      def _real_extract(self, url):          video_id, display_id, upload_date = self._extract_webpage(url) +        if isinstance(video_id, list): +            entries = [self.url_result( +                'http://video.pbs.org/video/%s' % vid_id, 'PBS', vid_id) +                for vid_id in video_id] +            return self.playlist_result(entries, display_id) +          info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id          info = 
self._download_json(info_url, display_id) diff --git a/youtube_dl/extractor/planetaplay.py b/youtube_dl/extractor/planetaplay.py new file mode 100644 index 000000000..596c621d7 --- /dev/null +++ b/youtube_dl/extractor/planetaplay.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class PlanetaPlayIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?planetaplay\.com/\?sng=(?P<id>[0-9]+)' +    _API_URL = 'http://planetaplay.com/action/playlist/?sng={0:}' +    _THUMBNAIL_URL = 'http://planetaplay.com/img/thumb/{thumb:}' +    _TEST = { +        'url': 'http://planetaplay.com/?sng=3586', +        'md5': '9d569dceb7251a4e01355d5aea60f9db', +        'info_dict': { +            'id': '3586', +            'ext': 'flv', +            'title': 'md5:e829428ee28b1deed00de90de49d1da1', +        } +    } + +    _SONG_FORMATS = { +        'lq': (0, 'http://www.planetaplay.com/videoplayback/{med_hash:}'), +        'hq': (1, 'http://www.planetaplay.com/videoplayback/hi/{med_hash:}'), +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        response = self._download_json( +            self._API_URL.format(video_id), video_id)['response'] +        try: +            data = response.get('data')[0] +        except IndexError: +            raise ExtractorError( +                '%s: failed to get the playlist' % self.IE_NAME, expected=True) + +        title = '{song_artists:} - {sng_name:}'.format(**data) +        thumbnail = self._THUMBNAIL_URL.format(**data) + +        formats = [] +        for format_id, (quality, url_template) in self._SONG_FORMATS.items(): +            formats.append({ +                'format_id': format_id, +                'url': url_template.format(**data), +                'quality': quality, +                'ext': 'flv', +            }) + +        
self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': title, +            'formats': formats, +            'thumbnail': thumbnail, +        } diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index 48ce6e730..bac484c67 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -4,19 +4,27 @@ import re  import json  from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( +    int_or_none, +    js_to_json, +    qualities, +    determine_ext, +)  class PornHdIE(InfoExtractor): -    _VALID_URL = r'http://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)' +    _VALID_URL = r'http://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?'      _TEST = {          'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video',          'md5': '956b8ca569f7f4d8ec563e2c41598441',          'info_dict': {              'id': '1962', +            'display_id': 'sierra-day-gets-his-cum-all-over-herself-hd-porn-video',              'ext': 'mp4',              'title': 'Sierra loves doing laundry',              'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294', +            'thumbnail': 're:^https?://.*\.jpg', +            'view_count': int,              'age_limit': 18,          }      } @@ -24,8 +32,9 @@ class PornHdIE(InfoExtractor):      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id') +        display_id = mobj.group('display_id') -        webpage = self._download_webpage(url, video_id) +        webpage = self._download_webpage(url, display_id or video_id)          title = self._html_search_regex(              r'<title>(.+) porn HD.+?</title>', webpage, 'title') @@ -33,38 +42,21 @@ class PornHdIE(InfoExtractor):              r'<div class="description">([^<]+)</div>', webpage, 'description', fatal=False)          
view_count = int_or_none(self._html_search_regex(              r'(\d+) views\s*</span>', webpage, 'view count', fatal=False)) +        thumbnail = self._search_regex( +            r"'poster'\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False) -        videos = re.findall( -            r'var __video([\da-zA-Z]+?)(Low|High)StreamUrl = \'(http://.+?)\?noProxy=1\'', webpage) - -        mobj = re.search(r'flashVars = (?P<flashvars>{.+?});', webpage) -        if mobj: -            flashvars = json.loads(mobj.group('flashvars')) -            for key, quality in [('hashlink', 'low'), ('hd', 'high')]: -                redirect_url = flashvars.get(key) -                if redirect_url: -                    videos.append(('flv', quality, redirect_url)) -            thumbnail = flashvars['urlWallpaper'] -        else: -            thumbnail = self._og_search_thumbnail(webpage) - -        formats = [] -        for format_, quality, redirect_url in videos: -            format_id = '%s-%s' % (format_.lower(), quality.lower()) -            video_url = self._download_webpage( -                redirect_url, video_id, 'Downloading %s video link' % format_id, fatal=False) -            if not video_url: -                continue -            formats.append({ -                'url': video_url, -                'ext': format_.lower(), -                'format_id': format_id, -                'quality': 1 if quality.lower() == 'high' else 0, -            }) +        quality = qualities(['SD', 'HD']) +        formats = [{ +            'url': source['file'], +            'format_id': '%s-%s' % (source['label'], determine_ext(source['file'])), +            'quality': quality(source['label']), +        } for source in json.loads(js_to_json(self._search_regex( +            r"(?s)'sources'\s*:\s*(\[.+?\])", webpage, 'sources')))]          self._sort_formats(formats)          return {              'id': video_id, +            'display_id': display_id,              'title': title,              
'description': description,              'thumbnail': thumbnail, diff --git a/youtube_dl/extractor/sexykarma.py b/youtube_dl/extractor/sexykarma.py new file mode 100644 index 000000000..c833fc8ee --- /dev/null +++ b/youtube_dl/extractor/sexykarma.py @@ -0,0 +1,117 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    unified_strdate, +    parse_duration, +    int_or_none, +) + + +class SexyKarmaIE(InfoExtractor): +    IE_DESC = 'Sexy Karma and Watch Indian Porn' +    _VALID_URL = r'https?://(?:www\.)?(?:sexykarma\.com|watchindianporn\.net)/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html' +    _TESTS = [{ +        'url': 'http://www.sexykarma.com/gonewild/video/taking-a-quick-pee-yHI70cOyIHt.html', +        'md5': 'b9798e7d1ef1765116a8f516c8091dbd', +        'info_dict': { +            'id': 'yHI70cOyIHt', +            'display_id': 'taking-a-quick-pee', +            'ext': 'mp4', +            'title': 'Taking a quick pee.', +            'thumbnail': 're:^https?://.*\.jpg$', +            'uploader': 'wildginger7', +            'upload_date': '20141007', +            'duration': 22, +            'view_count': int, +            'comment_count': int, +            'categories': list, +        } +    }, { +        'url': 'http://www.sexykarma.com/gonewild/video/pot-pixie-tribute-8Id6EZPbuHf.html', +        'md5': 'dd216c68d29b49b12842b9babe762a5d', +        'info_dict': { +            'id': '8Id6EZPbuHf', +            'display_id': 'pot-pixie-tribute', +            'ext': 'mp4', +            'title': 'pot_pixie tribute', +            'thumbnail': 're:^https?://.*\.jpg$', +            'uploader': 'banffite', +            'upload_date': '20141013', +            'duration': 16, +            'view_count': int, +            'comment_count': int, +            'categories': list, +        } +    }, { +        'url': 
'http://www.watchindianporn.net/video/desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number-dW2mtctxJfs.html', +        'md5': '9afb80675550406ed9a63ac2819ef69d', +        'info_dict': { +            'id': 'dW2mtctxJfs', +            'display_id': 'desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number', +            'ext': 'mp4', +            'title': 'Desi dancer namrata stripping completely nude and dancing on a hot number', +            'thumbnail': 're:^https?://.*\.jpg$', +            'uploader': 'Don', +            'upload_date': '20140213', +            'duration': 83, +            'view_count': int, +            'comment_count': int, +            'categories': list, +        } +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        display_id = mobj.group('display_id') + +        webpage = self._download_webpage(url, display_id) + +        video_url = self._html_search_regex( +            r"url: escape\('([^']+)'\)", webpage, 'url') + +        title = self._html_search_regex( +            r'<h2 class="he2"><span>(.*?)</span>', +            webpage, 'title') +        thumbnail = self._html_search_regex( +            r'<span id="container"><img\s+src="([^"]+)"', +            webpage, 'thumbnail', fatal=False) + +        uploader = self._html_search_regex( +            r'class="aupa">\s*(.*?)</a>', +            webpage, 'uploader') +        upload_date = unified_strdate(self._html_search_regex( +            r'Added: <strong>(.+?)</strong>', webpage, 'upload date', fatal=False)) + +        duration = parse_duration(self._search_regex( +            r'<td>Time:\s*</td>\s*<td align="right"><span>\s*(.+?)\s*</span>', +            webpage, 'duration', fatal=False)) + +        view_count = int_or_none(self._search_regex( +            r'<td>Views:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>', +            webpage, 'view count', 
fatal=False)) +        comment_count = int_or_none(self._search_regex( +            r'<td>Comments:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>', +            webpage, 'comment count', fatal=False)) + +        categories = re.findall( +            r'<a href="[^"]+/search/video/desi"><span>([^<]+)</span></a>', +            webpage) + +        return { +            'id': video_id, +            'display_id': display_id, +            'url': video_url, +            'title': title, +            'thumbnail': thumbnail, +            'uploader': uploader, +            'upload_date': upload_date, +            'duration': duration, +            'view_count': view_count, +            'comment_count': comment_count, +            'categories': categories, +        } diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 4719ba45c..c77671fd3 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -40,14 +40,15 @@ class SoundcloudIE(InfoExtractor):      _TESTS = [          {              'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', -            'file': '62986583.mp3',              'md5': 'ebef0a451b909710ed1d7787dddbf0d7',              'info_dict': { -                "upload_date": "20121011", -                "description": "No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd", -                "uploader": "E.T. ExTerrestrial Music", -                "title": "Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1", -                "duration": 143, +                'id': '62986583', +                'ext': 'mp3', +                'upload_date': '20121011', +                'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d', +                'uploader': 'E.T. 
ExTerrestrial Music', +                'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1', +                'duration': 143,              }          },          # not streamable song @@ -103,7 +104,7 @@ class SoundcloudIE(InfoExtractor):                  'id': '128590877',                  'ext': 'mp3',                  'title': 'Bus Brakes', -                'description': 'md5:0170be75dd395c96025d210d261c784e', +                'description': 'md5:0053ca6396e8d2fd7b7e1595ef12ab66',                  'uploader': 'oddsamples',                  'upload_date': '20140109',                  'duration': 17, diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 21491027a..94602e89e 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -45,7 +45,7 @@ class SpankwireIE(InfoExtractor):              r'<div\s+id="descriptionContent">([^<]+)<',              webpage, 'description', fatal=False)          thumbnail = self._html_search_regex( -            r'playerData\.screenShot\s*=\s*"([^"]+)"', +            r'playerData\.screenShot\s*=\s*["\']([^"\']+)["\']',              webpage, 'thumbnail', fatal=False)          uploader = self._html_search_regex( @@ -67,7 +67,7 @@ class SpankwireIE(InfoExtractor):          video_urls = list(map(              compat_urllib_parse.unquote, -            re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*"([^"]+)', webpage))) +            re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*["\']([^"\']+)["\']', webpage)))          if webpage.find('flashvars\.encrypted = "true"') != -1:              password = self._html_search_regex(                  r'flashvars\.video_title = "([^"]+)', diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py new file mode 100644 index 000000000..b9cd35109 --- /dev/null +++ b/youtube_dl/extractor/sportbox.py @@ -0,0 +1,77 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + 
+from .common import InfoExtractor +from ..utils import ( +    parse_duration, +    parse_iso8601, +    int_or_none, +) + + +class SportBoxIE(InfoExtractor): +    _VALID_URL = r'https?://news\.sportbox\.ru/Vidy_sporta/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)' +    _TESTS = [ +        { +            'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S', +            'md5': 'ff56a598c2cf411a9a38a69709e97079', +            'info_dict': { +                'id': '80822', +                'ext': 'mp4', +                'title': 'Гонка 2  заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн', +                'description': 'md5:81715fa9c4ea3d9e7915dc8180c778ed', +                'thumbnail': 're:^https?://.*\.jpg$', +                'timestamp': 1411896237, +                'upload_date': '20140928', +                'duration': 4846, +            }, +            'params': { +                # m3u8 download +                'skip_download': True, +            }, +        }, { +            'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4', +            'only_matching': True, +        } +    ] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        display_id = mobj.group('display_id') + +        webpage = self._download_webpage(url, display_id) + +        video_id = self._search_regex( +            r'src="/vdl/player/media/(\d+)"', webpage, 'video id') + +        player = self._download_webpage( +            'http://news.sportbox.ru/vdl/player/media/%s' % video_id, +            display_id, 'Downloading player webpage') + +        hls = self._search_regex( +            r"var\s+original_hls_file\s*=\s*'([^']+)'", player, 'hls file') + +        formats = self._extract_m3u8_formats(hls, display_id, 'mp4') + +        title = self._html_search_regex( +            r'<h1 
itemprop="name">([^<]+)</h1>', webpage, 'title') +        description = self._html_search_regex( +            r'(?s)<div itemprop="description">(.+?)</div>', webpage, 'description', fatal=False) +        thumbnail = self._og_search_thumbnail(webpage) +        timestamp = parse_iso8601(self._search_regex( +            r'<span itemprop="uploadDate">([^<]+)</span>', webpage, 'timestamp', fatal=False)) +        duration = parse_duration(self._html_search_regex( +            r'<meta itemprop="duration" content="PT([^"]+)">', webpage, 'duration', fatal=False)) + +        return { +            'id': video_id, +            'display_id': display_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'timestamp': timestamp, +            'duration': duration, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/sunporno.py b/youtube_dl/extractor/sunporno.py index 7de3c9dd5..263f09b46 100644 --- a/youtube_dl/extractor/sunporno.py +++ b/youtube_dl/extractor/sunporno.py @@ -39,10 +39,10 @@ class SunPornoIE(InfoExtractor):              r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False)          duration = parse_duration(self._search_regex( -            r'<span>Duration: (\d+:\d+)</span>', webpage, 'duration', fatal=False)) +            r'Duration:\s*(\d+:\d+)\s*<', webpage, 'duration', fatal=False))          view_count = int_or_none(self._html_search_regex( -            r'<span class="views">(\d+)</span>', webpage, 'view count', fatal=False)) +            r'class="views">\s*(\d+)\s*<', webpage, 'view count', fatal=False))          comment_count = int_or_none(self._html_search_regex(              r'(\d+)</b> Comments?', webpage, 'comment count', fatal=False)) diff --git a/youtube_dl/extractor/tapely.py b/youtube_dl/extractor/tapely.py new file mode 100644 index 000000000..77e056242 --- /dev/null +++ b/youtube_dl/extractor/tapely.py @@ -0,0 +1,104 @@ +# coding: utf-8 +from __future__ 
import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    clean_html, +    compat_urllib_request, +    float_or_none, +    parse_iso8601, +) + + +class TapelyIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?tape\.ly/(?P<id>[A-Za-z0-9\-_]+)(?:/(?P<songnr>\d+))?' +    _API_URL = 'http://tape.ly/showtape?id={0:}' +    _S3_SONG_URL = 'http://mytape.s3.amazonaws.com/{0:}' +    _SOUNDCLOUD_SONG_URL = 'http://api.soundcloud.com{0:}' +    _TESTS = [ +        { +            'url': 'http://tape.ly/my-grief-as-told-by-water', +            'info_dict': { +                'id': 23952, +                'title': 'my grief as told by water', +                'thumbnail': 're:^https?://.*\.png$', +                'uploader_id': 16484, +                'timestamp': 1411848286, +                'description': 'For Robin and Ponkers, whom the tides of life have taken out to sea.', +            }, +            'playlist_count': 13, +        }, +        { +            'url': 'http://tape.ly/my-grief-as-told-by-water/1', +            'md5': '79031f459fdec6530663b854cbc5715c', +            'info_dict': { +                'id': 258464, +                'title': 'Dreaming Awake  (My Brightest Diamond)', +                'ext': 'm4a', +            }, +        }, +    ] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        display_id = mobj.group('id') + +        playlist_url = self._API_URL.format(display_id) +        request = compat_urllib_request.Request(playlist_url) +        request.add_header('X-Requested-With', 'XMLHttpRequest') +        request.add_header('Accept', 'application/json') + +        playlist = self._download_json(request, display_id) + +        tape = playlist['tape'] + +        entries = [] +        for s in tape['songs']: +            song = s['song'] +            entry = { +                'id': song['id'], +                'duration': 
float_or_none(song.get('songduration'), 1000), +                'title': song['title'], +            } +            if song['source'] == 'S3': +                entry.update({ +                    'url': self._S3_SONG_URL.format(song['filename']), +                }) +                entries.append(entry) +            elif song['source'] == 'YT': +                self.to_screen('YouTube video detected') +                yt_id = song['filename'].replace('/youtube/', '') +                entry.update(self.url_result(yt_id, 'Youtube', video_id=yt_id)) +                entries.append(entry) +            elif song['source'] == 'SC': +                self.to_screen('SoundCloud song detected') +                sc_url = self._SOUNDCLOUD_SONG_URL.format(song['filename']) +                entry.update(self.url_result(sc_url, 'Soundcloud')) +                entries.append(entry) +            else: +                self.report_warning('Unknown song source: %s' % song['source']) + +        if mobj.group('songnr'): +            songnr = int(mobj.group('songnr')) - 1 +            try: +                return entries[songnr] +            except IndexError: +                raise ExtractorError( +                    'No song with index: %s' % mobj.group('songnr'), +                    expected=True) + +        return { +            '_type': 'playlist', +            'id': tape['id'], +            'display_id': display_id, +            'title': tape['name'], +            'entries': entries, +            'thumbnail': tape.get('image_url'), +            'description': clean_html(tape.get('subtext')), +            'like_count': tape.get('likescount'), +            'uploader_id': tape.get('user_id'), +            'timestamp': parse_iso8601(tape.get('published_at')), +        } diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index d5e28efad..cd4af96fd 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -65,6 +65,22 @@ class 
TEDIE(SubtitlesInfoExtractor):              'title': 'Who are the hackers?',          },          'playlist_mincount': 6, +    }, { +        # contains a youtube video +        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything', +        'add_ie': ['Youtube'], +        'info_dict': { +            'id': '_ZG8HBuDjgc', +            'ext': 'mp4', +            'title': 'Douglas Adams: Parrots the Universe and Everything', +            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af', +            'uploader': 'University of California Television (UCTV)', +            'uploader_id': 'UCtelevision', +            'upload_date': '20080522', +        }, +        'params': { +            'skip_download': True, +        },      }]      _NATIVE_FORMATS = { @@ -114,6 +130,13 @@ class TEDIE(SubtitlesInfoExtractor):          talk_info = self._extract_info(webpage)['talks'][0] +        if talk_info.get('external') is not None: +            self.to_screen('Found video from %s' % talk_info['external']['service']) +            return { +                '_type': 'url', +                'url': talk_info['external']['uri'], +            } +          formats = [{              'url': format_url,              'format_id': format_id, diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py new file mode 100644 index 000000000..db9788c18 --- /dev/null +++ b/youtube_dl/extractor/telecinco.py @@ -0,0 +1,19 @@ +#coding: utf-8 +from __future__ import unicode_literals + +from .mitele import MiTeleIE + + +class TelecincoIE(MiTeleIE): +    IE_NAME = 'telecinco.es' +    _VALID_URL = r'https?://www\.telecinco\.es/[^/]+/[^/]+/[^/]+/(?P<episode>.*?)\.html' + +    _TEST = { +        'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html', +        'info_dict': { +            'id': 'MDSVID20141015_0058', +            'ext': 'mp4', +            'title': 'Con Martín Berasategui, hacer un 
bacalao al ...', +            'duration': 662, +        }, +    } diff --git a/youtube_dl/extractor/theonion.py b/youtube_dl/extractor/theonion.py new file mode 100644 index 000000000..b65d8e03f --- /dev/null +++ b/youtube_dl/extractor/theonion.py @@ -0,0 +1,70 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class TheOnionIE(InfoExtractor): +    _VALID_URL = r'(?x)https?://(?:www\.)?theonion\.com/video/[^,]+,(?P<article_id>[0-9]+)/?' +    _TEST = { +        'url': 'http://www.theonion.com/video/man-wearing-mm-jacket-gods-image,36918/', +        'md5': '19eaa9a39cf9b9804d982e654dc791ee', +        'info_dict': { +            'id': '2133', +            'ext': 'mp4', +            'title': 'Man Wearing M&M Jacket Apparently Made In God\'s Image', +            'description': 'md5:cc12448686b5600baae9261d3e180910', +            'thumbnail': 're:^https?://.*\.jpg\?\d+$', +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        article_id = mobj.group('article_id') + +        webpage = self._download_webpage(url, article_id) + +        video_id = self._search_regex( +            r'"videoId":\s(\d+),', webpage, 'video ID') +        title = self._og_search_title(webpage) +        description = self._og_search_description(webpage) +        thumbnail = self._og_search_thumbnail(webpage) + +        sources = re.findall(r'<source src="([^"]+)" type="([^"]+)"', webpage) +        if not sources: +            raise ExtractorError( +                'No sources found for video %s' % video_id, expected=True) + +        formats = [] +        for src, type_ in sources: +            if type_ == 'video/mp4': +                formats.append({ +                    'format_id': 'mp4_sd', +                    'preference': 1, +                    'url': src, +                }) +            elif type_ == 'video/webm': +                
# coding: utf-8
from __future__ import unicode_literals

import json
import re

from .common import InfoExtractor
from ..utils import unified_strdate


class TheSixtyOneIE(InfoExtractor):
    """Extractor for single songs hosted on thesixtyone.com."""

    _VALID_URL = r'''(?x)https?://(?:www\.)?thesixtyone\.com/
        (?:.*?/)*
        (?:
            s|
            song/comments/list|
            song
        )/(?P<id>[A-Za-z0-9]+)/?$'''
    _SONG_URL_TEMPLATE = 'http://thesixtyone.com/s/{0:}'
    _SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}.thesixtyone.com/thesixtyone_production/audio/{0:}_stream'
    _THUMBNAIL_URL_TEMPLATE = '{photo_base_url:}_desktop'
    _TESTS = [
        {
            'url': 'http://www.thesixtyone.com/s/SrE3zD7s1jt/',
            'md5': '821cc43b0530d3222e3e2b70bb4622ea',
            'info_dict': {
                'id': 'SrE3zD7s1jt',
                'ext': 'mp3',
                'title': 'CASIO - Unicorn War Mixtape',
                'thumbnail': 're:^https?://.*_desktop$',
                'upload_date': '20071217',
                'duration': 3208,
            }
        },
        {
            'url': 'http://www.thesixtyone.com/song/comments/list/SrE3zD7s1jt',
            'only_matching': True,
        },
        {
            'url': 'http://www.thesixtyone.com/s/ULoiyjuJWli#/s/SrE3zD7s1jt/',
            'only_matching': True,
        },
        {
            'url': 'http://www.thesixtyone.com/#/s/SrE3zD7s1jt/',
            'only_matching': True,
        },
        {
            'url': 'http://www.thesixtyone.com/song/SrE3zD7s1jt/',
            'only_matching': True,
        },
    ]

    # Substitution table applied to each character of the song 'key':
    # the n-th character of the first string maps to the n-th character
    # of the second one (x->a, m->b, ... r->8, c->9).
    _DECODE_MAP = dict(zip('xmwqnpaheusioyrc', 'abcdef0123456789'))

    def _real_extract(self, url):
        """Return the info dict for the song referenced by *url*.

        Downloads the canonical song page, parses the JSON object that is
        keyed by the song id, and derives the stream URL from the decoded,
        reversed 'key' value of that object.
        """
        song_id = re.match(self._VALID_URL, url).group('id')

        page_url = self._SONG_URL_TEMPLATE.format(song_id)
        webpage = self._download_webpage(page_url, song_id)

        raw_song = self._search_regex(
            r'"%s":\s(\{.*?\})' % song_id, webpage, 'song_data')
        song = json.loads(raw_song)

        # Characters without an entry in the table are kept unchanged.
        decoded = [self._DECODE_MAP.get(char, char) for char in song['key']]
        decoded.reverse()
        audio_url = self._SONG_FILE_URL_TEMPLATE.format(
            ''.join(decoded), **song)

        return {
            'id': song_id,
            'title': '{artist:} - {name:}'.format(**song),
            'formats': [{
                'format_id': 'sd',
                'url': audio_url,
                'ext': 'mp3',
            }],
            'comment_count': song.get('comments_count'),
            'duration': song.get('play_time'),
            'like_count': song.get('score'),
            'thumbnail': self._THUMBNAIL_URL_TEMPLATE.format(**song),
            'upload_date': unified_strdate(song.get('publish_date')),
        }
a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 306fe8974..40c53ff17 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -4,9 +4,6 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..utils import ( -    ExtractorError, -)  class TumblrIE(InfoExtractor): @@ -18,7 +15,7 @@ class TumblrIE(InfoExtractor):              'id': '54196191430',              'ext': 'mp4',              'title': 'tatiana maslany news, Orphan Black || DVD extra - behind the scenes ↳...', -            'description': 'md5:dfac39636969fe6bf1caa2d50405f069', +            'description': 'md5:37db8211e40b50c7c44e95da14f630b7',              'thumbnail': 're:http://.*\.jpg',          }      }, { @@ -27,7 +24,7 @@ class TumblrIE(InfoExtractor):          'info_dict': {              'id': '90208453769',              'ext': 'mp4', -            'title': '5SOS STRUM ;)', +            'title': '5SOS STRUM ;]',              'description': 'md5:dba62ac8639482759c8eb10ce474586a',              'thumbnail': 're:http://.*\.jpg',          } @@ -41,18 +38,12 @@ class TumblrIE(InfoExtractor):          url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)          webpage = self._download_webpage(url, video_id) -        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id) -        video = re.search(re_video, webpage) -        if video is None: -            raise ExtractorError('Unable to extract video') -        video_url = video.group('video_url') -        ext = video.group('ext') - -        video_thumbnail = self._search_regex( -            r'posters.*?\[\\x22(.*?)\\x22', -            webpage, 'thumbnail', fatal=False)  # We pick the first poster -        if video_thumbnail: -            video_thumbnail = video_thumbnail.replace('\\\\/', '/') +        iframe_url = self._search_regex( +            
from __future__ import unicode_literals

import itertools
import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    parse_iso8601,
)


class TwitchIE(InfoExtractor):
    """Extractor for twitch.tv videos, chapters and whole channels.

    Every supported URL yields a playlist: videos and chapters are split
    into one entry per chunk, channels into one URL entry per video.
    """

    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?twitch\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _PAGE_LIMIT = 100
    _API_BASE = 'https://api.twitch.tv'
    _TESTS = [{
        'url': 'http://www.twitch.tv/riotgames/b/577357806',
        'info_dict': {
            'id': 'a577357806',
            'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
        },
        'playlist_mincount': 12,
    }, {
        'url': 'http://www.twitch.tv/acracingleague/c/5285812',
        'info_dict': {
            'id': 'c5285812',
            'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
        },
        'playlist_mincount': 3,
    }, {
        'url': 'http://www.twitch.tv/vanillatv',
        'info_dict': {
            'id': 'vanillatv',
            'title': 'VanillaTV',
        },
        'playlist_mincount': 412,
    }]

    def _handle_error(self, response):
        """Raise ExtractorError if the JSON *response* carries an 'error' key.

        Non-dict responses are ignored.
        """
        if not isinstance(response, dict):
            return
        error = response.get('error')
        if error:
            raise ExtractorError(
                '%s returned error: %s - %s' % (self.IE_NAME, error, response.get('message')),
                expected=True)

    def _download_json(self, url, video_id, note='Downloading JSON metadata'):
        """Download JSON like the base class, then check it for an API error."""
        response = super(TwitchIE, self)._download_json(url, video_id, note)
        self._handle_error(response)
        return response

    def _extract_media(self, item, item_id):
        """Return a playlist with one entry per chunk of a video or chapter.

        *item* is the one-letter id prefix ('a' for a video, 'c' for a
        chapter) and *item_id* the id taken from the URL.
        """
        ITEMS = {
            'a': 'video',
            'c': 'chapter',
        }
        info = self._extract_info(self._download_json(
            '%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
            'Downloading %s info JSON' % ITEMS[item]))
        response = self._download_json(
            '%s/api/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
            'Downloading %s playlist JSON' % ITEMS[item])
        entries = []
        chunks = response['chunks']
        # keys() and values() of the same unmodified dict iterate in the
        # same order, so qualities[i] names the i-th element of a fragment.
        qualities = list(chunks.keys())
        for num, fragment in enumerate(zip(*chunks.values()), start=1):
            formats = []
            for fmt_num, fragment_fmt in enumerate(fragment):
                format_id = qualities[fmt_num]
                fmt = {
                    'url': fragment_fmt['url'],
                    'format_id': format_id,
                    'quality': 1 if format_id == 'live' else 0,
                }
                # Quality names such as "720p" carry the height.
                m = re.search(r'^(?P<height>\d+)[Pp]', format_id)
                if m:
                    fmt['height'] = int(m.group('height'))
                formats.append(fmt)
            self._sort_formats(formats)
            entry = dict(info)
            entry['id'] = '%s_%d' % (entry['id'], num)
            entry['title'] = '%s part %d' % (entry['title'], num)
            entry['formats'] = formats
            entries.append(entry)
        return self.playlist_result(entries, info['id'], info['title'])

    def _extract_info(self, info):
        """Map a kraken video JSON object to the common info dict fields.

        Only '_id' and 'title' are required; every other field is optional
        metadata and becomes None when the API response lacks it, instead
        of aborting the whole extraction with a KeyError.
        """
        channel = info.get('channel') or {}
        recorded_at = info.get('recorded_at')
        return {
            'id': info['_id'],
            'title': info['title'],
            'description': info.get('description'),
            'duration': info.get('length'),
            'thumbnail': info.get('preview'),
            'uploader': channel.get('display_name'),
            'uploader_id': channel.get('name'),
            'timestamp': parse_iso8601(recorded_at) if recorded_at else None,
            'view_count': info.get('views'),
        }

    def _real_extract(self, url):
        """Dispatch on the URL kind: chapter, video or channel."""
        mobj = re.match(self._VALID_URL, url)
        if mobj.group('chapterid'):
            return self._extract_media('c', mobj.group('chapterid'))
        elif mobj.group('videoid'):
            return self._extract_media('a', mobj.group('videoid'))
        elif mobj.group('channelid'):
            channel_id = mobj.group('channelid')
            info = self._download_json(
                '%s/kraken/channels/%s' % (self._API_BASE, channel_id),
                channel_id, 'Downloading channel info JSON')
            channel_name = info.get('display_name') or info.get('name')
            entries = []
            offset = 0
            limit = self._PAGE_LIMIT
            # Page through the channel's videos until an empty page comes back.
            for counter in itertools.count(1):
                response = self._download_json(
                    '%s/kraken/channels/%s/videos/?offset=%d&limit=%d'
                    % (self._API_BASE, channel_id, offset, limit),
                    channel_id, 'Downloading channel videos JSON page %d' % counter)
                videos = response['videos']
                if not videos:
                    break
                entries.extend([self.url_result(video['url'], 'Twitch') for video in videos])
                offset += limit
            return self.playlist_result(entries, channel_id, channel_name)
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    float_or_none,
    int_or_none,
)


class ViddlerIE(InfoExtractor):
    """Extractor for viddler.com videos, backed by the playback-details API."""

    _VALID_URL = r'https?://(?:www\.)?viddler\.com/(?:v|embed|player)/(?P<id>[a-z0-9]+)'
    _TEST = {
        "url": "http://www.viddler.com/v/43903784",
        'md5': 'ae43ad7cb59431ce043f0ff7fa13cbf4',
        'info_dict': {
            'id': '43903784',
            'ext': 'mp4',
            "title": "Video Made Easy",
            'description': 'You don\'t need to be a professional to make high-quality video content. Viddler provides some quick and easy tips on how to produce great video content with limited resources. ',
            "uploader": "viddler",
            'timestamp': 1335371429,
            'upload_date': '20120425',
            "duration": 100.89,
            'thumbnail': 're:^https?://.*\.jpg$',
            'view_count': int,
            'categories': ['video content', 'high quality video', 'video made easy', 'how to produce video with limited resources', 'viddler'],
        }
    }

    def _real_extract(self, url):
        """Return the info dict for the video referenced by *url*.

        Every file whose status is 'ready' (or unspecified) contributes its
        primary URL plus, when present, a CDN and an HTML5 variant, each
        with its own format id suffix and source preference.
        """
        video_id = self._match_id(url)

        json_url = (
            'http://api.viddler.com/api/v2/viddler.videos.getPlaybackDetails.json?video_id=%s&key=v0vhrt7bg2xq1vyxhkct' %
            video_id)
        data = self._download_json(json_url, video_id)['video']

        # (key in the file dict, format id suffix, source preference)
        alternates = (
            ('cdn_url', '-cdn', 1),
            ('html5_video_source', '-html5', 0),
        )

        formats = []
        for filed in data['files']:
            if filed.get('status', 'ready') != 'ready':
                continue

            primary = {
                'format_id': filed['profile_id'],
                'format_note': filed['profile_name'],
                'url': self._proto_relative_url(filed['url']),
                'width': int_or_none(filed.get('width')),
                'height': int_or_none(filed.get('height')),
                'filesize': int_or_none(filed.get('size')),
                'ext': filed.get('ext'),
                'source_preference': -1,
            }
            formats.append(primary)

            for key, suffix, preference in alternates:
                alt_url = filed.get(key)
                if not alt_url:
                    continue
                variant = primary.copy()
                variant['url'] = self._proto_relative_url(alt_url)
                variant['format_id'] = filed['profile_id'] + suffix
                variant['source_preference'] = preference
                formats.append(variant)
        self._sort_formats(formats)

        categories = [
            tag['text'] for tag in data.get('tags', []) if 'text' in tag]

        return {
            '_type': 'video',
            'id': video_id,
            'title': data['title'],
            'formats': formats,
            'description': data.get('description'),
            'timestamp': int_or_none(data.get('upload_time')),
            'thumbnail': self._proto_relative_url(data.get('thumbnail_url')),
            'uploader': data.get('author'),
            'duration': float_or_none(data.get('length')),
            'view_count': int_or_none(data.get('view_count')),
            'categories': categories,
        }
# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor


class VidziIE(InfoExtractor):
    """Extractor for videos hosted on vidzi.tv."""

    _VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?P<id>\w+)'
    _TEST = {
        'url': 'http://vidzi.tv/cghql9yq6emu.html',
        'md5': '4f16c71ca0c8c8635ab6932b5f3f1660',
        'info_dict': {
            'id': 'cghql9yq6emu',
            'ext': 'mp4',
            'title': 'youtube-dl test video  1\\\\2\'3/4<5\\\\6ä7↭',
        },
    }

    def _real_extract(self, url):
        """Return id, title and direct media URL scraped from the video page."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # The media URL is the "file" entry of an inline player object.
        media_url = self._html_search_regex(
            r'{\s*file\s*:\s*"([^"]+)"\s*}', webpage, 'video url')
        page_title = self._html_search_regex(
            r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title')

        return {
            'id': video_id,
            'title': page_title,
            'url': media_url,
        }
\ No newline at end of file diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index d2c36b58a..07959d3fe 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -56,7 +56,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):      # _VALID_URL matches Vimeo URLs      _VALID_URL = r'''(?x) -        (?P<proto>(?:https?:)?//)? +        https?://          (?:(?:www|(?P<player>player))\.)?          vimeo(?P<pro>pro)?\.com/          (?!channels/[^/?#]+/?(?:$|[?#])|album/) @@ -157,6 +157,18 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):                  'duration': 62,              }          }, +        { +            # from https://www.ouya.tv/game/Pier-Solar-and-the-Great-Architects/ +            'url': 'https://player.vimeo.com/video/98044508', +            'note': 'The js code contains assignments to the same variable as the config', +            'info_dict': { +                'id': '98044508', +                'ext': 'mp4', +                'title': 'Pier Solar OUYA Official Trailer', +                'uploader': 'Tulio Gonçalves', +                'uploader_id': 'user28849593', +            }, +        },      ]      def _verify_video_password(self, url, video_id, webpage): @@ -244,7 +256,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):                  # We try to find out to which variable is assigned the config dic                  m_variable_name = re.search('(\w)\.video\.id', webpage)                  if m_variable_name is not None: -                    config_re = r'%s=({.+?});' % re.escape(m_variable_name.group(1)) +                    config_re = r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))                  else:                      config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});']                  config = self._search_regex(config_re, webpage, 'info section', diff --git a/youtube_dl/extractor/vrt.py 
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import float_or_none


class VRTIE(InfoExtractor):
    """Extractor for videos on deredactie.be, sporza.be and cobra.be."""

    _VALID_URL = r'https?://(?:deredactie|sporza|cobra)\.be/cm/(?:[^/]+/)+(?P<id>[^/]+)/*'
    _TESTS = [
        # deredactie.be
        {
            'url': 'http://deredactie.be/cm/vrtnieuws/videozone/programmas/journaal/EP_141025_JOL',
            'md5': '4cebde1eb60a53782d4f3992cbd46ec8',
            'info_dict': {
                'id': '2129880',
                'ext': 'flv',
                'title': 'Het journaal L - 25/10/14',
                'description': None,
                'timestamp': 1414271750.949,
                'upload_date': '20141025',
                'duration': 929,
            }
        },
        # sporza.be
        {
            'url': 'http://sporza.be/cm/sporza/videozone/programmas/extratime/EP_141020_Extra_time',
            'md5': '11f53088da9bf8e7cfc42456697953ff',
            'info_dict': {
                'id': '2124639',
                'ext': 'flv',
                'title': 'Bekijk Extra Time van 20 oktober',
                'description': 'md5:83ac5415a4f1816c6a93f8138aef2426',
                'timestamp': 1413835980.560,
                'upload_date': '20141020',
                'duration': 3238,
            }
        },
        # cobra.be
        {
            'url': 'http://cobra.be/cm/cobra/videozone/rubriek/film-videozone/141022-mv-ellis-cafecorsari',
            'md5': '78a2b060a5083c4f055449a72477409d',
            'info_dict': {
                'id': '2126050',
                'ext': 'flv',
                'title': 'Bret Easton Ellis in Café Corsari',
                'description': 'md5:f699986e823f32fd6036c1855a724ee9',
                'timestamp': 1413967500.494,
                'upload_date': '20141022',
                'duration': 661,
            }
        },
    ]

    def _real_extract(self, url):
        """Return the info dict for the video embedded in the page at *url*.

        Formats come from the page's data-video-* attributes: an HLS
        (m3u8) stream from the iphone server/path pair and an f4m
        manifest derived from data-video-src, whichever are present.
        """
        display_id = self._match_id(url)

        webpage = self._download_webpage(url, display_id)

        # The search is non-fatal, so it may yield nothing; fall back to
        # the id taken from the URL so that the mandatory 'id' field and
        # the format extractors below never receive None.
        video_id = self._search_regex(
            r'data-video-id="([^"]+)_[^"]+"', webpage, 'video id',
            fatal=False) or display_id

        formats = []
        mobj = re.search(
            r'data-video-iphone-server="(?P<server>[^"]+)"\s+data-video-iphone-path="(?P<path>[^"]+)"',
            webpage)
        if mobj:
            formats.extend(self._extract_m3u8_formats(
                '%s/%s' % (mobj.group('server'), mobj.group('path')),
                video_id, 'mp4'))
        mobj = re.search(r'data-video-src="(?P<src>[^"]+)"', webpage)
        if mobj:
            formats.extend(self._extract_f4m_formats(
                '%s/manifest.f4m' % mobj.group('src'), video_id))
        self._sort_formats(formats)

        title = self._og_search_title(webpage)
        description = self._og_search_description(webpage, default=None)
        thumbnail = self._og_search_thumbnail(webpage)
        # Both page attributes are scaled down by 1000 (second argument of
        # float_or_none), which keeps a fractional part; presumably the
        # page stores them in milliseconds - TODO confirm.
        timestamp = float_or_none(self._search_regex(
            r'data-video-sitestat-pubdate="(\d+)"', webpage, 'timestamp', fatal=False), 1000)
        duration = float_or_none(self._search_regex(
            r'data-video-duration="(\d+)"', webpage, 'duration', fatal=False), 1000)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'duration': duration,
            'formats': formats,
        }
# coding: utf-8
from __future__ import unicode_literals

import re

from .subtitles import SubtitlesInfoExtractor
from ..utils import (
    xpath_text,
    int_or_none,
)


class WallaIE(SubtitlesInfoExtractor):
    """Extractor for vod.walla.co.il videos (RTMP streams with subtitles)."""

    _VALID_URL = r'http://vod\.walla\.co\.il/[^/]+/(?P<id>\d+)/(?P<display_id>.+)'
    _TEST = {
        'url': 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one',
        'info_dict': {
            'id': '2642630',
            'display_id': 'one-direction-all-for-one',
            'ext': 'flv',
            'title': 'וואן דיירקשן: ההיסטריה',
            'description': 'md5:de9e2512a92442574cdb0913c49bc4d8',
            'thumbnail': 're:^https?://.*\.jpg',
            'duration': 3600,
        },
        'params': {
            # rtmp download
            'skip_download': True,
        }
    }

    # Maps the subtitle title found in the playlist XML to a language code;
    # titles without an entry are used as-is.
    _SUBTITLE_LANGS = {
        'עברית': 'heb',
    }

    def _real_extract(self, url):
        """Return the info dict for the video referenced by *url*.

        Metadata, subtitles and the available qualities are read from the
        playlist XML of the video. When the 'listsubtitles' parameter is
        set, the available subtitles are listed and nothing is returned.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        display_id = mobj.group('display_id')

        video = self._download_xml(
            'http://video2.walla.co.il/?w=null/null/%s/@@/video/flv_pl' % video_id,
            display_id)

        item = video.find('./items/item')

        title = xpath_text(item, './title', 'title')
        description = xpath_text(item, './synopsis', 'description')
        thumbnail = xpath_text(item, './preview_pic', 'thumbnail')
        duration = int_or_none(xpath_text(item, './duration', 'duration'))

        subtitles = {}
        for subtitle in item.findall('./subtitles/subtitle'):
            lang = xpath_text(subtitle, './title')
            subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = xpath_text(subtitle, './src')

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, subtitles)
            return

        subtitles = self.extract_subtitles(video_id, subtitles)

        formats = []
        for quality in item.findall('./qualities/quality'):
            # Look the quality title up once and reuse it for both the
            # format id and the height detection.
            format_id = xpath_text(quality, './title')
            fmt = {
                'url': 'rtmp://wafla.walla.co.il/vod',
                'play_path': xpath_text(quality, './src'),
                'player_url': 'http://isc.walla.co.il/w9/swf/video_swf/vod/WallaMediaPlayerAvod.swf',
                'page_url': url,
                'ext': 'flv',
                'format_id': format_id,
            }
            # NOTE(review): xpath_text presumably yields None for a quality
            # without a title - confirm; re.search must not receive None.
            m = re.search(r'^(?P<height>\d+)[Pp]', format_id or '')
            if m:
                fmt['height'] = int(m.group('height'))
            formats.append(fmt)
        self._sort_formats(formats)

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
            'subtitles': subtitles,
        }
r'(?P<url>(?P<host>https?://(?:[a-zA-Z]{2}\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?P<display_id>.+?)-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)'      _TESTS = [          {              'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', @@ -25,6 +27,7 @@ class YahooIE(InfoExtractor):                  'ext': 'mp4',                  'title': 'Julian Smith & Travis Legg Watch Julian Smith',                  'description': 'Julian and Travis watch Julian Smith', +                'duration': 6863,              },          },          { @@ -34,7 +37,8 @@ class YahooIE(InfoExtractor):                  'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9',                  'ext': 'mp4',                  'title': 'Codefellas - The Cougar Lies with Spanish Moss', -                'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?', +                'description': 'md5:66b627ab0a282b26352136ca96ce73c1', +                'duration': 151,              },          },          { @@ -45,15 +49,95 @@ class YahooIE(InfoExtractor):                  'ext': 'mp4',                  'title': "Yahoo Saves 'Community'",                  'description': 'md5:4d4145af2fd3de00cbb6c1d664105053', +                'duration': 170,              }          }, +        { +            'url': 'https://tw.screen.yahoo.com/taipei-opinion-poll/選情站報-街頭民調-台北市篇-102823042.html', +            'md5': '92a7fdd8a08783c68a174d7aa067dde8', +            'info_dict': { +                'id': '7a23b569-7bea-36cb-85b9-bd5301a0a1fb', +                'ext': 'mp4', +                'title': '選情站報 街頭民調 台北市篇', +                'description': '選情站報 街頭民調 台北市篇', +                'duration': 429, +            } +        }, +        { +            'url': 
'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html', +            'md5': '0b51660361f0e27c9789e7037ef76f4b', +            'info_dict': { +                'id': 'b3affa53-2e14-3590-852b-0e0db6cd1a58', +                'ext': 'mp4', +                'title': 'Cute Raccoon Freed From Drain\u00a0Using Angle Grinder', +                'description': 'md5:f66c890e1490f4910a9953c941dee944', +                'duration': 97, +            } +        }, +        { +            'url': 'https://ca.sports.yahoo.com/video/program-makes-hockey-more-affordable-013127711.html', +            'md5': '57e06440778b1828a6079d2f744212c4', +            'info_dict': { +                'id': 'c9fa2a36-0d4d-3937-b8f6-cc0fb1881e73', +                'ext': 'mp4', +                'title': 'Program that makes hockey more affordable not offered in Manitoba', +                'description': 'md5:c54a609f4c078d92b74ffb9bf1f496f4', +                'duration': 121, +            } +        }, { +            'url': 'https://ca.finance.yahoo.com/news/20-most-valuable-brands-world-112600775.html', +            'md5': '3e401e4eed6325aa29d9b96125fd5b4f', +            'info_dict': { +                'id': 'c1b4c09c-8ed8-3b65-8b05-169c55358a83', +                'ext': 'mp4', +                'title': "Apple Is The World's Most Valuable Brand", +                'description': 'md5:73eabc1a11c6f59752593b2ceefa1262', +                'duration': 21, +            } +        }, { +            'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html', +            'md5': '67010fdf3a08d290e060a4dd96baa07b', +            'info_dict': { +                'id': 'f885cf7f-43d4-3450-9fac-46ac30ece521', +                'ext': 'mp4', +                'title': 'China Moses Is Crazy About the Blues', +                'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0', +                'duration': 128, +            } +        }, { +            'url': 
'https://in.lifestyle.yahoo.com/video/connect-dots-dark-side-virgo-090247395.html', +            'md5': 'd9a083ccf1379127bf25699d67e4791b', +            'info_dict': { +                'id': '52aeeaa3-b3d1-30d8-9ef8-5d0cf05efb7c', +                'ext': 'mp4', +                'title': 'Connect the Dots: Dark Side of Virgo', +                'description': 'md5:1428185051cfd1949807ad4ff6d3686a', +                'duration': 201, +            } +        }, { +            'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html', +            'only_matching': True, +        }      ]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        display_id = mobj.group('display_id')          url = mobj.group('url') -        webpage = self._download_webpage(url, video_id) +        host = mobj.group('host') +        webpage = self._download_webpage(url, display_id) + +        # Look for iframed media first +        iframe_m = re.search(r'<iframe[^>]+src="(/video/.+?-\d+\.html\?format=embed.*?)"', webpage) +        if iframe_m: +            iframepage = self._download_webpage( +                host + iframe_m.group(1), display_id, 'Downloading iframe webpage') +            items_json = self._search_regex( +                r'mediaItems: (\[.+?\])$', iframepage, 'items', flags=re.MULTILINE, default=None) +            if items_json: +                items = json.loads(items_json) +                video_id = items[0]['id'] +                return self._get_info(video_id, display_id, webpage)          items_json = self._search_regex(              r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE, @@ -64,20 +148,22 @@ class YahooIE(InfoExtractor):                  r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"',                  r'"first_videoid"\s*:\s*"([^"]+)"',              ] -            long_id = self._search_regex(CONTENT_ID_REGEXES, 
webpage, 'content ID') -            video_id = long_id +            video_id = self._search_regex(CONTENT_ID_REGEXES, webpage, 'content ID')          else:              items = json.loads(items_json)              info = items['mediaItems']['query']['results']['mediaObj'][0]              # The 'meta' field is not always in the video webpage, we request it              # from another page -            long_id = info['id'] -        return self._get_info(long_id, video_id, webpage) +            video_id = info['id'] +        return self._get_info(video_id, display_id, webpage) -    def _get_info(self, long_id, video_id, webpage): +    def _get_info(self, video_id, display_id, webpage): +        region = self._search_regex( +            r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"', +            webpage, 'region', fatal=False, default='US')          query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"' -                 ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"' -                 ' AND protocol="http"' % long_id) +                 ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="%s"' +                 ' AND protocol="http"' % (video_id, region))          data = compat_urllib_parse.urlencode({              'q': query,              'env': 'prod', @@ -85,9 +171,17 @@ class YahooIE(InfoExtractor):          })          query_result = self._download_json(              'http://video.query.yahoo.com/v1/public/yql?' 
+ data, -            video_id, 'Downloading video info') +            display_id, 'Downloading video info') +          info = query_result['query']['results']['mediaObj'][0] -        meta = info['meta'] +        meta = info.get('meta') + +        if not meta: +            msg = info['status'].get('msg') +            if msg: +                raise ExtractorError( +                    '%s returned error: %s' % (self.IE_NAME, msg), expected=True) +            raise ExtractorError('Unable to extract media object meta')          formats = []          for s in info['streams']: @@ -114,36 +208,15 @@ class YahooIE(InfoExtractor):          return {              'id': video_id, +            'display_id': display_id,              'title': meta['title'],              'formats': formats,              'description': clean_html(meta['description']),              'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage), +            'duration': int_or_none(meta.get('duration')),          } -class YahooNewsIE(YahooIE): -    IE_NAME = 'yahoo:news' -    _VALID_URL = r'http://news\.yahoo\.com/video/.*?-(?P<id>\d*?)\.html' - -    _TESTS = [{ -        'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html', -        'md5': '67010fdf3a08d290e060a4dd96baa07b', -        'info_dict': { -            'id': '104538833', -            'ext': 'mp4', -            'title': 'China Moses Is Crazy About the Blues', -            'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0', -        }, -    }] - -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') -        webpage = self._download_webpage(url, video_id) -        long_id = self._search_regex(r'contentId: \'(.+?)\',', webpage, 'long id') -        return self._get_info(long_id, video_id, webpage) - -  class YahooSearchIE(SearchInfoExtractor):      IE_DESC = 'Yahoo screen search'      _MAX_RESULTS = 1000 diff --git 
a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9041cfa87..4ab56e0ac 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -191,8 +191,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor):      def _real_initialize(self):          if self._downloader is None:              return -        if not self._set_language(): -            return +        if self._get_login_info()[0] is not None: +            if not self._set_language(): +                return          if not self._login():              return          self._confirm_age() @@ -286,6 +287,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},          '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},          '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, +        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},          '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, @@ -938,7 +940,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):              raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')          # Look for the DASH manifest -        if (self._downloader.params.get('youtube_include_dash_manifest', False)): +     
   if self._downloader.params.get('youtube_include_dash_manifest', True):              try:                  # The DASH manifest used needs to be the one from the original video_webpage.                  # The one found in get_video_info seems to be using different signatures. diff --git a/youtube_dl/options.py b/youtube_dl/options.py index e6f9f33a2..98e20d549 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -77,7 +77,8 @@ def parseOpts(overrideArguments=None):          if len(opts) > 1:              opts.insert(1, ', ') -        if option.takes_value(): opts.append(' %s' % option.metavar) +        if option.takes_value(): +            opts.append(' %s' % option.metavar)          return "".join(opts) @@ -89,68 +90,69 @@ def parseOpts(overrideArguments=None):          for private_opt in ['-p', '--password', '-u', '--username', '--video-password']:              try:                  i = opts.index(private_opt) -                opts[i+1] = 'PRIVATE' +                opts[i + 1] = 'PRIVATE'              except ValueError:                  pass          return opts -    max_width = 80 -    max_help_position = 80 -      # No need to wrap help messages if we're on a wide console      columns = get_term_width() -    if columns: max_width = columns +    max_width = columns if columns else 80 +    max_help_position = 80      fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)      fmt.format_option_strings = _format_option_string      kw = { -        'version'   : __version__, -        'formatter' : fmt, -        'usage' : '%prog [options] url [url...]', -        'conflict_handler' : 'resolve', +        'version': __version__, +        'formatter': fmt, +        'usage': '%prog [options] url [url...]', +        'conflict_handler': 'resolve',      }      parser = optparse.OptionParser(**kw) -    # option groups -    general        = optparse.OptionGroup(parser, 'General Options') -    selection      = 
optparse.OptionGroup(parser, 'Video Selection') -    authentication = optparse.OptionGroup(parser, 'Authentication Options') -    video_format   = optparse.OptionGroup(parser, 'Video Format Options') -    subtitles      = optparse.OptionGroup(parser, 'Subtitle Options') -    downloader     = optparse.OptionGroup(parser, 'Download Options') -    postproc       = optparse.OptionGroup(parser, 'Post-processing Options') -    filesystem     = optparse.OptionGroup(parser, 'Filesystem Options') -    workarounds    = optparse.OptionGroup(parser, 'Workarounds') -    verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options') - -    general.add_option('-h', '--help', -            action='help', help='print this help text and exit') -    general.add_option('-v', '--version', -            action='version', help='print program version and exit') -    general.add_option('-U', '--update', -            action='store_true', dest='update_self', help='update this program to latest version. 
Make sure that you have sufficient permissions (run with sudo if needed)') -    general.add_option('-i', '--ignore-errors', -            action='store_true', dest='ignoreerrors', help='continue on download errors, for example to skip unavailable videos in a playlist', default=False) -    general.add_option('--abort-on-error', -            action='store_false', dest='ignoreerrors', -            help='Abort downloading of further videos (in the playlist or the command line) if an error occurs') -    general.add_option('--dump-user-agent', -            action='store_true', dest='dump_user_agent', -            help='display the current browser identification', default=False) -    general.add_option('--list-extractors', -            action='store_true', dest='list_extractors', -            help='List all supported extractors and the URLs they would handle', default=False) -    general.add_option('--extractor-descriptions', -            action='store_true', dest='list_extractor_descriptions', -            help='Output descriptions of all supported extractors', default=False) +    general = optparse.OptionGroup(parser, 'General Options') +    general.add_option( +        '-h', '--help', +        action='help', +        help='print this help text and exit') +    general.add_option( +        '-v', '--version', +        action='version', +        help='print program version and exit') +    general.add_option( +        '-U', '--update', +        action='store_true', dest='update_self', +        help='update this program to latest version. 
Make sure that you have sufficient permissions (run with sudo if needed)') +    general.add_option( +        '-i', '--ignore-errors', +        action='store_true', dest='ignoreerrors', default=False, +        help='continue on download errors, for example to skip unavailable videos in a playlist') +    general.add_option( +        '--abort-on-error', +        action='store_false', dest='ignoreerrors', +        help='Abort downloading of further videos (in the playlist or the command line) if an error occurs') +    general.add_option( +        '--dump-user-agent', +        action='store_true', dest='dump_user_agent', default=False, +        help='display the current browser identification')      general.add_option( -        '--proxy', dest='proxy', default=None, metavar='URL', +        '--list-extractors', +        action='store_true', dest='list_extractors', default=False, +        help='List all supported extractors and the URLs they would handle') +    general.add_option( +        '--extractor-descriptions', +        action='store_true', dest='list_extractor_descriptions', default=False, +        help='Output descriptions of all supported extractors') +    general.add_option( +        '--proxy', dest='proxy', +        default=None, metavar='URL',          help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection')      general.add_option( -        '--socket-timeout', dest='socket_timeout', -        type=float, default=None, help=u'Time to wait before giving up, in seconds') +        '--socket-timeout', +        dest='socket_timeout', type=float, default=None, +        help='Time to wait before giving up, in seconds')      general.add_option(          '--default-search',          dest='default_search', metavar='PREFIX', @@ -159,7 +161,13 @@ def parseOpts(overrideArguments=None):          '--ignore-config',          action='store_true',          help='Do not read configuration files. 
When given in the global configuration file /etc/youtube-dl.conf: do not read the user configuration in ~/.config/youtube-dl.conf (%APPDATA%/youtube-dl/config.txt on Windows)') +    general.add_option( +        '--flat-playlist', +        action='store_const', dest='extract_flat', const='in_playlist', +        default=False, +        help='Do not extract the videos of a playlist, only list them.') +    selection = optparse.OptionGroup(parser, 'Video Selection')      selection.add_option(          '--playlist-start',          dest='playliststart', metavar='NUMBER', default=1, type=int, @@ -168,245 +176,375 @@ def parseOpts(overrideArguments=None):          '--playlist-end',          dest='playlistend', metavar='NUMBER', default=None, type=int,          help='playlist video to end at (default is last)') -    selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)') -    selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)') -    selection.add_option('--max-downloads', metavar='NUMBER', -                         dest='max_downloads', type=int, default=None, -                         help='Abort after downloading NUMBER files') -    selection.add_option('--min-filesize', metavar='SIZE', dest='min_filesize', help="Do not download any videos smaller than SIZE (e.g. 50k or 44.6m)", default=None) -    selection.add_option('--max-filesize', metavar='SIZE', dest='max_filesize', help="Do not download any videos larger than SIZE (e.g. 
50k or 44.6m)", default=None) -    selection.add_option('--date', metavar='DATE', dest='date', help='download only videos uploaded in this date', default=None)      selection.add_option( -        '--datebefore', metavar='DATE', dest='datebefore', default=None, +        '--match-title', +        dest='matchtitle', metavar='REGEX', +        help='download only matching titles (regex or caseless sub-string)') +    selection.add_option( +        '--reject-title', +        dest='rejecttitle', metavar='REGEX', +        help='skip download for matching titles (regex or caseless sub-string)') +    selection.add_option( +        '--max-downloads', +        dest='max_downloads', metavar='NUMBER', type=int, default=None, +        help='Abort after downloading NUMBER files') +    selection.add_option( +        '--min-filesize', +        metavar='SIZE', dest='min_filesize', default=None, +        help='Do not download any videos smaller than SIZE (e.g. 50k or 44.6m)') +    selection.add_option( +        '--max-filesize', +        metavar='SIZE', dest='max_filesize', default=None, +        help='Do not download any videos larger than SIZE (e.g. 50k or 44.6m)') +    selection.add_option( +        '--date', +        metavar='DATE', dest='date', default=None, +        help='download only videos uploaded in this date') +    selection.add_option( +        '--datebefore', +        metavar='DATE', dest='datebefore', default=None,          help='download only videos uploaded on or before this date (i.e. inclusive)')      selection.add_option( -        '--dateafter', metavar='DATE', dest='dateafter', default=None, +        '--dateafter', +        metavar='DATE', dest='dateafter', default=None,          help='download only videos uploaded on or after this date (i.e. 
inclusive)')      selection.add_option( -        '--min-views', metavar='COUNT', dest='min_views', -        default=None, type=int, -        help="Do not download any videos with less than COUNT views",) +        '--min-views', +        metavar='COUNT', dest='min_views', default=None, type=int, +        help='Do not download any videos with less than COUNT views',)      selection.add_option( -        '--max-views', metavar='COUNT', dest='max_views', -        default=None, type=int, -        help="Do not download any videos with more than COUNT views",) -    selection.add_option('--no-playlist', action='store_true', dest='noplaylist', help='download only the currently playing video', default=False) -    selection.add_option('--age-limit', metavar='YEARS', dest='age_limit', -                         help='download only videos suitable for the given age', -                         default=None, type=int) -    selection.add_option('--download-archive', metavar='FILE', -                         dest='download_archive', -                         help='Download only videos not listed in the archive file. 
Record the IDs of all downloaded videos in it.') +        '--max-views', +        metavar='COUNT', dest='max_views', default=None, type=int, +        help='Do not download any videos with more than COUNT views')      selection.add_option( -        '--include-ads', dest='include_ads', -        action='store_true', -        help='Download advertisements as well (experimental)') +        '--no-playlist', +        action='store_true', dest='noplaylist', default=False, +        help='download only the currently playing video') +    selection.add_option( +        '--age-limit', +        metavar='YEARS', dest='age_limit', default=None, type=int, +        help='download only videos suitable for the given age') +    selection.add_option( +        '--download-archive', metavar='FILE', +        dest='download_archive', +        help='Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it.')      selection.add_option( -        '--youtube-include-dash-manifest', action='store_true', -        dest='youtube_include_dash_manifest', default=False, -        help='Try to download the DASH manifest on YouTube videos (experimental)') - -    authentication.add_option('-u', '--username', -            dest='username', metavar='USERNAME', help='account username') -    authentication.add_option('-p', '--password', -            dest='password', metavar='PASSWORD', help='account password') -    authentication.add_option('-2', '--twofactor', -            dest='twofactor', metavar='TWOFACTOR', help='two-factor auth code') -    authentication.add_option('-n', '--netrc', -            action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False) -    authentication.add_option('--video-password', -            dest='videopassword', metavar='PASSWORD', help='video password (vimeo, smotri)') - - -    video_format.add_option('-f', '--format', -            action='store', dest='format', metavar='FORMAT', default=None, -            
help='video format code, specify the order of preference using slashes: -f 22/17/18 .  -f mp4 , -f m4a and  -f flv  are also supported. You can also use the special names "best", "bestvideo", "bestaudio", "worst", "worstvideo" and "worstaudio". By default, youtube-dl will pick the best quality. Use commas to download multiple audio formats, such as  -f  136/137/mp4/bestvideo,140/m4a/bestaudio') -    video_format.add_option('--all-formats', -            action='store_const', dest='format', help='download all available video formats', const='all') -    video_format.add_option('--prefer-free-formats', -            action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested') -    video_format.add_option('--max-quality', -            action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download') -    video_format.add_option('-F', '--list-formats', -            action='store_true', dest='listformats', help='list all available formats') - -    subtitles.add_option('--write-sub', '--write-srt', -            action='store_true', dest='writesubtitles', -            help='write subtitle file', default=False) -    subtitles.add_option('--write-auto-sub', '--write-automatic-sub', -            action='store_true', dest='writeautomaticsub', -            help='write automatic subtitle file (youtube only)', default=False) -    subtitles.add_option('--all-subs', -            action='store_true', dest='allsubtitles', -            help='downloads all the available subtitles of the video', default=False) -    subtitles.add_option('--list-subs', -            action='store_true', dest='listsubtitles', -            help='lists all available subtitles for the video', default=False) -    subtitles.add_option('--sub-format', -            action='store', dest='subtitlesformat', metavar='FORMAT', -            help='subtitle format (default=srt) ([sbv/vtt] youtube only)', default='srt') -  
  subtitles.add_option('--sub-lang', '--sub-langs', '--srt-lang', -            action='callback', dest='subtitleslangs', metavar='LANGS', type='str', -            default=[], callback=_comma_separated_values_options_callback, -            help='languages of the subtitles to download (optional) separated by commas, use IETF language tags like \'en,pt\'') - -    downloader.add_option('-r', '--rate-limit', -            dest='ratelimit', metavar='LIMIT', help='maximum download rate in bytes per second (e.g. 50K or 4.2M)') -    downloader.add_option('-R', '--retries', -            dest='retries', metavar='RETRIES', help='number of retries (default is %default)', default=10) -    downloader.add_option('--buffer-size', -            dest='buffersize', metavar='SIZE', help='size of download buffer (e.g. 1024 or 16K) (default is %default)', default="1024") -    downloader.add_option('--no-resize-buffer', -            action='store_true', dest='noresizebuffer', -            help='do not automatically adjust the buffer size. 
By default, the buffer size is automatically resized from an initial value of SIZE.', default=False) -    downloader.add_option('--test', action='store_true', dest='test', default=False, help=optparse.SUPPRESS_HELP) +        '--include-ads', +        dest='include_ads', action='store_true', +        help='Download advertisements as well (experimental)') +    authentication = optparse.OptionGroup(parser, 'Authentication Options') +    authentication.add_option( +        '-u', '--username', +        dest='username', metavar='USERNAME', +        help='login with this account ID') +    authentication.add_option( +        '-p', '--password', +        dest='password', metavar='PASSWORD', +        help='account password') +    authentication.add_option( +        '-2', '--twofactor', +        dest='twofactor', metavar='TWOFACTOR', +        help='two-factor auth code') +    authentication.add_option( +        '-n', '--netrc', +        action='store_true', dest='usenetrc', default=False, +        help='use .netrc authentication data') +    authentication.add_option( +        '--video-password', +        dest='videopassword', metavar='PASSWORD', +        help='video password (vimeo, smotri)') + +    video_format = optparse.OptionGroup(parser, 'Video Format Options') +    video_format.add_option( +        '-f', '--format', +        action='store', dest='format', metavar='FORMAT', default=None, +        help='video format code, specify the order of preference using slashes: -f 22/17/18 .  -f mp4 , -f m4a and  -f flv  are also supported. You can also use the special names "best", "bestvideo", "bestaudio", "worst", "worstvideo" and "worstaudio". By default, youtube-dl will pick the best quality. 
Use commas to download multiple audio formats, such as  -f  136/137/mp4/bestvideo,140/m4a/bestaudio') +    video_format.add_option( +        '--all-formats', +        action='store_const', dest='format', const='all', +        help='download all available video formats') +    video_format.add_option( +        '--prefer-free-formats', +        action='store_true', dest='prefer_free_formats', default=False, +        help='prefer free video formats unless a specific one is requested') +    video_format.add_option( +        '--max-quality', +        action='store', dest='format_limit', metavar='FORMAT', +        help='highest quality format to download') +    video_format.add_option( +        '-F', '--list-formats', +        action='store_true', dest='listformats', +        help='list all available formats') +    video_format.add_option( +        '--youtube-include-dash-manifest', +        action='store_true', dest='youtube_include_dash_manifest', default=True, +        help=optparse.SUPPRESS_HELP) +    video_format.add_option( +        '--youtube-skip-dash-manifest', +        action='store_false', dest='youtube_include_dash_manifest', +        help='Do not download the DASH manifest on YouTube videos') + +    subtitles = optparse.OptionGroup(parser, 'Subtitle Options') +    subtitles.add_option( +        '--write-sub', '--write-srt', +        action='store_true', dest='writesubtitles', default=False, +        help='write subtitle file') +    subtitles.add_option( +        '--write-auto-sub', '--write-automatic-sub', +        action='store_true', dest='writeautomaticsub', default=False, +        help='write automatic subtitle file (youtube only)') +    subtitles.add_option( +        '--all-subs', +        action='store_true', dest='allsubtitles', default=False, +        help='downloads all the available subtitles of the video') +    subtitles.add_option( +        '--list-subs', +        action='store_true', dest='listsubtitles', default=False, +        help='lists all 
available subtitles for the video') +    subtitles.add_option( +        '--sub-format', +        action='store', dest='subtitlesformat', metavar='FORMAT', default='srt', +        help='subtitle format (default=srt) ([sbv/vtt] youtube only)') +    subtitles.add_option( +        '--sub-lang', '--sub-langs', '--srt-lang', +        action='callback', dest='subtitleslangs', metavar='LANGS', type='str', +        default=[], callback=_comma_separated_values_options_callback, +        help='languages of the subtitles to download (optional) separated by commas, use IETF language tags like \'en,pt\'') + +    downloader = optparse.OptionGroup(parser, 'Download Options') +    downloader.add_option( +        '-r', '--rate-limit', +        dest='ratelimit', metavar='LIMIT', +        help='maximum download rate in bytes per second (e.g. 50K or 4.2M)') +    downloader.add_option( +        '-R', '--retries', +        dest='retries', metavar='RETRIES', default=10, +        help='number of retries (default is %default)') +    downloader.add_option( +        '--buffer-size', +        dest='buffersize', metavar='SIZE', default='1024', +        help='size of download buffer (e.g. 1024 or 16K) (default is %default)') +    downloader.add_option( +        '--no-resize-buffer', +        action='store_true', dest='noresizebuffer', default=False, +        help='do not automatically adjust the buffer size. 
By default, the buffer size is automatically resized from an initial value of SIZE.') +    downloader.add_option( +        '--test', +        action='store_true', dest='test', default=False, +        help=optparse.SUPPRESS_HELP) + +    workarounds = optparse.OptionGroup(parser, 'Workarounds')      workarounds.add_option( -        '--encoding', dest='encoding', metavar='ENCODING', +        '--encoding', +        dest='encoding', metavar='ENCODING',          help='Force the specified encoding (experimental)')      workarounds.add_option( -        '--no-check-certificate', action='store_true', -        dest='no_check_certificate', default=False, +        '--no-check-certificate', +        action='store_true', dest='no_check_certificate', default=False,          help='Suppress HTTPS certificate validation.')      workarounds.add_option( -        '--prefer-insecure', '--prefer-unsecure', action='store_true', dest='prefer_insecure', +        '--prefer-insecure', +        '--prefer-unsecure', action='store_true', dest='prefer_insecure',          help='Use an unencrypted connection to retrieve information about the video. (Currently supported only for YouTube)')      workarounds.add_option( -        '--user-agent', metavar='UA', -        dest='user_agent', help='specify a custom user agent') +        '--user-agent', +        metavar='UA', dest='user_agent', +        help='specify a custom user agent')      workarounds.add_option( -        '--referer', metavar='REF', -        dest='referer', default=None, +        '--referer', +        metavar='URL', dest='referer', default=None,          help='specify a custom referer, use if the video access is restricted to one domain',      )      workarounds.add_option( -        '--add-header', metavar='FIELD:VALUE', -        dest='headers', action='append', +        '--add-header', +        metavar='FIELD:VALUE', dest='headers', action='append',          help='specify a custom HTTP header and its value, separated by a colon \':\'. 
You can use this option multiple times',      )      workarounds.add_option( -        '--bidi-workaround', dest='bidi_workaround', action='store_true', -        help=u'Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH') +        '--bidi-workaround', +        dest='bidi_workaround', action='store_true', +        help='Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH') -    verbosity.add_option('-q', '--quiet', -            action='store_true', dest='quiet', help='activates quiet mode', default=False) +    verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options') +    verbosity.add_option( +        '-q', '--quiet', +        action='store_true', dest='quiet', default=False, +        help='activates quiet mode')      verbosity.add_option(          '--no-warnings',          dest='no_warnings', action='store_true', default=False,          help='Ignore warnings') -    verbosity.add_option('-s', '--simulate', -            action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False) -    verbosity.add_option('--skip-download', -            action='store_true', dest='skip_download', help='do not download the video', default=False) -    verbosity.add_option('-g', '--get-url', -            action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False) -    verbosity.add_option('-e', '--get-title', -            action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False) -    verbosity.add_option('--get-id', -            action='store_true', dest='getid', help='simulate, quiet but print id', default=False) -    verbosity.add_option('--get-thumbnail', -            action='store_true', dest='getthumbnail', -            help='simulate, quiet but print thumbnail URL', default=False) -    verbosity.add_option('--get-description', -           
 action='store_true', dest='getdescription', -            help='simulate, quiet but print video description', default=False) -    verbosity.add_option('--get-duration', -            action='store_true', dest='getduration', -            help='simulate, quiet but print video length', default=False) -    verbosity.add_option('--get-filename', -            action='store_true', dest='getfilename', -            help='simulate, quiet but print output filename', default=False) -    verbosity.add_option('--get-format', -            action='store_true', dest='getformat', -            help='simulate, quiet but print output format', default=False) -    verbosity.add_option('-j', '--dump-json', -            action='store_true', dest='dumpjson', -            help='simulate, quiet but print JSON information. See --output for a description of available keys.', default=False) -    verbosity.add_option('--newline', -            action='store_true', dest='progress_with_newline', help='output progress bar as new lines', default=False) -    verbosity.add_option('--no-progress', -            action='store_true', dest='noprogress', help='do not print progress bar', default=False) -    verbosity.add_option('--console-title', -            action='store_true', dest='consoletitle', -            help='display progress in console titlebar', default=False) -    verbosity.add_option('-v', '--verbose', -            action='store_true', dest='verbose', help='print various debugging information', default=False) -    verbosity.add_option('--dump-intermediate-pages', -            action='store_true', dest='dump_intermediate_pages', default=False, -            help='print downloaded pages to debug problems (very verbose)') -    verbosity.add_option('--write-pages', -            action='store_true', dest='write_pages', default=False, -            help='Write downloaded intermediary pages to files in the current directory to debug problems') -    verbosity.add_option('--youtube-print-sig-code', -        
    action='store_true', dest='youtube_print_sig_code', default=False, -            help=optparse.SUPPRESS_HELP) -    verbosity.add_option('--print-traffic', -            dest='debug_printtraffic', action='store_true', default=False, -            help='Display sent and read HTTP traffic') - - -    filesystem.add_option('-a', '--batch-file', -            dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)') -    filesystem.add_option('--id', -            action='store_true', dest='useid', help='use only video ID in file name', default=False) -    filesystem.add_option('-A', '--auto-number', -            action='store_true', dest='autonumber', -            help='number downloaded files starting from 00000', default=False) -    filesystem.add_option('-o', '--output', -            dest='outtmpl', metavar='TEMPLATE', -            help=('output filename template. Use %(title)s to get the title, ' -                  '%(uploader)s for the uploader name, %(uploader_id)s for the uploader nickname if different, ' -                  '%(autonumber)s to get an automatically incremented number, ' -                  '%(ext)s for the filename extension, ' -                  '%(format)s for the format description (like "22 - 1280x720" or "HD"), ' -                  '%(format_id)s for the unique id of the format (like Youtube\'s itags: "137"), ' -                  '%(upload_date)s for the upload date (YYYYMMDD), ' -                  '%(extractor)s for the provider (youtube, metacafe, etc), ' -                  '%(id)s for the video id, %(playlist)s for the playlist the video is in, ' -                  '%(playlist_index)s for the position in the playlist and %% for a literal percent. ' -                  '%(height)s and %(width)s for the width and height of the video format. ' -                  '%(resolution)s for a textual description of the resolution of the video format. ' -                  'Use - to output to stdout. 
Can also be used to download to a different directory, ' -                  'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .')) -    filesystem.add_option('--autonumber-size', -            dest='autonumber_size', metavar='NUMBER', -            help='Specifies the number of digits in %(autonumber)s when it is present in output filename template or --auto-number option is given') -    filesystem.add_option('--restrict-filenames', -            action='store_true', dest='restrictfilenames', -            help='Restrict filenames to only ASCII characters, and avoid "&" and spaces in filenames', default=False) -    filesystem.add_option('-t', '--title', -            action='store_true', dest='usetitle', help='[deprecated] use title in file name (default)', default=False) -    filesystem.add_option('-l', '--literal', -            action='store_true', dest='usetitle', help='[deprecated] alias of --title', default=False) -    filesystem.add_option('-w', '--no-overwrites', -            action='store_true', dest='nooverwrites', help='do not overwrite files', default=False) -    filesystem.add_option('-c', '--continue', -            action='store_true', dest='continue_dl', help='force resume of partially downloaded files. 
By default, youtube-dl will resume downloads if possible.', default=True) -    filesystem.add_option('--no-continue', -            action='store_false', dest='continue_dl', -            help='do not resume partially downloaded files (restart from beginning)') -    filesystem.add_option('--no-part', -            action='store_true', dest='nopart', help='do not use .part files', default=False) -    filesystem.add_option('--no-mtime', -            action='store_false', dest='updatetime', -            help='do not use the Last-modified header to set the file modification time', default=True) -    filesystem.add_option('--write-description', -            action='store_true', dest='writedescription', -            help='write video description to a .description file', default=False) -    filesystem.add_option('--write-info-json', -            action='store_true', dest='writeinfojson', -            help='write video metadata to a .info.json file', default=False) -    filesystem.add_option('--write-annotations', -            action='store_true', dest='writeannotations', -            help='write video annotations to a .annotation file', default=False) -    filesystem.add_option('--write-thumbnail', -            action='store_true', dest='writethumbnail', -            help='write thumbnail image to disk', default=False) -    filesystem.add_option('--load-info', -            dest='load_info_filename', metavar='FILE', -            help='json file containing the video information (created with the "--write-json" option)') -    filesystem.add_option('--cookies', -            dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in') +    verbosity.add_option( +        '-s', '--simulate', +        action='store_true', dest='simulate', default=False, +        help='do not download the video and do not write anything to disk',) +    verbosity.add_option( +        '--skip-download', +        action='store_true', dest='skip_download', default=False, +  
      help='do not download the video',) +    verbosity.add_option( +        '-g', '--get-url', +        action='store_true', dest='geturl', default=False, +        help='simulate, quiet but print URL') +    verbosity.add_option( +        '-e', '--get-title', +        action='store_true', dest='gettitle', default=False, +        help='simulate, quiet but print title') +    verbosity.add_option( +        '--get-id', +        action='store_true', dest='getid', default=False, +        help='simulate, quiet but print id') +    verbosity.add_option( +        '--get-thumbnail', +        action='store_true', dest='getthumbnail', default=False, +        help='simulate, quiet but print thumbnail URL') +    verbosity.add_option( +        '--get-description', +        action='store_true', dest='getdescription', default=False, +        help='simulate, quiet but print video description') +    verbosity.add_option( +        '--get-duration', +        action='store_true', dest='getduration', default=False, +        help='simulate, quiet but print video length') +    verbosity.add_option( +        '--get-filename', +        action='store_true', dest='getfilename', default=False, +        help='simulate, quiet but print output filename') +    verbosity.add_option( +        '--get-format', +        action='store_true', dest='getformat', default=False, +        help='simulate, quiet but print output format') +    verbosity.add_option( +        '-j', '--dump-json', +        action='store_true', dest='dumpjson', default=False, +        help='simulate, quiet but print JSON information. See --output for a description of available keys.') +    verbosity.add_option( +        '-J', '--dump-single-json', +        action='store_true', dest='dump_single_json', default=False, +        help='simulate, quiet but print JSON information for each command-line argument. 
If the URL refers to a playlist, dump the whole playlist information in a single line.') +    verbosity.add_option( +        '--newline', +        action='store_true', dest='progress_with_newline', default=False, +        help='output progress bar as new lines') +    verbosity.add_option( +        '--no-progress', +        action='store_true', dest='noprogress', default=False, +        help='do not print progress bar') +    verbosity.add_option( +        '--console-title', +        action='store_true', dest='consoletitle', default=False, +        help='display progress in console titlebar') +    verbosity.add_option( +        '-v', '--verbose', +        action='store_true', dest='verbose', default=False, +        help='print various debugging information') +    verbosity.add_option( +        '--dump-intermediate-pages', +        action='store_true', dest='dump_intermediate_pages', default=False, +        help='print downloaded pages to debug problems (very verbose)') +    verbosity.add_option( +        '--write-pages', +        action='store_true', dest='write_pages', default=False, +        help='Write downloaded intermediary pages to files in the current directory to debug problems') +    verbosity.add_option( +        '--youtube-print-sig-code', +        action='store_true', dest='youtube_print_sig_code', default=False, +        help=optparse.SUPPRESS_HELP) +    verbosity.add_option( +        '--print-traffic', +        dest='debug_printtraffic', action='store_true', default=False, +        help='Display sent and read HTTP traffic') + +    filesystem = optparse.OptionGroup(parser, 'Filesystem Options') +    filesystem.add_option( +        '-a', '--batch-file', +        dest='batchfile', metavar='FILE', +        help='file containing URLs to download (\'-\' for stdin)') +    filesystem.add_option( +        '--id', default=False, +        action='store_true', dest='useid', help='use only video ID in file name') +    filesystem.add_option( +        '-A', 
'--auto-number', +        action='store_true', dest='autonumber', default=False, +        help='number downloaded files starting from 00000') +    filesystem.add_option( +        '-o', '--output', +        dest='outtmpl', metavar='TEMPLATE', +        help=('output filename template. Use %(title)s to get the title, ' +              '%(uploader)s for the uploader name, %(uploader_id)s for the uploader nickname if different, ' +              '%(autonumber)s to get an automatically incremented number, ' +              '%(ext)s for the filename extension, ' +              '%(format)s for the format description (like "22 - 1280x720" or "HD"), ' +              '%(format_id)s for the unique id of the format (like Youtube\'s itags: "137"), ' +              '%(upload_date)s for the upload date (YYYYMMDD), ' +              '%(extractor)s for the provider (youtube, metacafe, etc), ' +              '%(id)s for the video id, %(playlist)s for the playlist the video is in, ' +              '%(playlist_index)s for the position in the playlist and %% for a literal percent. ' +              '%(height)s and %(width)s for the width and height of the video format. ' +              '%(resolution)s for a textual description of the resolution of the video format. ' +              'Use - to output to stdout. 
Can also be used to download to a different directory, ' +              'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .')) +    filesystem.add_option( +        '--autonumber-size', +        dest='autonumber_size', metavar='NUMBER', +        help='Specifies the number of digits in %(autonumber)s when it is present in output filename template or --auto-number option is given') +    filesystem.add_option( +        '--restrict-filenames', +        action='store_true', dest='restrictfilenames', default=False, +        help='Restrict filenames to only ASCII characters, and avoid "&" and spaces in filenames') +    filesystem.add_option( +        '-t', '--title', +        action='store_true', dest='usetitle', default=False, +        help='[deprecated] use title in file name (default)') +    filesystem.add_option( +        '-l', '--literal', default=False, +        action='store_true', dest='usetitle', +        help='[deprecated] alias of --title') +    filesystem.add_option( +        '-w', '--no-overwrites', +        action='store_true', dest='nooverwrites', default=False, +        help='do not overwrite files') +    filesystem.add_option( +        '-c', '--continue', +        action='store_true', dest='continue_dl', default=True, +        help='force resume of partially downloaded files. 
By default, youtube-dl will resume downloads if possible.') +    filesystem.add_option( +        '--no-continue', +        action='store_false', dest='continue_dl', +        help='do not resume partially downloaded files (restart from beginning)') +    filesystem.add_option( +        '--no-part', +        action='store_true', dest='nopart', default=False, +        help='do not use .part files - write directly into output file') +    filesystem.add_option( +        '--no-mtime', +        action='store_false', dest='updatetime', default=True, +        help='do not use the Last-modified header to set the file modification time') +    filesystem.add_option( +        '--write-description', +        action='store_true', dest='writedescription', default=False, +        help='write video description to a .description file') +    filesystem.add_option( +        '--write-info-json', +        action='store_true', dest='writeinfojson', default=False, +        help='write video metadata to a .info.json file') +    filesystem.add_option( +        '--write-annotations', +        action='store_true', dest='writeannotations', default=False, +        help='write video annotations to a .annotation file') +    filesystem.add_option( +        '--write-thumbnail', +        action='store_true', dest='writethumbnail', default=False, +        help='write thumbnail image to disk') +    filesystem.add_option( +        '--load-info', +        dest='load_info_filename', metavar='FILE', +        help='json file containing the video information (created with the "--write-json" option)') +    filesystem.add_option( +        '--cookies', +        dest='cookiefile', metavar='FILE', +        help='file to read cookies from and dump cookie jar in')      filesystem.add_option(          '--cache-dir', dest='cachedir', default=None, metavar='DIR',          help='Location in the filesystem where youtube-dl can store some downloaded information permanently. 
By default $XDG_CACHE_HOME/youtube-dl or ~/.cache/youtube-dl . At the moment, only YouTube player files (for videos with obfuscated signatures) are cached, but that may change.') @@ -414,36 +552,61 @@ def parseOpts(overrideArguments=None):          '--no-cache-dir', action='store_const', const=False, dest='cachedir',          help='Disable filesystem caching')      filesystem.add_option( -        '--rm-cache-dir', action='store_true', dest='rm_cachedir', +        '--rm-cache-dir', +        action='store_true', dest='rm_cachedir',          help='Delete all filesystem cache files') - -    postproc.add_option('-x', '--extract-audio', action='store_true', dest='extractaudio', default=False, -            help='convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe)') -    postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best', -            help='"best", "aac", "vorbis", "mp3", "m4a", "opus", or "wav"; best by default') -    postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='5', -            help='ffmpeg/avconv audio quality specification, insert a value between 0 (better) and 9 (worse) for VBR or a specific bitrate like 128K (default 5)') -    postproc.add_option('--recode-video', metavar='FORMAT', dest='recodevideo', default=None, -            help='Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv)') -    postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False, -            help='keeps the video file on disk after the post-processing; the video is erased by default') -    postproc.add_option('--no-post-overwrites', action='store_true', dest='nopostoverwrites', default=False, -            help='do not overwrite post-processed files; the post-processed files are overwritten by default') -    postproc.add_option('--embed-subs', action='store_true', dest='embedsubtitles', default=False, 
-            help='embed subtitles in the video (only for mp4 videos)') -    postproc.add_option('--embed-thumbnail', action='store_true', dest='embedthumbnail', default=False, -            help='embed thumbnail in the audio as cover art') -    postproc.add_option('--add-metadata', action='store_true', dest='addmetadata', default=False, -            help='write metadata to the video file') -    postproc.add_option('--xattrs', action='store_true', dest='xattrs', default=False, -            help='write metadata to the video file\'s xattrs (using dublin core and xdg standards)') -    postproc.add_option('--prefer-avconv', action='store_false', dest='prefer_ffmpeg', +    postproc = optparse.OptionGroup(parser, 'Post-processing Options') +    postproc.add_option( +        '-x', '--extract-audio', +        action='store_true', dest='extractaudio', default=False, +        help='convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe)') +    postproc.add_option( +        '--audio-format', metavar='FORMAT', dest='audioformat', default='best', +        help='"best", "aac", "vorbis", "mp3", "m4a", "opus", or "wav"; "%default" by default') +    postproc.add_option( +        '--audio-quality', metavar='QUALITY', +        dest='audioquality', default='5', +        help='ffmpeg/avconv audio quality specification, insert a value between 0 (better) and 9 (worse) for VBR or a specific bitrate like 128K (default %default)') +    postproc.add_option( +        '--recode-video', +        metavar='FORMAT', dest='recodevideo', default=None, +        help='Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv)') +    postproc.add_option( +        '-k', '--keep-video', +        action='store_true', dest='keepvideo', default=False, +        help='keeps the video file on disk after the post-processing; the video is erased by default') +    postproc.add_option( +        '--no-post-overwrites', +        action='store_true', 
dest='nopostoverwrites', default=False, +        help='do not overwrite post-processed files; the post-processed files are overwritten by default') +    postproc.add_option( +        '--embed-subs', +        action='store_true', dest='embedsubtitles', default=False, +        help='embed subtitles in the video (only for mp4 videos)') +    postproc.add_option( +        '--embed-thumbnail', +        action='store_true', dest='embedthumbnail', default=False, +        help='embed thumbnail in the audio as cover art') +    postproc.add_option( +        '--add-metadata', +        action='store_true', dest='addmetadata', default=False, +        help='write metadata to the video file') +    postproc.add_option( +        '--xattrs', +        action='store_true', dest='xattrs', default=False, +        help='write metadata to the video file\'s xattrs (using dublin core and xdg standards)') +    postproc.add_option( +        '--prefer-avconv', +        action='store_false', dest='prefer_ffmpeg',          help='Prefer avconv over ffmpeg for running the postprocessors (default)') -    postproc.add_option('--prefer-ffmpeg', action='store_true', dest='prefer_ffmpeg', +    postproc.add_option( +        '--prefer-ffmpeg', +        action='store_true', dest='prefer_ffmpeg',          help='Prefer ffmpeg over avconv for running the postprocessors')      postproc.add_option( -        '--exec', metavar='CMD', dest='exec_cmd', +        '--exec', +        metavar='CMD', dest='exec_cmd',          help='Execute a command on the file after downloading, similar to find\'s -exec syntax. 
Example: --exec \'adb push {} /sdcard/Music/ && rm {}\'' )      parser.add_option_group(general) @@ -460,7 +623,7 @@ def parseOpts(overrideArguments=None):      if overrideArguments is not None:          opts, args = parser.parse_args(overrideArguments)          if opts.verbose: -            write_string(u'[debug] Override config: ' + repr(overrideArguments) + '\n') +            write_string('[debug] Override config: ' + repr(overrideArguments) + '\n')      else:          commandLineConf = sys.argv[1:]          if '--ignore-config' in commandLineConf: @@ -476,8 +639,8 @@ def parseOpts(overrideArguments=None):          opts, args = parser.parse_args(argv)          if opts.verbose: -            write_string(u'[debug] System config: ' + repr(_hide_login_info(systemConf)) + '\n') -            write_string(u'[debug] User config: ' + repr(_hide_login_info(userConf)) + '\n') -            write_string(u'[debug] Command-line args: ' + repr(_hide_login_info(commandLineConf)) + '\n') +            write_string('[debug] System config: ' + repr(_hide_login_info(systemConf)) + '\n') +            write_string('[debug] User config: ' + repr(_hide_login_info(userConf)) + '\n') +            write_string('[debug] Command-line args: ' + repr(_hide_login_info(commandLineConf)) + '\n')      return parser, opts, args diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 8c5f7c43b..6f010a9c7 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -487,7 +487,7 @@ class FFmpegMetadataPP(FFmpegPostProcessor):  class FFmpegMergerPP(FFmpegPostProcessor):      def run(self, info):          filename = info['filepath'] -        args = ['-c', 'copy'] +        args = ['-c', 'copy', '-map', '0:v:0', '-map', '1:a:0', '-shortest']          self._downloader.to_screen(u'[ffmpeg] Merging formats into "%s"' % filename)          self.run_ffmpeg_multiple_files(info['__files_to_merge'], filename, args)          return True, info diff 
--git a/youtube_dl/utils.py b/youtube_dl/utils.py index 526d2cc02..9287edd8d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -749,6 +749,8 @@ class ExtractorError(Exception):              expected = True          if video_id is not None:              msg = video_id + ': ' + msg +        if cause: +            msg += u' (caused by %r)' % cause          if not expected:              msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'          super(ExtractorError, self).__init__(msg) @@ -968,6 +970,7 @@ def unified_strdate(date_str):          '%Y/%m/%d %H:%M:%S',          '%d/%m/%Y %H:%M:%S',          '%Y-%m-%d %H:%M:%S', +        '%Y-%m-%d %H:%M:%S.%f',          '%d.%m.%Y %H:%M',          '%d.%m.%Y %H.%M',          '%Y-%m-%dT%H:%M:%SZ', @@ -1651,33 +1654,37 @@ US_RATINGS = {  } +def parse_age_limit(s): +    if s is None: +        return None +    m = re.match(r'^(?P<age>\d{1,2})\+?$', s) +    return int(m.group('age')) if m else US_RATINGS.get(s, None) + +  def strip_jsonp(code):      return re.sub(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code)  def js_to_json(code):      def fix_kv(m): -        key = m.group(2) -        if key.startswith("'"): -            assert key.endswith("'") -            assert '"' not in key -            key = '"%s"' % key[1:-1] -        elif not key.startswith('"'): -            key = '"%s"' % key - -        value = m.group(4) -        if value.startswith("'"): -            assert value.endswith("'") -            assert '"' not in value -            value = '"%s"' % value[1:-1] - -        return m.group(1) + key + m.group(3) + value +        v = m.group(0) +        if v in ('true', 'false', 'null'): +            return v +        if v.startswith('"'): +            return v +        if v.startswith("'"): +            v = v[1:-1] +            
v = re.sub(r"\\\\|\\'|\"", lambda m: { +                '\\\\': '\\\\', +                "\\'": "'", +                '"': '\\"', +            }[m.group(0)], v) +        return '"%s"' % v      res = re.sub(r'''(?x) -            ([{,]\s*) -            ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+) -            (:\s*) -            ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{) +        "(?:[^"\\]*(?:\\\\|\\")?)*"| +        '(?:[^'\\]*(?:\\\\|\\')?)*'| +        [a-zA-Z_][a-zA-Z_0-9]*          ''', fix_kv, code)      res = re.sub(r',(\s*\])', lambda m: m.group(1), res)      return res diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 1384b496b..d822ae330 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.09.29.2' +__version__ = '2014.10.25' | 
