diff options
Diffstat (limited to 'youtube_dl')
38 files changed, 1716 insertions, 239 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 14a1d06ab..a671d6450 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -162,6 +162,7 @@ class YoutubeDL(object):      default_search:    Prepend this string if an input url is not valid.                         'auto' for elaborate guessing      encoding:          Use this encoding instead of the system-specified. +    extract_flat:      Do not resolve URLs, return the immediate result.      The following parameters are not used by YoutubeDL itself, they are used by      the FileDownloader: @@ -479,7 +480,10 @@ class YoutubeDL(object):                  return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)          age_limit = self.params.get('age_limit')          if age_limit is not None: -            if age_limit < info_dict.get('age_limit', 0): +            actual_age_limit = info_dict.get('age_limit') +            if actual_age_limit is None: +                actual_age_limit = 0 +            if age_limit < actual_age_limit:                  return 'Skipping "' + title + '" because it is age restricted'          if self.in_download_archive(info_dict):              return '%s has already been recorded in archive' % video_title @@ -558,7 +562,12 @@ class YoutubeDL(object):          Returns the resolved ie_result.          
""" -        result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system +        result_type = ie_result.get('_type', 'video') + +        if self.params.get('extract_flat', False): +            if result_type in ('url', 'url_transparent'): +                return ie_result +          if result_type == 'video':              self.add_extra_info(ie_result, extra_info)              return self.process_video_result(ie_result, download=download) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 80de211e7..a96bf9b5c 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -69,6 +69,10 @@ __authors__  = (      'Dobrosław Żybort',      'David Fabijan',      'Sebastian Haas', +    'Alexander Kirk', +    'Erik Johnson', +    'Keith Beckman', +    'Ole Ernst',  )  __license__ = 'Public Domain' diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index f79e6a995..d01d1897e 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -27,8 +27,16 @@ class HttpFD(FileDownloader):              headers['Youtubedl-user-agent'] = info_dict['user_agent']          if 'http_referer' in info_dict:              headers['Referer'] = info_dict['http_referer'] -        basic_request = compat_urllib_request.Request(url, None, headers) -        request = compat_urllib_request.Request(url, None, headers) +        add_headers = info_dict.get('http_headers') +        if add_headers: +            headers.update(add_headers) +        data = info_dict.get('http_post_data') +        http_method = info_dict.get('http_method') +        basic_request = compat_urllib_request.Request(url, data, headers) +        request = compat_urllib_request.Request(url, data, headers) +        if http_method is not None: +            basic_request.get_method = lambda: http_method +            request.get_method = lambda: http_method          is_test = self.params.get('test', False) diff --git 
a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 27602e0c0..de6e8ee30 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -69,6 +69,7 @@ from .dfb import DFBIE  from .dotsub import DotsubIE  from .dreisat import DreiSatIE  from .drtv import DRTVIE +from .dump import DumpIE  from .defense import DefenseGouvFrIE  from .discovery import DiscoveryIE  from .divxstage import DivxStageIE @@ -77,12 +78,17 @@ from .ebaumsworld import EbaumsWorldIE  from .ehow import EHowIE  from .eighttracks import EightTracksIE  from .eitb import EitbIE +from .ellentv import ( +    EllenTVIE, +    EllenTVClipsIE, +)  from .elpais import ElPaisIE  from .empflix import EmpflixIE  from .engadget import EngadgetIE  from .escapist import EscapistIE  from .everyonesmixtape import EveryonesMixtapeIE  from .exfm import ExfmIE +from .expotv import ExpoTVIE  from .extremetube import ExtremeTubeIE  from .facebook import FacebookIE  from .faz import FazIE @@ -110,7 +116,10 @@ from .freesound import FreesoundIE  from .freespeech import FreespeechIE  from .funnyordie import FunnyOrDieIE  from .gamekings import GamekingsIE -from .gameone import GameOneIE +from .gameone import ( +    GameOneIE, +    GameOnePlaylistIE, +)  from .gamespot import GameSpotIE  from .gamestar import GameStarIE  from .gametrailers import GametrailersIE @@ -121,6 +130,7 @@ from .googleplus import GooglePlusIE  from .googlesearch import GoogleSearchIE  from .gorillavid import GorillaVidIE  from .goshgay import GoshgayIE +from .grooveshark import GroovesharkIE  from .hark import HarkIE  from .helsinki import HelsinkiIE  from .hentaistigma import HentaiStigmaIE @@ -147,6 +157,7 @@ from .ivi import (  from .izlesene import IzleseneIE  from .jadorecettepub import JadoreCettePubIE  from .jeuxvideo import JeuxVideoIE +from .jove import JoveIE  from .jukebox import JukeboxIE  from .justintv import JustinTVIE  from .jpopsukitv import JpopsukiIE @@ -177,7 +188,9 @@ from 
.malemotion import MalemotionIE  from .mdr import MDRIE  from .metacafe import MetacafeIE  from .metacritic import MetacriticIE +from .ministrygrid import MinistryGridIE  from .mit import TechTVMITIE, MITIE, OCWMITIE +from .mitele import MiTeleIE  from .mixcloud import MixcloudIE  from .mlb import MLBIE  from .mpora import MporaIE @@ -187,6 +200,7 @@ from .mooshare import MooshareIE  from .morningstar import MorningstarIE  from .motherless import MotherlessIE  from .motorsport import MotorsportIE +from .movieclips import MovieClipsIE  from .moviezine import MoviezineIE  from .movshare import MovShareIE  from .mtv import ( @@ -233,8 +247,10 @@ from .orf import (      ORFFM4IE,  )  from .parliamentliveuk import ParliamentLiveUKIE +from .patreon import PatreonIE  from .pbs import PBSIE  from .photobucket import PhotobucketIE +from .playfm import PlayFMIE  from .playvid import PlayvidIE  from .podomatic import PodomaticIE  from .pornhd import PornHdIE @@ -252,9 +268,10 @@ from .ro220 import Ro220IE  from .rottentomatoes import RottenTomatoesIE  from .roxwel import RoxwelIE  from .rtbf import RTBFIE +from .rtlnl import RtlXlIE  from .rtlnow import RTLnowIE  from .rts import RTSIE -from .rtve import RTVEALaCartaIE +from .rtve import RTVEALaCartaIE, RTVELiveIE  from .ruhd import RUHDIE  from .rutube import (      RutubeIE, @@ -265,6 +282,7 @@ from .rutube import (  from .rutv import RUTVIE  from .sapo import SapoIE  from .savefrom import SaveFromIE +from .sbs import SBSIE  from .scivee import SciVeeIE  from .screencast import ScreencastIE  from .servingsys import ServingSysIE @@ -377,6 +395,7 @@ from .vuclip import VuClipIE  from .vulture import VultureIE  from .washingtonpost import WashingtonPostIE  from .wat import WatIE +from .wayofthemaster import WayOfTheMasterIE  from .wdr import (      WDRIE,      WDRMobileIE, diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index 7e93bc4df..748608826 100644 --- a/youtube_dl/extractor/aparat.py +++ 
b/youtube_dl/extractor/aparat.py @@ -1,5 +1,7 @@  #coding: utf-8 +from __future__ import unicode_literals +  import re  from .common import InfoExtractor @@ -13,13 +15,14 @@ class AparatIE(InfoExtractor):      _VALID_URL = r'^https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)'      _TEST = { -        u'url': u'http://www.aparat.com/v/wP8On', -        u'file': u'wP8On.mp4', -        u'md5': u'6714e0af7e0d875c5a39c4dc4ab46ad1', -        u'info_dict': { -            u"title": u"تیم گلکسی 11 - زومیت", +        'url': 'http://www.aparat.com/v/wP8On', +        'md5': '6714e0af7e0d875c5a39c4dc4ab46ad1', +        'info_dict': { +            'id': 'wP8On', +            'ext': 'mp4', +            'title': 'تیم گلکسی 11 - زومیت',          }, -        #u'skip': u'Extremely unreliable', +        # 'skip': 'Extremely unreliable',      }      def _real_extract(self, url): @@ -29,8 +32,8 @@ class AparatIE(InfoExtractor):          # Note: There is an easier-to-parse configuration at          # http://www.aparat.com/video/video/config/videohash/%video_id          # but the URL in there does not work -        embed_url = (u'http://www.aparat.com/video/video/embed/videohash/' + -                     video_id + u'/vt/frame') +        embed_url = ('http://www.aparat.com/video/video/embed/videohash/' + +                     video_id + '/vt/frame')          webpage = self._download_webpage(embed_url, video_id)          video_urls = re.findall(r'fileList\[[0-9]+\]\s*=\s*"([^"]+)"', webpage) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index d86dbba8e..1c72b2ff6 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -177,16 +177,26 @@ class ArteTVPlus7IE(InfoExtractor):  # It also uses the arte_vp_url url from the webpage to extract the information  class ArteTVCreativeIE(ArteTVPlus7IE):      IE_NAME = 'arte.tv:creative' -    _VALID_URL = 
r'https?://creative\.arte\.tv/(?P<lang>fr|de)/magazine?/(?P<id>.+)' +    _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de)/(?:magazine?/)?(?P<id>[^?#]+)' -    _TEST = { +    _TESTS = [{          'url': 'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design',          'info_dict': { -            'id': '050489-002', +            'id': '72176',              'ext': 'mp4', -            'title': 'Agentur Amateur / Agence Amateur #2 : Corporate Design', +            'title': 'Folge 2 - Corporate Design', +            'upload_date': '20131004',          }, -    } +    }, { +        'url': 'http://creative.arte.tv/fr/Monty-Python-Reunion', +        'info_dict': { +            'id': '160676', +            'ext': 'mp4', +            'title': 'Monty Python live (mostly)', +            'description': 'Événement ! Quarante-cinq ans après leurs premiers succès, les légendaires Monty Python remontent sur scène.\n', +            'upload_date': '20140805', +        } +    }]  class ArteTVFutureIE(ArteTVPlus7IE): diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index acfc4ad73..261ead98f 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -15,7 +15,7 @@ from ..utils import (  class BlipTVIE(SubtitlesInfoExtractor): -    _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P<id>\d+)|((?:play/|api\.swf#)(?P<lookup_id>[\da-zA-Z+]+)))' +    _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P<id>\d+)|((?:play/|api\.swf#)(?P<lookup_id>[\da-zA-Z+_TESTS]+)))'      _TESTS = [          { @@ -49,6 +49,21 @@ class BlipTVIE(SubtitlesInfoExtractor):                  'uploader_id': '792887',                  'duration': 279,              } +        }, +        { +            # https://bugzilla.redhat.com/show_bug.cgi?id=967465 +            'url': 'http://a.blip.tv/api.swf#h6Uag5KbVwI', +            'md5': '314e87b1ebe7a48fcbfdd51b791ce5a6', +            'info_dict': { +                
'id': '6573122', +                'ext': 'mov', +                'upload_date': '20130520', +                'description': 'Two hapless space marines argue over what to do when they realize they have an astronomically huge problem on their hands.', +                'title': 'Red vs. Blue Season 11 Trailer', +                'timestamp': 1369029609, +                'uploader': 'redvsblue', +                'uploader_id': '792887', +            }          }      ] @@ -150,7 +165,7 @@ class BlipTVIE(SubtitlesInfoExtractor):  class BlipTVUserIE(InfoExtractor): -    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$' +    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)(?!api\.swf)([^/]+)/*$'      _PAGE_SIZE = 12      IE_NAME = 'blip.tv:user' diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 419951b62..294670386 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -154,12 +154,14 @@ class BrightcoveIE(InfoExtractor):      def _extract_brightcove_urls(cls, webpage):          """Return a list of all Brightcove URLs from the webpage """ -        url_m = re.search(r'<meta\s+property="og:video"\s+content="(http://c.brightcove.com/[^"]+)"', webpage) +        url_m = re.search( +            r'<meta\s+property="og:video"\s+content="(https?://(?:secure|c)\.brightcove.com/[^"]+)"', +            webpage)          if url_m:              url = unescapeHTML(url_m.group(1))              # Some sites don't add it, we can't download with this url, for example:              # http://www.ktvu.com/videos/news/raw-video-caltrain-releases-video-of-man-almost/vCTZdY/ -            if 'playerKey' in url: +            if 'playerKey' in url or 'videoId' in url:                  return [url]          matches = re.findall( @@ -188,9 +190,13 @@ class BrightcoveIE(InfoExtractor):              referer = smuggled_data.get('Referer', url)              return 
self._get_video_info(                  videoPlayer[0], query_str, query, referer=referer) -        else: +        elif 'playerKey' in query:              player_key = query['playerKey']              return self._get_playlist_info(player_key[0]) +        else: +            raise ExtractorError( +                'Cannot find playerKey= variable. Did you forget quotes in a shell invocation?', +                expected=True)      def _get_video_info(self, video_id, query_str, query, referer=None):          request_url = self._FEDERATED_URL_TEMPLATE % query_str @@ -202,6 +208,13 @@ class BrightcoveIE(InfoExtractor):              req.add_header('Referer', referer)          webpage = self._download_webpage(req, video_id) +        error_msg = self._html_search_regex( +            r"<h1>We're sorry.</h1>\s*<p>(.*?)</p>", webpage, +            'error message', default=None) +        if error_msg is not None: +            raise ExtractorError( +                'brightcove said: %s' % error_msg, expected=True) +          self.report_extraction(video_id)          info = self._search_regex(r'var experienceJSON = ({.*});', webpage, 'json')          info = json.loads(info)['data'] diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 45a17f8ad..4d5b48167 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -84,6 +84,12 @@ class InfoExtractor(object):                                   format, irrespective of the file format.                                   -1 for default (order by other properties),                                   -2 or smaller for less than default. +                    * http_referer  HTTP Referer header value to set. +                    * http_method  HTTP method to use for the download. +                    * http_headers  A dictionary of additional HTTP headers +                                 to add to the request. 
+                    * http_post_data  Additional data to send with a POST +                                 request.      url:            Final video URL.      ext:            Video filename extension.      format:         The video format, defaults to ext (used for --get-format) @@ -479,8 +485,9 @@ class InfoExtractor(object):          return self._og_search_property('title', html, **kargs)      def _og_search_video_url(self, html, name='video url', secure=True, **kargs): -        regexes = self._og_regexes('video') -        if secure: regexes = self._og_regexes('video:secure_url') + regexes +        regexes = self._og_regexes('video') + self._og_regexes('video:url') +        if secure: +            regexes = self._og_regexes('video:secure_url') + regexes          return self._html_search_regex(regexes, html, name, **kargs)      def _og_search_url(self, html, **kargs): diff --git a/youtube_dl/extractor/dump.py b/youtube_dl/extractor/dump.py new file mode 100644 index 000000000..6b651778a --- /dev/null +++ b/youtube_dl/extractor/dump.py @@ -0,0 +1,39 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class DumpIE(InfoExtractor): +    _VALID_URL = r'^https?://(?:www\.)?dump\.com/(?P<id>[a-zA-Z0-9]+)/' + +    _TEST = { +        'url': 'http://www.dump.com/oneus/', +        'md5': 'ad71704d1e67dfd9e81e3e8b42d69d99', +        'info_dict': { +            'id': 'oneus', +            'ext': 'flv', +            'title': "He's one of us.", +            'thumbnail': 're:^https?://.*\.jpg$', +        }, +    } + +    def _real_extract(self, url): +        m = re.match(self._VALID_URL, url) +        video_id = m.group('id') + +        webpage = self._download_webpage(url, video_id) +        video_url = self._search_regex( +            r's1.addVariable\("file",\s*"([^"]+)"', webpage, 'video URL') + +        thumb = self._og_search_thumbnail(webpage) +        title = self._search_regex(r'<b>([^"]+)</b>', 
webpage, 'title') + +        return { +            'id': video_id, +            'title': title, +            'url': video_url, +            'thumbnail': thumb, +        } diff --git a/youtube_dl/extractor/ebaumsworld.py b/youtube_dl/extractor/ebaumsworld.py index 877113d63..63c2549d3 100644 --- a/youtube_dl/extractor/ebaumsworld.py +++ b/youtube_dl/extractor/ebaumsworld.py @@ -1,19 +1,21 @@ +from __future__ import unicode_literals +  import re  from .common import InfoExtractor -from ..utils import determine_ext  class EbaumsWorldIE(InfoExtractor):      _VALID_URL = r'https?://www\.ebaumsworld\.com/video/watch/(?P<id>\d+)'      _TEST = { -        u'url': u'http://www.ebaumsworld.com/video/watch/83367677/', -        u'file': u'83367677.mp4', -        u'info_dict': { -            u'title': u'A Giant Python Opens The Door', -            u'description': u'This is how nightmares start...', -            u'uploader': u'jihadpizza', +        'url': 'http://www.ebaumsworld.com/video/watch/83367677/', +        'info_dict': { +            'id': '83367677', +            'ext': 'mp4', +            'title': 'A Giant Python Opens The Door', +            'description': 'This is how nightmares start...', +            'uploader': 'jihadpizza',          },      } @@ -28,7 +30,6 @@ class EbaumsWorldIE(InfoExtractor):              'id': video_id,              'title': config.find('title').text,              'url': video_url, -            'ext': determine_ext(video_url),              'description': config.find('description').text,              'thumbnail': config.find('image').text,              'uploader': config.find('username').text, diff --git a/youtube_dl/extractor/ellentv.py b/youtube_dl/extractor/ellentv.py new file mode 100644 index 000000000..3e7923648 --- /dev/null +++ b/youtube_dl/extractor/ellentv.py @@ -0,0 +1,79 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..utils import ( +    
ExtractorError, +    parse_iso8601, +) + + +class EllenTVIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?ellentv\.com/videos/(?P<id>[a-z0-9_-]+)' +    _TEST = { +        'url': 'http://www.ellentv.com/videos/0-7jqrsr18/', +        'md5': 'e4af06f3bf0d5f471921a18db5764642', +        'info_dict': { +            'id': '0-7jqrsr18', +            'ext': 'mp4', +            'title': 'What\'s Wrong with These Photos? A Whole Lot', +            'timestamp': 1406876400, +            'upload_date': '20140801', +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) +        timestamp = parse_iso8601(self._search_regex( +            r'<span class="publish-date"><time datetime="([^"]+)">', +            webpage, 'timestamp')) + +        return { +            'id': video_id, +            'title': self._og_search_title(webpage), +            'url': self._html_search_meta('VideoURL', webpage, 'url'), +            'timestamp': timestamp, +        } + + +class EllenTVClipsIE(InfoExtractor): +    IE_NAME = 'EllenTV:clips' +    _VALID_URL = r'https?://(?:www\.)?ellentv\.com/episodes/(?P<id>[a-z0-9_-]+)' +    _TEST = { +        'url': 'http://www.ellentv.com/episodes/meryl-streep-vanessa-hudgens/', +        'info_dict': { +            'id': 'meryl-streep-vanessa-hudgens', +            'title': 'Meryl Streep, Vanessa Hudgens', +        }, +        'playlist_mincount': 9, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        playlist_id = mobj.group('id') + +        webpage = self._download_webpage(url, playlist_id) +        playlist = self._extract_playlist(webpage) + +        return { +            '_type': 'playlist', +            'id': playlist_id, +            'title': self._og_search_title(webpage), +            'entries': self._extract_entries(playlist) +        } + +    def 
_extract_playlist(self, webpage): +        json_string = self._search_regex(r'playerView.addClips\(\[\{(.*?)\}\]\);', webpage, 'json') +        try: +            return json.loads("[{" + json_string + "}]") +        except ValueError as ve: +            raise ExtractorError('Failed to download JSON', cause=ve) + +    def _extract_entries(self, playlist): +        return [self.url_result(item['url'], 'EllenTV') for item in playlist] diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index 272dfe1f6..476fc22b9 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -36,7 +36,7 @@ class EscapistIE(InfoExtractor):              r'<meta name="description" content="([^"]*)"',              webpage, 'description', fatal=False) -        playerUrl = self._og_search_video_url(webpage, name=u'player URL') +        playerUrl = self._og_search_video_url(webpage, name='player URL')          title = self._html_search_regex(              r'<meta name="title" content="([^"]*)"', diff --git a/youtube_dl/extractor/expotv.py b/youtube_dl/extractor/expotv.py new file mode 100644 index 000000000..a38b773e8 --- /dev/null +++ b/youtube_dl/extractor/expotv.py @@ -0,0 +1,73 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    int_or_none, +    unified_strdate, +) + + +class ExpoTVIE(InfoExtractor): +    _VALID_URL = r'https?://www\.expotv\.com/videos/[^?#]*/(?P<id>[0-9]+)($|[?#])' +    _TEST = { +        'url': 'http://www.expotv.com/videos/reviews/1/24/LinneCardscom/17561', +        'md5': '2985e6d7a392b2f7a05e0ca350fe41d0', +        'info_dict': { +            'id': '17561', +            'ext': 'mp4', +            'upload_date': '20060212', +            'title': 'My Favorite Online Scrapbook Store', +            'view_count': int, +            'description': 'You\'ll find most everything you need at this virtual store front.', +            'uploader': 'Anna T.', 
+            'thumbnail': 're:^https?://.*\.jpg$', +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) +        player_key = self._search_regex( +            r'<param name="playerKey" value="([^"]+)"', webpage, 'player key') +        config_url = 'http://client.expotv.com/video/config/%s/%s' % ( +            video_id, player_key) +        config = self._download_json( +            config_url, video_id, +            note='Downloading video configuration') + +        formats = [{ +            'url': fcfg['file'], +            'height': int_or_none(fcfg.get('height')), +            'format_note': fcfg.get('label'), +            'ext': self._search_regex( +                r'filename=.*\.([a-z0-9_A-Z]+)&', fcfg['file'], +                'file extension', default=None), +        } for fcfg in config['sources']] +        self._sort_formats(formats) + +        title = self._og_search_title(webpage) +        description = self._og_search_description(webpage) +        thumbnail = config.get('image') +        view_count = int_or_none(self._search_regex( +            r'<h5>Plays: ([0-9]+)</h5>', webpage, 'view counts')) +        uploader = self._search_regex( +            r'<div class="reviewer">\s*<img alt="([^"]+)"', webpage, 'uploader', +            fatal=False) +        upload_date = unified_strdate(self._search_regex( +            r'<h5>Reviewed on ([0-9/.]+)</h5>', webpage, 'upload date', +            fatal=False)) + +        return { +            'id': video_id, +            'formats': formats, +            'title': title, +            'description': description, +            'view_count': view_count, +            'thumbnail': thumbnail, +            'uploader': uploader, +            'upload_date': upload_date, +        } diff --git a/youtube_dl/extractor/gameone.py b/youtube_dl/extractor/gameone.py index b580f52fb..3022f539d 
100644 --- a/youtube_dl/extractor/gameone.py +++ b/youtube_dl/extractor/gameone.py @@ -88,3 +88,28 @@ class GameOneIE(InfoExtractor):              'age_limit': age_limit,              'timestamp': timestamp,          } + + +class GameOnePlaylistIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?gameone\.de(?:/tv)?/?$' +    IE_NAME = 'gameone:playlist' +    _TEST = { +        'url': 'http://www.gameone.de/tv', +        'info_dict': { +            'title': 'GameOne', +        }, +        'playlist_mincount': 294, +    } + +    def _real_extract(self, url): +        webpage = self._download_webpage('http://www.gameone.de/tv', 'TV') +        max_id = max(map(int, re.findall(r'<a href="/tv/(\d+)"', webpage))) +        entries = [ +            self.url_result('http://www.gameone.de/tv/%d' % video_id, 'GameOne') +            for video_id in range(max_id, 0, -1)] + +        return { +            '_type': 'playlist', +            'title': 'GameOne', +            'entries': entries, +        } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8e915735e..8b11f7f7a 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -15,11 +15,14 @@ from ..utils import (      compat_xml_parse_error,      ExtractorError, +    float_or_none,      HEADRequest, +    orderedSet,      parse_xml,      smuggle_url,      unescapeHTML,      unified_strdate, +    unsmuggle_url,      url_basename,  )  from .brightcove import BrightcoveIE @@ -289,6 +292,58 @@ class GenericIE(InfoExtractor):                  'description': 'Mario\'s life in the fast lane has never looked so good.',              },          }, +        # YouTube embed via <data-embed-url=""> +        { +            'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM', +            'info_dict': { +                'id': 'jpSGZsgga_I', +                'ext': 'mp4', +                'title': 'Asphalt 8: Airborne - Launch Trailer', + 
               'uploader': 'Gameloft', +                'uploader_id': 'gameloft', +                'upload_date': '20130821', +                'description': 'md5:87bd95f13d8be3e7da87a5f2c443106a', +            }, +            'params': { +                'skip_download': True, +            } +        }, +        # Camtasia studio +        { +            'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/', +            'playlist': [{ +                'md5': '0c5e352edabf715d762b0ad4e6d9ee67', +                'info_dict': { +                    'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', +                    'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1', +                    'ext': 'flv', +                    'duration': 2235.90, +                } +            }, { +                'md5': '10e4bb3aaca9fd630e273ff92d9f3c63', +                'info_dict': { +                    'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP', +                    'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip', +                    'ext': 'flv', +                    'duration': 2235.93, +                } +            }], +            'info_dict': { +                'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', +            } +        }, +        # Flowplayer +        { +            'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html', +            'md5': '9d65602bf31c6e20014319c7d07fba27', +            'info_dict': { +                'id': '5123ea6d5e5a7', +                'ext': 'mp4', +                'age_limit': 18, +                'uploader': 'www.handjobhub.com', +                'title': 'Busty Blonde Siri Tit Fuck While Wank at Handjob Hub', +            } +        }      ]      def report_download_webpage(self, video_id): @@ -301,58 +356,6 @@ class GenericIE(InfoExtractor):          """Report information extraction."""          
self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) -    def _send_head(self, url): -        """Check if it is a redirect, like url shorteners, in case return the new url.""" - -        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler): -            """ -            Subclass the HTTPRedirectHandler to make it use our -            HEADRequest also on the redirected URL -            """ -            def redirect_request(self, req, fp, code, msg, headers, newurl): -                if code in (301, 302, 303, 307): -                    newurl = newurl.replace(' ', '%20') -                    newheaders = dict((k,v) for k,v in req.headers.items() -                                      if k.lower() not in ("content-length", "content-type")) -                    try: -                        # This function was deprecated in python 3.3 and removed in 3.4 -                        origin_req_host = req.get_origin_req_host() -                    except AttributeError: -                        origin_req_host = req.origin_req_host -                    return HEADRequest(newurl, -                                       headers=newheaders, -                                       origin_req_host=origin_req_host, -                                       unverifiable=True) -                else: -                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp) - -        class HTTPMethodFallback(compat_urllib_request.BaseHandler): -            """ -            Fallback to GET if HEAD is not allowed (405 HTTP error) -            """ -            def http_error_405(self, req, fp, code, msg, headers): -                fp.read() -                fp.close() - -                newheaders = dict((k,v) for k,v in req.headers.items() -                                  if k.lower() not in ("content-length", "content-type")) -                return self.parent.open(compat_urllib_request.Request(req.get_full_url(), -   
                                              headers=newheaders, -                                                 origin_req_host=req.get_origin_req_host(), -                                                 unverifiable=True)) - -        # Build our opener -        opener = compat_urllib_request.OpenerDirector() -        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler, -                        HTTPMethodFallback, HEADRedirectHandler, -                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]: -            opener.add_handler(handler()) - -        response = opener.open(HEADRequest(url)) -        if response is None: -            raise ExtractorError('Invalid URL protocol') -        return response -      def _extract_rss(self, url, video_id, doc):          playlist_title = doc.find('./channel/title').text          playlist_desc_el = doc.find('./channel/description') @@ -372,6 +375,43 @@ class GenericIE(InfoExtractor):              'entries': entries,          } +    def _extract_camtasia(self, url, video_id, webpage): +        """ Returns None if no camtasia video can be found. 
""" + +        camtasia_cfg = self._search_regex( +            r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);', +            webpage, 'camtasia configuration file', default=None) +        if camtasia_cfg is None: +            return None + +        title = self._html_search_meta('DC.title', webpage, fatal=True) + +        camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg) +        camtasia_cfg = self._download_xml( +            camtasia_url, video_id, +            note='Downloading camtasia configuration', +            errnote='Failed to download camtasia configuration') +        fileset_node = camtasia_cfg.find('./playlist/array/fileset') + +        entries = [] +        for n in fileset_node.getchildren(): +            url_n = n.find('./uri') +            if url_n is None: +                continue + +            entries.append({ +                'id': os.path.splitext(url_n.text.rpartition('/')[2])[0], +                'title': '%s - %s' % (title, n.tag), +                'url': compat_urlparse.urljoin(url, url_n.text), +                'duration': float_or_none(n.find('./duration').text), +            }) + +        return { +            '_type': 'playlist', +            'entries': entries, +            'title': title, +        } +      def _real_extract(self, url):          if url.startswith('//'):              return { @@ -408,17 +448,31 @@ class GenericIE(InfoExtractor):              else:                  assert ':' in default_search                  return self.url_result(default_search + url) -        video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0] + +        url, smuggled_data = unsmuggle_url(url) +        force_videoid = None +        if smuggled_data and 'force_videoid' in smuggled_data: +            force_videoid = smuggled_data['force_videoid'] +            video_id = force_videoid +        else: +            video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]          self.to_screen('%s: Requesting header' % 
video_id) -        try: -            response = self._send_head(url) +        head_req = HEADRequest(url) +        response = self._request_webpage( +            head_req, video_id, +            note=False, errnote='Could not send HEAD request to %s' % url, +            fatal=False) +        if response is not False:              # Check for redirect              new_url = response.geturl()              if url != new_url:                  self.report_following_redirect(new_url) +                if force_videoid: +                    new_url = smuggle_url( +                        new_url, {'force_videoid': force_videoid})                  return self.url_result(new_url)              # Check for direct link to a video @@ -439,10 +493,6 @@ class GenericIE(InfoExtractor):                      'upload_date': upload_date,                  } -        except compat_urllib_error.HTTPError: -            # This may be a stupid server that doesn't like HEAD, our UA, or so -            pass -          try:              webpage = self._download_webpage(url, video_id)          except ValueError: @@ -460,6 +510,11 @@ class GenericIE(InfoExtractor):          except compat_xml_parse_error:              pass +        # Is it a Camtasia project? +        camtasia_res = self._extract_camtasia(url, video_id, webpage) +        if camtasia_res is not None: +            return camtasia_res +          # Sometimes embedded video player is hidden behind percent encoding          # (e.g. https://github.com/rg3/youtube-dl/issues/2448)          # Unescaping the whole page allows to handle those cases in a generic way @@ -475,10 +530,26 @@ class GenericIE(InfoExtractor):              r'(?s)<title>(.*?)</title>', webpage, 'video title',              default='video') +        # Try to detect age limit automatically +        age_limit = self._rta_search(webpage) +        # And then there are the jokers who advertise that they use RTA, +        # but actually don't. 
+        AGE_LIMIT_MARKERS = [ +            r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>', +        ] +        if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS): +            age_limit = 18 +          # video uploader is domain name          video_uploader = self._search_regex(              r'^(?:https?://)?([^/]*)/.*', url, 'video uploader') +        # Helper method +        def _playlist_from_matches(matches, getter, ie=None): +            urlrs = orderedSet(self.url_result(getter(m), ie) for m in matches) +            return self.playlist_result( +                urlrs, playlist_id=video_id, playlist_title=video_title) +          # Look for BrightCove:          bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)          if bc_urls: @@ -514,6 +585,7 @@ class GenericIE(InfoExtractor):          matches = re.findall(r'''(?x)              (?:                  <iframe[^>]+?src=| +                data-video-url=|                  <embed[^>]+?src=|                  embedSWF\(?:\s*              ) @@ -522,19 +594,15 @@ class GenericIE(InfoExtractor):                  (?:embed|v)/.+?)              
\1''', webpage)          if matches: -            urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube') -                     for tuppl in matches] -            return self.playlist_result( -                urlrs, playlist_id=video_id, playlist_title=video_title) +            return _playlist_from_matches( +                matches, lambda m: unescapeHTML(m[1]), ie='Youtube')          # Look for embedded Dailymotion player          matches = re.findall(              r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)          if matches: -            urlrs = [self.url_result(unescapeHTML(tuppl[1])) -                     for tuppl in matches] -            return self.playlist_result( -                urlrs, playlist_id=video_id, playlist_title=video_title) +            return _playlist_from_matches( +                matches, lambda m: unescapeHTML(m[1]))          # Look for embedded Wistia player          match = re.search( @@ -553,7 +621,7 @@ class GenericIE(InfoExtractor):          mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)          if mobj:              return self.url_result('http://blip.tv/a/a-'+mobj.group(1), 'BlipTV') -        mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9]+)', webpage) +        mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)          if mobj:              return self.url_result(mobj.group(1), 'BlipTV') @@ -648,10 +716,8 @@ class GenericIE(InfoExtractor):          # Look for funnyordie embed          matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)          if matches: -            urlrs = [self.url_result(unescapeHTML(eurl), 'FunnyOrDie') -                     for eurl in matches] -            return self.playlist_result( -                urlrs, 
playlist_id=video_id, playlist_title=video_title) +            return _playlist_from_matches( +                matches, getter=unescapeHTML, ie='FunnyOrDie')          # Look for embedded RUTV player          rutv_url = RUTVIE._extract_url(webpage) @@ -713,6 +779,13 @@ class GenericIE(InfoExtractor):          if mobj is not None:              return self.url_result(mobj.group('url'), 'Yahoo') +        # Look for embedded sbs.com.au player +        mobj = re.search( +            r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:www\.)sbs\.com\.au/ondemand/video/single/.+?)\1', +            webpage) +        if mobj is not None: +            return self.url_result(mobj.group('url'), 'SBS') +          # Start with something easy: JW Player in SWFObject          found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)          if not found: @@ -731,6 +804,15 @@ class GenericIE(InfoExtractor):              # Broaden the findall a little bit: JWPlayer JS loader              found = re.findall(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)          if not found: +            # Flow player +            found = re.findall(r'''(?xs) +                flowplayer\("[^"]+",\s* +                    \{[^}]+?\}\s*, +                    \s*{[^}]+? 
["']?clip["']?\s*:\s*\{\s* +                        ["']?url["']?\s*:\s*["']([^"']+)["'] +            ''', webpage) +            assert found +        if not found:              # Try to find twitter cards info              found = re.findall(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)          if not found: @@ -739,7 +821,12 @@ class GenericIE(InfoExtractor):              m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)              # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:              if m_video_type is not None: -                found = re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage) +                def check_video(vurl): +                    vpath = compat_urlparse.urlparse(vurl).path +                    return '.' in vpath and not vpath.endswith('.swf') +                found = list(filter( +                    check_video, +                    re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)))          if not found:              # HTML5 video              found = re.findall(r'(?s)<video[^<]*(?:>.*?<source.*?)? 
src="([^"]+)"', webpage) @@ -776,6 +863,7 @@ class GenericIE(InfoExtractor):                  'url': video_url,                  'uploader': video_uploader,                  'title': video_title, +                'age_limit': age_limit,              })          if len(entries) == 1: diff --git a/youtube_dl/extractor/grooveshark.py b/youtube_dl/extractor/grooveshark.py new file mode 100644 index 000000000..726adff77 --- /dev/null +++ b/youtube_dl/extractor/grooveshark.py @@ -0,0 +1,190 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import time +import math +import os.path +import re + + +from .common import InfoExtractor +from ..utils import ExtractorError, compat_urllib_request, compat_html_parser + +from ..utils import ( +    compat_urllib_parse, +    compat_urlparse, +) + + +class GroovesharkHtmlParser(compat_html_parser.HTMLParser): +    def __init__(self): +        self._current_object = None +        self.objects = [] +        compat_html_parser.HTMLParser.__init__(self) + +    def handle_starttag(self, tag, attrs): +        attrs = dict((k, v) for k, v in attrs) +        if tag == 'object': +            self._current_object = {'attrs': attrs, 'params': []} +        elif tag == 'param': +            self._current_object['params'].append(attrs) + +    def handle_endtag(self, tag): +        if tag == 'object': +            self.objects.append(self._current_object) +            self._current_object = None + +    @classmethod +    def extract_object_tags(cls, html): +        p = cls() +        p.feed(html) +        p.close() +        return p.objects + + +class GroovesharkIE(InfoExtractor): +    _VALID_URL = r'https?://(www\.)?grooveshark\.com/#!/s/([^/]+)/([^/]+)' +    _TEST = { +        'url': 'http://grooveshark.com/#!/s/Jolene+Tenth+Key+Remix+Ft+Will+Sessions/6SS1DW?src=5', +        'md5': '7ecf8aefa59d6b2098517e1baa530023', +        'info_dict': { +            'id': '6SS1DW', +            'title': 'Jolene (Tenth Key Remix ft. 
Will Sessions)', +            'ext': 'mp3', +            'duration': 227, +        } +    } + +    do_playerpage_request = True +    do_bootstrap_request = True + +    def _parse_target(self, target): +        uri = compat_urlparse.urlparse(target) +        hash = uri.fragment[1:].split('?')[0] +        token = os.path.basename(hash.rstrip('/')) +        return (uri, hash, token) + +    def _build_bootstrap_url(self, target): +        (uri, hash, token) = self._parse_target(target) +        query = 'getCommunicationToken=1&hash=%s&%d' % (compat_urllib_parse.quote(hash, safe=''), self.ts) +        return (compat_urlparse.urlunparse((uri.scheme, uri.netloc, '/preload.php', None, query, None)), token) + +    def _build_meta_url(self, target): +        (uri, hash, token) = self._parse_target(target) +        query = 'hash=%s&%d' % (compat_urllib_parse.quote(hash, safe=''), self.ts) +        return (compat_urlparse.urlunparse((uri.scheme, uri.netloc, '/preload.php', None, query, None)), token) + +    def _build_stream_url(self, meta): +        return compat_urlparse.urlunparse(('http', meta['streamKey']['ip'], '/stream.php', None, None, None)) + +    def _build_swf_referer(self, target, obj): +        (uri, _, _) = self._parse_target(target) +        return compat_urlparse.urlunparse((uri.scheme, uri.netloc, obj['attrs']['data'], None, None, None)) + +    def _transform_bootstrap(self, js): +        return re.split('(?m)^\s*try\s*{', js)[0] \ +                 .split(' = ', 1)[1].strip().rstrip(';') + +    def _transform_meta(self, js): +        return js.split('\n')[0].split('=')[1].rstrip(';') + +    def _get_meta(self, target): +        (meta_url, token) = self._build_meta_url(target) +        self.to_screen('Metadata URL: %s' % meta_url) + +        headers = {'Referer': compat_urlparse.urldefrag(target)[0]} +        req = compat_urllib_request.Request(meta_url, headers=headers) +        res = self._download_json(req, token, +                                  
transform_source=self._transform_meta) + +        if 'getStreamKeyWithSong' not in res: +            raise ExtractorError( +                'Metadata not found. URL may be malformed, or Grooveshark API may have changed.') + +        if res['getStreamKeyWithSong'] is None: +            raise ExtractorError( +                'Metadata download failed, probably due to Grooveshark anti-abuse throttling. Wait at least an hour before retrying from this IP.', +                expected=True) + +        return res['getStreamKeyWithSong'] + +    def _get_bootstrap(self, target): +        (bootstrap_url, token) = self._build_bootstrap_url(target) + +        headers = {'Referer': compat_urlparse.urldefrag(target)[0]} +        req = compat_urllib_request.Request(bootstrap_url, headers=headers) +        res = self._download_json(req, token, fatal=False, +                                  note='Downloading player bootstrap data', +                                  errnote='Unable to download player bootstrap data', +                                  transform_source=self._transform_bootstrap) +        return res + +    def _get_playerpage(self, target): +        (_, _, token) = self._parse_target(target) + +        webpage = self._download_webpage( +            target, token, +            note='Downloading player page', +            errnote='Unable to download player page', +            fatal=False) + +        if webpage is not None: +            # Search (for example German) error message +            error_msg = self._html_search_regex( +                r'<div id="content">\s*<h2>(.*?)</h2>', webpage, +                'error message', default=None) +            if error_msg is not None: +                error_msg = error_msg.replace('\n', ' ') +                raise ExtractorError('Grooveshark said: %s' % error_msg) + +        if webpage is not None: +            o = GroovesharkHtmlParser.extract_object_tags(webpage) +            return (webpage, [x for x in o if 
x['attrs']['id'] == 'jsPlayerEmbed']) + +        return (webpage, None) + +    def _real_initialize(self): +        self.ts = int(time.time() * 1000)  # timestamp in millis + +    def _real_extract(self, url): +        (target_uri, _, token) = self._parse_target(url) + +        # 1. Fill cookiejar by making a request to the player page +        swf_referer = None +        if self.do_playerpage_request: +            (_, player_objs) = self._get_playerpage(url) +            if player_objs is not None: +                swf_referer = self._build_swf_referer(url, player_objs[0]) +                self.to_screen('SWF Referer: %s' % swf_referer) + +        # 2. Ask preload.php for swf bootstrap data to better mimic webapp +        if self.do_bootstrap_request: +            bootstrap = self._get_bootstrap(url) +            self.to_screen('CommunicationToken: %s' % bootstrap['getCommunicationToken']) + +        # 3. Ask preload.php for track metadata. +        meta = self._get_meta(url) + +        # 4. Construct stream request for track. 
+        stream_url = self._build_stream_url(meta) +        duration = int(math.ceil(float(meta['streamKey']['uSecs']) / 1000000)) +        post_dict = {'streamKey': meta['streamKey']['streamKey']} +        post_data = compat_urllib_parse.urlencode(post_dict).encode('utf-8') +        headers = { +            'Content-Length': len(post_data), +            'Content-Type': 'application/x-www-form-urlencoded' +        } +        if swf_referer is not None: +            headers['Referer'] = swf_referer + +        return { +            'id': token, +            'title': meta['song']['Name'], +            'http_method': 'POST', +            'url': stream_url, +            'ext': 'mp3', +            'format': 'mp3 audio', +            'duration': duration, +            'http_post_data': post_data, +            'http_headers': headers, +        } diff --git a/youtube_dl/extractor/jove.py b/youtube_dl/extractor/jove.py new file mode 100644 index 000000000..cf73cd753 --- /dev/null +++ b/youtube_dl/extractor/jove.py @@ -0,0 +1,80 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    unified_strdate +) + + +class JoveIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?jove\.com/video/(?P<id>[0-9]+)' +    _CHAPTERS_URL = 'http://www.jove.com/video-chapters?videoid={video_id:}' +    _TESTS = [ +        { +            'url': 'http://www.jove.com/video/2744/electrode-positioning-montage-transcranial-direct-current', +            'md5': '93723888d82dbd6ba8b3d7d0cd65dd2b', +            'info_dict': { +                'id': '2744', +                'ext': 'mp4', +                'title': 'Electrode Positioning and Montage in Transcranial Direct Current Stimulation', +                'description': 'md5:015dd4509649c0908bc27f049e0262c6', +                'thumbnail': 're:^https?://.*\.png$', +                'upload_date': '20110523', +            } +        }, +        { +            'url': 
'http://www.jove.com/video/51796/culturing-caenorhabditis-elegans-axenic-liquid-media-creation', +            'md5': '914aeb356f416811d911996434811beb', +            'info_dict': { +                'id': '51796', +                'ext': 'mp4', +                'title': 'Culturing Caenorhabditis elegans in Axenic Liquid Media and Creation of Transgenic Worms by Microparticle Bombardment', +                'description': 'md5:35ff029261900583970c4023b70f1dc9', +                'thumbnail': 're:^https?://.*\.png$', +                'upload_date': '20140802', +            } +        }, + +    ] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) + +        chapters_id = self._html_search_regex( +            r'/video-chapters\?videoid=([0-9]+)', webpage, 'chapters id') + +        chapters_xml = self._download_xml( +            self._CHAPTERS_URL.format(video_id=chapters_id), +            video_id, note='Downloading chapters XML', +            errnote='Failed to download chapters XML') + +        video_url = chapters_xml.attrib.get('video') +        if not video_url: +            raise ExtractorError('Failed to get the video URL') + +        title = self._html_search_meta('citation_title', webpage, 'title') +        thumbnail = self._og_search_thumbnail(webpage) +        description = self._html_search_regex( +            r'<div id="section_body_summary"><p class="jove_content">(.+?)</p>', +            webpage, 'description', fatal=False) +        publish_date = unified_strdate(self._html_search_meta( +            'citation_publication_date', webpage, 'publish date', fatal=False)) +        comment_count = self._html_search_regex( +            r'<meta name="num_comments" content="(\d+) Comments?"', +            webpage, 'comment count', fatal=False) + +        return { +            'id': video_id, +            'title': title, +            'url': 
video_url, +            'thumbnail': thumbnail, +            'description': description, +            'upload_date': publish_date, +            'comment_count': comment_count, +        } diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index 6436c05a3..1a896b536 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -9,6 +9,7 @@ from ..utils import (      compat_urllib_request,      determine_ext,      ExtractorError, +    int_or_none,  ) @@ -83,6 +84,21 @@ class MetacafeIE(InfoExtractor):                  'skip_download': True,              },          }, +        # Movieclips.com video +        { +            'url': 'http://www.metacafe.com/watch/mv-Wy7ZU/my_week_with_marilyn_do_you_love_me/', +            'info_dict': { +                'id': 'mv-Wy7ZU', +                'ext': 'mp4', +                'title': 'My Week with Marilyn - Do You Love Me?', +                'description': 'From the movie My Week with Marilyn - Colin (Eddie Redmayne) professes his love to Marilyn (Michelle Williams) and gets her to promise to return to set and finish the movie.', +                'uploader': 'movie_trailers', +                'duration': 176, +            }, +            'params': { +                'skip_download': 'requires rtmpdump', +            } +        }      ]      def report_disclaimer(self): @@ -134,6 +150,7 @@ class MetacafeIE(InfoExtractor):          # Extract URL, uploader and title from webpage          self.report_extraction(video_id) +        video_url = None          mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)          if mobj is not None:              mediaURL = compat_urllib_parse.unquote(mobj.group(1)) @@ -146,16 +163,17 @@ class MetacafeIE(InfoExtractor):              else:                  gdaKey = mobj.group(1)                  video_url = '%s?__gda__=%s' % (mediaURL, gdaKey) -        else: +        if video_url is None:              mobj = re.search(r'<video 
src="([^"]+)"', webpage)              if mobj:                  video_url = mobj.group(1)                  video_ext = 'mp4' -            else: -                mobj = re.search(r' name="flashvars" value="(.*?)"', webpage) -                if mobj is None: -                    raise ExtractorError('Unable to extract media URL') -                vardict = compat_parse_qs(mobj.group(1)) +        if video_url is None: +            flashvars = self._search_regex( +                r' name="flashvars" value="(.*?)"', webpage, 'flashvars', +                default=None) +            if flashvars: +                vardict = compat_parse_qs(flashvars)                  if 'mediaData' not in vardict:                      raise ExtractorError('Unable to extract media URL')                  mobj = re.search( @@ -165,26 +183,68 @@ class MetacafeIE(InfoExtractor):                  mediaURL = mobj.group('mediaURL').replace('\\/', '/')                  video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))                  video_ext = determine_ext(video_url) - -        video_title = self._html_search_regex(r'(?im)<title>(.*) - Video</title>', webpage, 'title') +        if video_url is None: +            player_url = self._search_regex( +                r"swfobject\.embedSWF\('([^']+)'", +                webpage, 'config URL', default=None) +            if player_url: +                config_url = self._search_regex( +                    r'config=(.+)$', player_url, 'config URL') +                config_doc = self._download_xml( +                    config_url, video_id, +                    note='Downloading video config') +                smil_url = config_doc.find('.//properties').attrib['smil_file'] +                smil_doc = self._download_xml( +                    smil_url, video_id, +                    note='Downloading SMIL document') +                base_url = smil_doc.find('./head/meta').attrib['base'] +                video_url = [] +                for vn in 
smil_doc.findall('.//video'): +                    br = int(vn.attrib['system-bitrate']) +                    play_path = vn.attrib['src'] +                    video_url.append({ +                        'format_id': 'smil-%d' % br, +                        'url': base_url, +                        'play_path': play_path, +                        'page_url': url, +                        'player_url': player_url, +                        'ext': play_path.partition(':')[0], +                    }) + +        if video_url is None: +            raise ExtractorError('Unsupported video type') + +        video_title = self._html_search_regex( +            r'(?im)<title>(.*) - Video</title>', webpage, 'title')          description = self._og_search_description(webpage)          thumbnail = self._og_search_thumbnail(webpage)          video_uploader = self._html_search_regex(                  r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',                  webpage, 'uploader nickname', fatal=False) +        duration = int_or_none( +            self._html_search_meta('video:duration', webpage)) + +        age_limit = ( +            18 +            if re.search(r'"contentRating":"restricted"', webpage) +            else 0) -        if re.search(r'"contentRating":"restricted"', webpage) is not None: -            age_limit = 18 +        if isinstance(video_url, list): +            formats = video_url          else: -            age_limit = 0 +            formats = [{ +                'url': video_url, +                'ext': video_ext, +            }] +        self._sort_formats(formats)          return {              'id': video_id, -            'url': video_url,              'description': description,              'uploader': video_uploader,              'title': video_title, -            'thumbnail':thumbnail, -            'ext': video_ext, +            'thumbnail': thumbnail,              'age_limit': age_limit, +            
'formats': formats, +            'duration': duration,          } diff --git a/youtube_dl/extractor/ministrygrid.py b/youtube_dl/extractor/ministrygrid.py new file mode 100644 index 000000000..949ad11db --- /dev/null +++ b/youtube_dl/extractor/ministrygrid.py @@ -0,0 +1,57 @@ +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    smuggle_url, +) + + +class MinistryGridIE(InfoExtractor): +    _VALID_URL = r'https?://www\.ministrygrid.com/([^/?#]*/)*(?P<id>[^/#?]+)/?(?:$|[?#])' + +    _TEST = { +        'url': 'http://www.ministrygrid.com/training-viewer/-/training/t4g-2014-conference/the-gospel-by-numbers-4/the-gospel-by-numbers', +        'md5': '844be0d2a1340422759c2a9101bab017', +        'info_dict': { +            'id': '3453494717001', +            'ext': 'mp4', +            'title': 'The Gospel by Numbers', +            'description': 'Coming soon from T4G 2014!', +            'uploader': 'LifeWay Christian Resources (MG)', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) +        portlets_json = self._search_regex( +            r'Liferay\.Portlet\.list=(\[.+?\])', webpage, 'portlet list') +        portlets = json.loads(portlets_json) +        pl_id = self._search_regex( +            r'<!--\s*p_l_id - ([0-9]+)<br>', webpage, 'p_l_id') + +        for i, portlet in enumerate(portlets): +            portlet_url = 'http://www.ministrygrid.com/c/portal/render_portlet?p_l_id=%s&p_p_id=%s' % (pl_id, portlet) +            portlet_code = self._download_webpage( +                portlet_url, video_id, +                note='Looking in portlet %s (%d/%d)' % (portlet, i + 1, len(portlets)), +                fatal=False) +            video_iframe_url = self._search_regex( +                r'<iframe.*?src="([^"]+)"', portlet_code, 
'video iframe', +                default=None) +            if video_iframe_url: +                surl = smuggle_url( +                    video_iframe_url, {'force_videoid': video_id}) +                return { +                    '_type': 'url', +                    'id': video_id, +                    'url': surl, +                } + +        raise ExtractorError('Could not find video iframe in any portlets') diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py new file mode 100644 index 000000000..979f3d692 --- /dev/null +++ b/youtube_dl/extractor/mitele.py @@ -0,0 +1,60 @@ +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..utils import ( +    compat_urllib_parse, +    get_element_by_attribute, +    parse_duration, +    strip_jsonp, +) + + +class MiTeleIE(InfoExtractor): +    IE_NAME = 'mitele.es' +    _VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<episode>[^/]+)/' + +    _TEST = { +        'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', +        'md5': '6a75fe9d0d3275bead0cb683c616fddb', +        'info_dict': { +            'id': '0fce117d', +            'ext': 'mp4', +            'title': 'Programa 144 - Tor, la web invisible', +            'description': 'md5:3b6fce7eaa41b2d97358726378d9369f', +            'display_id': 'programa-144', +            'duration': 2913, +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        episode = mobj.group('episode') +        webpage = self._download_webpage(url, episode) +        embed_data_json = self._search_regex( +            r'MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data', +            flags=re.DOTALL +        ).replace('\'', '"') +        embed_data = json.loads(embed_data_json) + +        info_url = embed_data['flashvars']['host'] +        info_el = self._download_xml(info_url, episode).find('./video/info') + +     
   video_link = info_el.find('videoUrl/link').text +        token_query = compat_urllib_parse.urlencode({'id': video_link}) +        token_info = self._download_json( +            'http://token.mitele.es/?' + token_query, episode, +            transform_source=strip_jsonp +        ) + +        return { +            'id': embed_data['videoId'], +            'display_id': episode, +            'title': info_el.find('title').text, +            'url': token_info['tokenizedUrl'], +            'description': get_element_by_attribute('class', 'text', webpage), +            'thumbnail': info_el.find('thumb').text, +            'duration': parse_duration(info_el.find('duration').text), +        } diff --git a/youtube_dl/extractor/movieclips.py b/youtube_dl/extractor/movieclips.py new file mode 100644 index 000000000..456807dd1 --- /dev/null +++ b/youtube_dl/extractor/movieclips.py @@ -0,0 +1,78 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    compat_str, +    clean_html, +) + + +class MovieClipsIE(InfoExtractor): +    _VALID_URL = r'https?://movieclips\.com/(?P<id>[\da-zA-Z]+)(?:-(?P<display_id>[\da-z-]+))?' 
+    _TEST = { +        'url': 'http://movieclips.com/Wy7ZU-my-week-with-marilyn-movie-do-you-love-me/', +        'info_dict': { +            'id': 'Wy7ZU', +            'display_id': 'my-week-with-marilyn-movie-do-you-love-me', +            'ext': 'mp4', +            'title': 'My Week with Marilyn - Do You Love Me?', +            'description': 'md5:e86795bd332fe3cff461e7c8dc542acb', +            'thumbnail': 're:^https?://.*\.jpg$', +        }, +        'params': { +            # rtmp download +            'skip_download': True, +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        display_id = mobj.group('display_id') +        show_id = display_id or video_id + +        config = self._download_xml( +            'http://config.movieclips.com/player/config/%s' % video_id, +            show_id, 'Downloading player config') + +        if config.find('./country-region').text == 'false': +            raise ExtractorError( +                '%s said: %s' % (self.IE_NAME, config.find('./region_alert').text), expected=True) + +        properties = config.find('./video/properties') +        smil_file = properties.attrib['smil_file'] + +        smil = self._download_xml(smil_file, show_id, 'Downloading SMIL') +        base_url = smil.find('./head/meta').attrib['base'] + +        formats = [] +        for video in smil.findall('./body/switch/video'): +            vbr = int(video.attrib['system-bitrate']) / 1000 +            src = video.attrib['src'] +            formats.append({ +                'url': base_url, +                'play_path': src, +                'ext': src.split(':')[0], +                'vbr': vbr, +                'format_id': '%dk' % vbr, +            }) + +        self._sort_formats(formats) + +        title = '%s - %s' % (properties.attrib['clip_movie_title'], properties.attrib['clip_title']) +        description = 
clean_html(compat_str(properties.attrib['clip_description'])) +        thumbnail = properties.attrib['image'] +        categories = properties.attrib['clip_categories'].split(',') + +        return { +            'id': video_id, +            'display_id': display_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'categories': categories, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/nuvid.py b/youtube_dl/extractor/nuvid.py index 280328b78..58ec81f91 100644 --- a/youtube_dl/extractor/nuvid.py +++ b/youtube_dl/extractor/nuvid.py @@ -38,7 +38,7 @@ class NuvidIE(InfoExtractor):              webpage = self._download_webpage(                  request, video_id, 'Downloading %s page' % format_id)              video_url = self._html_search_regex( -                r'<a href="([^"]+)"\s*>Continue to watch video', webpage, '%s video URL' % format_id, fatal=False) +                r'<a\s+href="([^"]+)"\s+class="b_link">', webpage, '%s video URL' % format_id, fatal=False)              if not video_url:                  continue              formats.append({ @@ -49,19 +49,24 @@ class NuvidIE(InfoExtractor):          webpage = self._download_webpage(              'http://m.nuvid.com/video/%s' % video_id, video_id, 'Downloading video page')          title = self._html_search_regex( -            r'<div class="title">\s+<h2[^>]*>([^<]+)</h2>', webpage, 'title').strip() -        thumbnail = self._html_search_regex( -            r'href="(/thumbs/[^"]+)"[^>]*data-link_type="thumbs"', -            webpage, 'thumbnail URL', fatal=False) +            [r'<span title="([^"]+)">', +             r'<div class="thumb-holder video">\s*<h5[^>]*>([^<]+)</h5>'], webpage, 'title').strip() +        thumbnails = [ +            { +                'url': thumb_url, +            } for thumb_url in re.findall(r'<img src="([^"]+)" alt="" />', webpage) +        ] +        thumbnail = 
thumbnails[0]['url'] if thumbnails else None          duration = parse_duration(self._html_search_regex( -            r'Length:\s*<span>(\d{2}:\d{2})</span>',webpage, 'duration', fatal=False)) +            r'<i class="fa fa-clock-o"></i>\s*(\d{2}:\d{2})', webpage, 'duration', fatal=False))          upload_date = unified_strdate(self._html_search_regex( -            r'Added:\s*<span>(\d{4}-\d{2}-\d{2})</span>', webpage, 'upload date', fatal=False)) +            r'<i class="fa fa-user"></i>\s*(\d{4}-\d{2}-\d{2})', webpage, 'upload date', fatal=False))          return {              'id': video_id,              'title': title, -            'thumbnail': 'http://m.nuvid.com%s' % thumbnail, +            'thumbnails': thumbnails, +            'thumbnail': thumbnail,              'duration': duration,              'upload_date': upload_date,              'age_limit': 18, diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py new file mode 100644 index 000000000..5429592a7 --- /dev/null +++ b/youtube_dl/extractor/patreon.py @@ -0,0 +1,100 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( +    js_to_json, +) + + +class PatreonIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?patreon\.com/creation\?hid=(.+)' +    _TESTS = [ +        { +            'url': 'http://www.patreon.com/creation?hid=743933', +            'md5': 'e25505eec1053a6e6813b8ed369875cc', +            'info_dict': { +                'id': '743933', +                'ext': 'mp3', +                'title': 'Episode 166: David Smalley of Dogma Debate', +                'uploader': 'Cognitive Dissonance Podcast', +                'thumbnail': 're:^https?://.*$', +            }, +        }, +        { +            'url': 'http://www.patreon.com/creation?hid=754133', +            'md5': '3eb09345bf44bf60451b8b0b81759d0a', +            'info_dict': { +                'id': '754133', +     
           'ext': 'mp3', +                'title': 'CD 167 Extra', +                'uploader': 'Cognitive Dissonance Podcast', +                'thumbnail': 're:^https?://.*$', +            }, +        }, +    ] + +    # Currently Patreon exposes download URL via hidden CSS, so login is not +    # needed. Keeping this commented for when this inevitably changes. +    ''' +    def _login(self): +        (username, password) = self._get_login_info() +        if username is None: +            return + +        login_form = { +            'redirectUrl': 'http://www.patreon.com/', +            'email': username, +            'password': password, +        } + +        request = compat_urllib_request.Request( +            'https://www.patreon.com/processLogin', +            compat_urllib_parse.urlencode(login_form).encode('utf-8') +        ) +        login_page = self._download_webpage(request, None, note='Logging in as %s' % username) + +        if re.search(r'onLoginFailed', login_page): +            raise ExtractorError('Unable to login, incorrect username and/or password', expected=True) + +    def _real_initialize(self): +        self._login() +    ''' + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group(1) + +        webpage = self._download_webpage(url, video_id) +        title = self._og_search_title(webpage).strip() + +        attach_fn = self._html_search_regex( +            r'<div class="attach"><a target="_blank" href="([^"]+)">', +            webpage, 'attachment URL', default=None) +        if attach_fn is not None: +            video_url = 'http://www.patreon.com' + attach_fn +            thumbnail = self._og_search_thumbnail(webpage) +            uploader = self._html_search_regex( +                r'<strong>(.*?)</strong> is creating', webpage, 'uploader') +        else: +            playlist_js = self._search_regex( +                
r'(?s)new\s+jPlayerPlaylist\(\s*\{\s*[^}]*},\s*(\[.*?,?\s*\])', +                webpage, 'playlist JSON') +            playlist_json = js_to_json(playlist_js) +            playlist = json.loads(playlist_json) +            data = playlist[0] +            video_url = self._proto_relative_url(data['mp3']) +            thumbnail = self._proto_relative_url(data.get('cover')) +            uploader = data.get('artist') + +        return { +            'id': video_id, +            'url': video_url, +            'ext': 'mp3', +            'title': title, +            'uploader': uploader, +            'thumbnail': thumbnail, +        } diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index ec95d0704..2adfde909 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -20,17 +20,53 @@ class PBSIE(InfoExtractor):          )      ''' -    _TEST = { -        'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/', -        'md5': 'ce1888486f0908d555a8093cac9a7362', -        'info_dict': { -            'id': '2365006249', -            'ext': 'mp4', -            'title': 'A More Perfect Union', -            'description': 'md5:ba0c207295339c8d6eced00b7c363c6a', -            'duration': 3190, +    _TESTS = [ +        { +            'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/', +            'md5': 'ce1888486f0908d555a8093cac9a7362', +            'info_dict': { +                'id': '2365006249', +                'ext': 'mp4', +                'title': 'A More Perfect Union', +                'description': 'md5:ba0c207295339c8d6eced00b7c363c6a', +                'duration': 3190, +            }, +        }, +        { +            'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/', +            'md5': '143c98aa54a346738a3d78f54c925321', +            'info_dict': { +                'id': '2365297690', +                'ext': 'mp4', +                
'title': 'Losing Iraq', +                'description': 'md5:f5bfbefadf421e8bb8647602011caf8e', +                'duration': 5050, +            }, +        }, +        { +            'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/', +            'md5': 'b19856d7f5351b17a5ab1dc6a64be633', +            'info_dict': { +                'id': '2201174722', +                'ext': 'mp4', +                'title': 'Cyber Schools Gain Popularity, but Quality Questions Persist', +                'description': 'md5:5871c15cba347c1b3d28ac47a73c7c28', +                'duration': 801, +            },          }, -    } +        { +            'url': 'http://www.pbs.org/wnet/gperf/dudamel-conducts-verdi-requiem-hollywood-bowl-full-episode/3374/', +            'md5': 'c62859342be2a0358d6c9eb306595978', +            'info_dict': { +                'id': '2365297708', +                'ext': 'mp4', +                'description': 'md5:68d87ef760660eb564455eb30ca464fe', +                'title': 'Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full', +                'duration': 6559, +                'thumbnail': 're:^https?://.*\.jpg$', +            } +        } +    ]      def _extract_ids(self, url):          mobj = re.match(self._VALID_URL, url) @@ -40,15 +76,18 @@ class PBSIE(InfoExtractor):          if presumptive_id:              webpage = self._download_webpage(url, display_id) -            # frontline video embed +            MEDIA_ID_REGEXES = [ +                r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'",  # frontline video embed +                r'class="coveplayerid">([^<]+)<',                       # coveplayer +            ] +              media_id = self._search_regex( -                r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", -                webpage, 'frontline video ID', fatal=False, default=None) +                MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None)              if 
media_id:                  return media_id, presumptive_id              url = self._search_regex( -                r'<iframe\s+id=["\']partnerPlayer["\'].*?\s+src=["\'](.*?)["\']>', +                r'<iframe\s+(?:class|id)=["\']partnerPlayer["\'].*?\s+src=["\'](.*?)["\']>',                  webpage, 'player URL')              mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/playfm.py b/youtube_dl/extractor/playfm.py new file mode 100644 index 000000000..72df4d842 --- /dev/null +++ b/youtube_dl/extractor/playfm.py @@ -0,0 +1,82 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    compat_urllib_parse, +    compat_urllib_request, +    ExtractorError, +    float_or_none, +    int_or_none, +) + + +class PlayFMIE(InfoExtractor): +    IE_NAME = 'play.fm' +    _VALID_URL = r'https?://(?:www\.)?play\.fm/[^?#]*(?P<upload_date>[0-9]{8})(?P<id>[0-9]{6})(?:$|[?#])' + +    _TEST = { +        'url': 'http://www.play.fm/recording/leipzigelectronicmusicbatofarparis_fr20140712137220', +        'md5': 'c505f8307825a245d0c7ad1850001f22', +        'info_dict': { +            'id': '137220', +            'ext': 'mp3', +            'title': 'LEIPZIG ELECTRONIC MUSIC @ Batofar (Paris,FR) - 2014-07-12', +            'uploader': 'Sven Tasnadi', +            'uploader_id': 'sventasnadi', +            'duration': 5627.428, +            'upload_date': '20140712', +            'view_count': int, +            'thumbnail': 're:^https?://.*\.jpg$', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        upload_date = mobj.group('upload_date') + +        rec_data = compat_urllib_parse.urlencode({'rec_id': video_id}) +        req = compat_urllib_request.Request( +            'http://www.play.fm/flexRead/recording', data=rec_data) +        req.add_header('Content-Type', 
'application/x-www-form-urlencoded')
+        rec_doc = self._download_xml(req, video_id)
+
+        error_node = rec_doc.find('./error')
+        if error_node is not None:
+            raise ExtractorError('An error occurred: %s (code %s)' % (
+                error_node.text, rec_doc.find('./status').text))
+
+        recording = rec_doc.find('./recording')
+        title = recording.find('./title').text
+        view_count = int_or_none(recording.find('./stats/playcount').text)
+        duration = float_or_none(recording.find('./duration').text, scale=1000)
+        thumbnail = recording.find('./image').text
+
+        artist = recording.find('./artists/artist')
+        uploader = artist.find('./name').text
+        uploader_id = artist.find('./slug').text
+
+        video_url = '%s//%s/%s/%s/offset/0/sh/%s/rec/%s/jingle/%s/loc/%s' % (
+            'http:', recording.find('./url').text,
+            recording.find('./_class').text, recording.find('./file_id').text,
+            rec_doc.find('./uuid').text, video_id,
+            rec_doc.find('./jingle/file_id').text,
+            'http%3A%2F%2Fwww.play.fm%2Fplayer',
+        )
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'ext': 'mp3',
+            'filesize': int_or_none(recording.find('./size').text),
+            'title': title,
+            'upload_date': upload_date,
+            'view_count': view_count,
+            'duration': duration,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+        }
diff --git a/youtube_dl/extractor/pornotube.py b/youtube_dl/extractor/pornotube.py
index 35dc5a9ff..04bd3d979 100644
--- a/youtube_dl/extractor/pornotube.py
+++ b/youtube_dl/extractor/pornotube.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
@@ -9,15 +11,16 @@ from ..utils import (
 
 
 class PornotubeIE(InfoExtractor):
-    _VALID_URL = 
r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$' +    _VALID_URL = r'https?://(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'      _TEST = { -        u'url': u'http://pornotube.com/c/173/m/1689755/Marilyn-Monroe-Bathing', -        u'file': u'1689755.flv', -        u'md5': u'374dd6dcedd24234453b295209aa69b6', -        u'info_dict': { -            u"upload_date": u"20090708",  -            u"title": u"Marilyn-Monroe-Bathing", -            u"age_limit": 18 +        'url': 'http://pornotube.com/c/173/m/1689755/Marilyn-Monroe-Bathing', +        'md5': '374dd6dcedd24234453b295209aa69b6', +        'info_dict': { +            'id': '1689755', +            'ext': 'flv', +            'upload_date': '20090708', +            'title': 'Marilyn-Monroe-Bathing', +            'age_limit': 18          }      } @@ -32,22 +35,22 @@ class PornotubeIE(InfoExtractor):          # Get the video URL          VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",' -        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url') +        video_url = self._search_regex(VIDEO_URL_RE, webpage, 'video url')          video_url = compat_urllib_parse.unquote(video_url)          #Get the uploaded date          VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by' -        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False) -        if upload_date: upload_date = unified_strdate(upload_date) +        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, 'upload date', fatal=False) +        if upload_date: +            upload_date = unified_strdate(upload_date)          age_limit = self._rta_search(webpage) -        info = {'id': video_id, -                'url': video_url, -                'uploader': None, -                'upload_date': upload_date, -                'title': video_title, 
-                'ext': 'flv', -                'format': 'flv', -                'age_limit': age_limit} - -        return [info] +        return { +            'id': video_id, +            'url': video_url, +            'upload_date': upload_date, +            'title': video_title, +            'ext': 'flv', +            'format': 'flv', +            'age_limit': age_limit, +        } diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py new file mode 100644 index 000000000..190c8f226 --- /dev/null +++ b/youtube_dl/extractor/rtlnl.py @@ -0,0 +1,51 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class RtlXlIE(InfoExtractor): +    IE_NAME = 'rtlxl.nl' +    _VALID_URL = r'https?://www\.rtlxl\.nl/#!/[^/]+/(?P<uuid>[^/?]+)' + +    _TEST = { +        'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677', +        'info_dict': { +            'id': '6e4203a6-0a5e-3596-8424-c599a59e0677', +            'ext': 'flv', +            'title': 'RTL Nieuws - Laat', +            'description': 'Dagelijks het laatste nieuws uit binnen- en ' +                'buitenland. 
Voor nog meer nieuws kunt u ook gebruikmaken van '
+                'onze mobiele apps.',
+            'timestamp': 1408051800,
+            'upload_date': '20140814',
+        },
+        'params': {
+            # We download the first bytes of the first fragment, it can't be
+            # processed by the f4m downloader because it isn't complete
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        uuid = mobj.group('uuid')
+
+        info = self._download_json(
+            'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=flash/' % uuid,
+            uuid)
+        material = info['material'][0]
+        episode_info = info['episodes'][0]
+
+        f4m_url = 'http://manifest.us.rtl.nl' + material['videopath']
+        progname = info['abstracts'][0]['name']
+        subtitle = material['title'] or info['episodes'][0]['name']
+
+        return {
+            'id': uuid,
+            'title': '%s - %s' % (progname, subtitle), 
+            'formats': self._extract_f4m_formats(f4m_url, uuid),
+            'timestamp': material['original_date'],
+            'description': episode_info['synopsis'],
+        }
diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py
index c2228b2f0..4dd35a47b 100644
--- a/youtube_dl/extractor/rtve.py
+++ b/youtube_dl/extractor/rtve.py
@@ -1,21 +1,66 @@
 # encoding: utf-8
 from __future__ import unicode_literals
 
-import re
 import base64
+import re
+import time
 
 from .common import InfoExtractor
 from ..utils import (
     struct_unpack,
+    remove_end,
 )
+
+def _decrypt_url(png):
+    encrypted_data = base64.b64decode(png)
+    text_index = encrypted_data.find(b'tEXt')
+    text_chunk = encrypted_data[text_index - 4:]
+    length = struct_unpack('!I', text_chunk[:4])[0]
+    # Use bytearray to get integers when iterating in both python 2.x and 3.x
+    data = bytearray(text_chunk[8:8 + length])
+    data = [chr(b) 
for b in data if b != 0] +    hash_index = data.index('#') +    alphabet_data = data[:hash_index] +    url_data = data[hash_index + 1:] + +    alphabet = [] +    e = 0 +    d = 0 +    for l in alphabet_data: +        if d == 0: +            alphabet.append(l) +            d = e = (e + 1) % 4 +        else: +            d -= 1 +    url = '' +    f = 0 +    e = 3 +    b = 1 +    for letter in url_data: +        if f == 0: +            l = int(letter) * 10 +            f = 1 +        else: +            if e == 0: +                l += int(letter) +                url += alphabet[l] +                e = (b + 3) % 4 +                f = 0 +                b += 1 +            else: +                e -= 1 + +    return url + + +  class RTVEALaCartaIE(InfoExtractor):      IE_NAME = 'rtve.es:alacarta'      IE_DESC = 'RTVE a la carta'      _VALID_URL = r'http://www\.rtve\.es/alacarta/videos/[^/]+/[^/]+/(?P<id>\d+)' -    _TEST = { +    _TESTS = [{          'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/',          'md5': '1d49b7e1ca7a7502c56a4bf1b60f1b43',          'info_dict': { @@ -23,48 +68,15 @@ class RTVEALaCartaIE(InfoExtractor):              'ext': 'mp4',              'title': 'Balonmano - Swiss Cup masculina. 
Final: España-Suecia',          }, -    } - -    def _decrypt_url(self, png): -        encrypted_data = base64.b64decode(png) -        text_index = encrypted_data.find(b'tEXt') -        text_chunk = encrypted_data[text_index-4:] -        length = struct_unpack('!I', text_chunk[:4])[0] -        # Use bytearray to get integers when iterating in both python 2.x and 3.x -        data = bytearray(text_chunk[8:8+length]) -        data = [chr(b) for b in data if b != 0] -        hash_index = data.index('#') -        alphabet_data = data[:hash_index] -        url_data = data[hash_index+1:] - -        alphabet = [] -        e = 0 -        d = 0 -        for l in alphabet_data: -            if d == 0: -                alphabet.append(l) -                d = e = (e + 1) % 4 -            else: -                d -= 1 -        url = '' -        f = 0 -        e = 3 -        b = 1 -        for letter in url_data: -            if f == 0: -                l = int(letter)*10 -                f = 1 -            else: -                if e == 0: -                    l += int(letter) -                    url += alphabet[l] -                    e = (b + 3) % 4 -                    f = 0 -                    b += 1 -                else: -                    e -= 1 - -        return url +    }, { +        'note': 'Live stream', +        'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/', +        'info_dict': { +            'id': '1694255', +            'ext': 'flv', +            'title': 'TODO', +        } +    }]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) @@ -74,11 +86,57 @@ class RTVEALaCartaIE(InfoExtractor):              video_id)['page']['items'][0]          png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % video_id          png = self._download_webpage(png_url, video_id, 'Downloading url information') -        video_url = self._decrypt_url(png) +        video_url = _decrypt_url(png)          
return {              'id': video_id,              'title': info['title'],              'url': video_url, -            'thumbnail': info['image'], +            'thumbnail': info.get('image'), +            'page_url': url, +        } + + +class RTVELiveIE(InfoExtractor): +    IE_NAME = 'rtve.es:live' +    IE_DESC = 'RTVE.es live streams' +    _VALID_URL = r'http://www\.rtve\.es/(?:deportes/directo|noticias|television)/(?P<id>[a-zA-Z0-9-]+)' + +    _TESTS = [{ +        'url': 'http://www.rtve.es/noticias/directo-la-1/', +        'info_dict': { +            'id': 'directo-la-1', +            'ext': 'flv', +            'title': 're:^La 1 de TVE [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$', +        }, +        'params': { +            'skip_download': 'live stream', +        } +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        start_time = time.gmtime() +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) +        player_url = self._search_regex( +            r'<param name="movie" value="([^"]+)"/>', webpage, 'player URL') +        title = remove_end(self._og_search_title(webpage), ' en directo') +        title += ' ' + time.strftime('%Y-%m-%dZ%H%M%S', start_time) + +        vidplayer_id = self._search_regex( +            r' id="vidplayer([0-9]+)"', webpage, 'internal video ID') +        png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % vidplayer_id +        png = self._download_webpage(png_url, video_id, 'Downloading url information') +        video_url = _decrypt_url(png) + +        return { +            'id': video_id, +            'ext': 'flv', +            'title': title, +            'url': video_url, +            'app': 'rtve-live-live?ovpfv=2.1.2', +            'player_url': player_url, +            'rtmp_live': True,          } diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py new file mode 100644 index 000000000..34058fd4b --- 
/dev/null +++ b/youtube_dl/extractor/sbs.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import json +import re +from .common import InfoExtractor +from ..utils import ( +    js_to_json, +    remove_end, +) + + +class SBSIE(InfoExtractor): +    IE_DESC = 'sbs.com.au' +    _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/ondemand/video/single/(?P<id>[0-9]+)/' + +    _TESTS = [{ +        # Original URL is handled by the generic IE which finds the iframe: +        # http://www.sbs.com.au/thefeed/blog/2014/08/21/dingo-conservation +        'url': 'http://www.sbs.com.au/ondemand/video/single/320403011771/?source=drupal&vertical=thefeed', +        'md5': '3150cf278965eeabb5b4cea1c963fe0a', +        'info_dict': { +            'id': '320403011771', +            'ext': 'flv', +            'title': 'Dingo Conservation', +            'description': 'Dingoes are on the brink of extinction; most of the animals we think are dingoes are in fact crossbred with wild dogs. 
This family run a dingo conservation park to prevent their extinction', +            'thumbnail': 're:http://.*\.jpg', +        }, +        'add_ies': ['generic'], +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        webpage = self._download_webpage(url, video_id) + +        release_urls_json = js_to_json(self._search_regex( +            r'(?s)playerParams\.releaseUrls\s*=\s*(\{.*?\n\});\n', +            webpage, '')) +        release_urls = json.loads(release_urls_json) +        theplatform_url = ( +            release_urls.get('progressive') or release_urls.get('standard')) + +        title = remove_end(self._og_search_title(webpage), ' (The Feed)') +        description = self._html_search_meta('description', webpage) +        thumbnail = self._og_search_thumbnail(webpage) + +        return { +            '_type': 'url_transparent', +            'id': video_id, +            'url': theplatform_url, + +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +        } diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index f8dd7e955..fa796ce72 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -37,7 +37,7 @@ class TeamcocoIE(InfoExtractor):          video_id = mobj.group("video_id")          if not video_id:              video_id = self._html_search_regex( -                r'<article class="video" data-id="(\d+?)"', +                r'data-node-id="(\d+?)"',                  webpage, 'video id')          data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id diff --git a/youtube_dl/extractor/vodlocker.py b/youtube_dl/extractor/vodlocker.py index 6d3b78749..affef6507 100644 --- a/youtube_dl/extractor/vodlocker.py +++ b/youtube_dl/extractor/vodlocker.py @@ -44,7 +44,7 @@ class VodlockerIE(InfoExtractor):                  req, video_id, 'Downloading video page')    
      title = self._search_regex( -            r'id="file_title".*?>\s*(.*?)\s*<span', webpage, 'title') +            r'id="file_title".*?>\s*(.*?)\s*<(?:br|span)', webpage, 'title')          thumbnail = self._search_regex(              r'image:\s*"(http[^\"]+)",', webpage, 'thumbnail')          url = self._search_regex( diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index a584e0896..1f330378a 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -2,27 +2,30 @@  from __future__ import unicode_literals  import re +import time +import hashlib  from .common import InfoExtractor  from ..utils import ( +    ExtractorError,      unified_strdate,  )  class WatIE(InfoExtractor): -    _VALID_URL = r'http://www\.wat\.tv/.*-(?P<shortID>.*?)_.*?\.html' +    _VALID_URL = r'http://www\.wat\.tv/video/(?P<display_id>.*)-(?P<short_id>.*?)_.*?\.html'      IE_NAME = 'wat.tv'      _TEST = { -        'url': 'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html', +        'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html', +        'md5': 'ce70e9223945ed26a8056d413ca55dc9',          'info_dict': { -            'id': '10631273', +            'id': '11713067', +            'display_id': 'soupe-figues-l-orange-aux-epices',              'ext': 'mp4', -            'title': 'World War Z - Philadelphia VOST', -            'description': 'La menace est partout. 
Que se passe-t-il à Philadelphia ?\r\nWORLD WAR Z, avec Brad Pitt, au cinéma le 3 juillet.\r\nhttp://www.worldwarz.fr', -        }, -        'params': { -            # Sometimes wat serves the whole file with the --test option -            'skip_download': True, +            'title': 'Soupe de figues à l\'orange et aux épices', +            'description': 'Retrouvez l\'émission "Petits plats en équilibre", diffusée le 18 août 2014.', +            'upload_date': '20140819', +            'duration': 120,          },      } @@ -36,13 +39,20 @@ class WatIE(InfoExtractor):          def real_id_for_chapter(chapter):              return chapter['tc_start'].split('-')[0]          mobj = re.match(self._VALID_URL, url) -        short_id = mobj.group('shortID') -        webpage = self._download_webpage(url, short_id) +        short_id = mobj.group('short_id') +        display_id = mobj.group('display_id') +        webpage = self._download_webpage(url, display_id or short_id)          real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id')          video_info = self.download_video_info(real_id) + +        if video_info.get('geolock'): +            raise ExtractorError('This content is not available in your area', expected=True) +          chapters = video_info['chapters']          first_chapter = chapters[0] +        files = video_info['files'] +        first_file = files[0]          if real_id_for_chapter(first_chapter) != real_id:              self.to_screen('Multipart video detected') @@ -61,12 +71,45 @@ class WatIE(InfoExtractor):              upload_date = unified_strdate(first_chapter['date_diffusion'])          # Otherwise we can continue and extract just one part, we have to use          # the short id for getting the video url + +        formats = [{ +            'url': 'http://wat.tv/get/android5/%s.mp4' % real_id, +            'format_id': 'Mobile', +        }] + +        fmts = [('SD', 'web')] +        if first_file.get('hasHD'): +            
fmts.append(('HD', 'webhd'))
+
+        def compute_token(param):
+            timestamp = '%08x' % int(time.time())
+            magic = '9b673b13fa4682ed14c3cfa5af5310274b514c4133e9b3a81e6e3aba009l2564'
+            return '%s/%s' % (hashlib.md5((magic + param + timestamp).encode('ascii')).hexdigest(), timestamp)
+
+        for fmt in fmts:
+            webid = '/%s/%s' % (fmt[1], real_id)
+            video_url = self._download_webpage(
+                'http://www.wat.tv/get%s?token=%s&getURL=1' % (webid, compute_token(webid)),
+                real_id,
+                'Downloading %s video URL' % fmt[0],
+                'Failed to download %s video URL' % fmt[0],
+                False)
+            if not video_url:
+                continue
+            formats.append({
+                'url': video_url,
+                'ext': 'mp4',
+                'format_id': fmt[0],
+            })
+
         return {
             'id': real_id,
-            'url': 'http://wat.tv/get/android5/%s.mp4' % real_id,
+            'display_id': display_id,
             'title': first_chapter['title'],
             'thumbnail': first_chapter['preview'],
             'description': first_chapter['description'],
             'view_count': video_info['views'],
             'upload_date': upload_date,
+            'duration': first_file['duration'],
+            'formats': formats,
         }
diff --git a/youtube_dl/extractor/wayofthemaster.py b/youtube_dl/extractor/wayofthemaster.py
new file mode 100644
index 000000000..af7bb8b49
--- /dev/null
+++ b/youtube_dl/extractor/wayofthemaster.py
@@ -0,0 +1,52 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class WayOfTheMasterIE(InfoExtractor):
+    _VALID_URL = r'https?://www\.wayofthemaster\.com/([^/?#]*/)*(?P<id>[^/?#]+)\.s?html(?:$|[?#])'
+
+    _TEST = {
+        'url': 'http://www.wayofthemaster.com/hbks.shtml',
+        'md5': '5316b57487ada8480606a93cb3d18d24',
+        
'info_dict': { +            'id': 'hbks', +            'ext': 'mp4', +            'title': 'Intelligent Design vs. Evolution', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) + +        title = self._search_regex( +            r'<img src="images/title_[^"]+".*?alt="([^"]+)"', +            webpage, 'title', default=None) +        if title is None: +            title = self._html_search_regex( +                r'<title>(.*?)</title>', webpage, 'page title') + +        url_base = self._search_regex( +            r'<param\s+name="?movie"?\s+value=".*?/wotm_videoplayer_highlow[0-9]*\.swf\?vid=([^"]+)"', +            webpage, 'URL base') +        formats = [{ +            'format_id': 'low', +            'quality': 1, +            'url': url_base + '_low.mp4', +        }, { +            'format_id': 'high', +            'quality': 2, +            'url': url_base + '_high.mp4', +        }] +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': title, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 5374495f9..00b6d1eba 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -14,7 +14,7 @@ from ..utils import (  class XHamsterIE(InfoExtractor):      """Information Extractor for xHamster""" -    _VALID_URL = r'http://(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?' +    _VALID_URL = r'http://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'      
_TESTS = [          {              'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5bfe5e7e5..75044d71a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -297,7 +297,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          # Dash webm audio -        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50}, +        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},          '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},          # RTMP (unnamed) @@ -446,6 +446,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                  return lambda s: u''.join(s[i] for i in cache_spec)              except IOError:                  pass  # No cache available +            except ValueError: +                try: +                    file_size = os.path.getsize(cache_fn) +                except (OSError, IOError) as oe: +                    file_size = str(oe) +                self._downloader.report_warning( +                    u'Cache %s failed (%s)' % (cache_fn, file_size))          if player_type == 'js':              code = self._download_webpage( @@ -573,6 +580,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          sub_lang_list = {}          for l in lang_list:              lang = l[1] +            if lang in sub_lang_list: +                continue              params = compat_urllib_parse.urlencode({                  'lang': lang,                  'v': video_id, diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 65b492fb3..8095400d0 100644 --- a/youtube_dl/utils.py +++ 
b/youtube_dl/utils.py @@ -24,6 +24,7 @@ import socket  import struct  import subprocess  import sys +import tempfile  import traceback  import xml.etree.ElementTree  import zlib @@ -228,18 +229,42 @@ else:          assert type(s) == type(u'')          print(s) -# In Python 2.x, json.dump expects a bytestream. -# In Python 3.x, it writes to a character stream -if sys.version_info < (3,0): -    def write_json_file(obj, fn): -        with open(fn, 'wb') as f: -            json.dump(obj, f) -else: -    def write_json_file(obj, fn): -        with open(fn, 'w', encoding='utf-8') as f: -            json.dump(obj, f) -if sys.version_info >= (2,7): +def write_json_file(obj, fn): +    """ Encode obj as JSON and write it to fn, atomically """ + +    args = { +        'suffix': '.tmp', +        'prefix': os.path.basename(fn) + '.', +        'dir': os.path.dirname(fn), +        'delete': False, +    } + +    # In Python 2.x, json.dump expects a bytestream. +    # In Python 3.x, it writes to a character stream +    if sys.version_info < (3, 0): +        args['mode'] = 'wb' +    else: +        args.update({ +            'mode': 'w', +            'encoding': 'utf-8', +        }) + +    tf = tempfile.NamedTemporaryFile(**args) + +    try: +        with tf: +            json.dump(obj, tf) +        os.rename(tf.name, fn) +    except: +        try: +            os.remove(tf.name) +        except OSError: +            pass +        raise + + +if sys.version_info >= (2, 7):      def find_xpath_attr(node, xpath, key, val):          """ Find the xpath xpath[@key=val] """          assert re.match(r'^[a-zA-Z-]+$', key) @@ -827,8 +852,10 @@ def unified_strdate(date_str):          '%b %dnd %Y %I:%M%p',          '%b %dth %Y %I:%M%p',          '%Y-%m-%d', +        '%Y/%m/%d',          '%d.%m.%Y',          '%d/%m/%Y', +        '%d/%m/%y',          '%Y/%m/%d %H:%M:%S',          '%Y-%m-%d %H:%M:%S',          '%d.%m.%Y %H:%M', @@ -1259,6 +1286,12 @@ def remove_start(s, start):      return s +def 
remove_end(s, end): +    if s.endswith(end): +        return s[:-len(end)] +    return s + +  def url_basename(url):      path = compat_urlparse.urlparse(url).path      return path.strip(u'/').split(u'/')[-1] @@ -1448,6 +1481,34 @@ def strip_jsonp(code):      return re.sub(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code) +def js_to_json(code): +    def fix_kv(m): +        key = m.group(2) +        if key.startswith("'"): +            assert key.endswith("'") +            assert '"' not in key +            key = '"%s"' % key[1:-1] +        elif not key.startswith('"'): +            key = '"%s"' % key + +        value = m.group(4) +        if value.startswith("'"): +            assert value.endswith("'") +            assert '"' not in value +            value = '"%s"' % value[1:-1] + +        return m.group(1) + key + m.group(3) + value + +    res = re.sub(r'''(?x) +            ([{,]\s*) +            ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+) +            (:\s*) +            ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{) +        ''', fix_kv, code) +    res = re.sub(r',(\s*\])', lambda m: m.group(1), res) +    return res + +  def qualities(quality_ids):      """ Get a numeric quality value out of a list of possible values """      def q(qid): diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2ef0d59e3..a05ce2eba 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.08.10' +__version__ = '2014.08.24.5'  | 
