diff options
Diffstat (limited to 'youtube_dl')
54 files changed, 1628 insertions, 333 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index b07c0b4cc..76726305a 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -134,13 +134,16 @@ from .gamestar import GameStarIE  from .gametrailers import GametrailersIE  from .gdcvault import GDCVaultIE  from .generic import GenericIE +from .globo import GloboIE  from .godtube import GodTubeIE +from .golem import GolemIE  from .googleplus import GooglePlusIE  from .googlesearch import GoogleSearchIE  from .gorillavid import GorillaVidIE  from .goshgay import GoshgayIE  from .grooveshark import GroovesharkIE  from .hark import HarkIE +from .heise import HeiseIE  from .helsinki import HelsinkiIE  from .hentaistigma import HentaiStigmaIE  from .hornbunny import HornBunnyIE @@ -188,6 +191,7 @@ from .livestream import (      LivestreamOriginalIE,      LivestreamShortenerIE,  ) +from .lrt import LRTIE  from .lynda import (      LyndaIE,      LyndaCourseIE @@ -261,6 +265,7 @@ from .nrk import (  from .ntv import NTVIE  from .nytimes import NYTimesIE  from .nuvid import NuvidIE +from .oktoberfesttv import OktoberfestTVIE  from .ooyala import OoyalaIE  from .orf import (      ORFTVthekIE, @@ -271,6 +276,8 @@ from .parliamentliveuk import ParliamentLiveUKIE  from .patreon import PatreonIE  from .pbs import PBSIE  from .photobucket import PhotobucketIE +from .planetaplay import PlanetaPlayIE +from .played import PlayedIE  from .playfm import PlayFMIE  from .playvid import PlayvidIE  from .podomatic import PodomaticIE @@ -350,6 +357,7 @@ from .swrmediathek import SWRMediathekIE  from .syfy import SyfyIE  from .sztvhu import SztvHuIE  from .tagesschau import TagesschauIE +from .tapely import TapelyIE  from .teachertube import (      TeacherTubeIE,      TeacherTubeUserIE, @@ -363,11 +371,15 @@ from .tenplay import TenPlayIE  from .testurl import TestURLIE  from .tf1 import TF1IE  from .theplatform import ThePlatformIE +from .thesixtyone import TheSixtyOneIE  from .thisav import ThisAVIE  from .tinypic import TinyPicIE  from .tlc import TlcIE, TlcDeIE  from .tnaflix import TNAFlixIE -from .thvideo import THVideoIE +from .thvideo import ( +    THVideoIE, +    THVideoPlaylistIE +)  from .toutv import TouTvIE  from .toypics import ToypicsUserIE, ToypicsIE  from .traileraddict import TrailerAddictIE @@ -408,11 +420,12 @@ from .videoweed import VideoWeedIE  from .vidme import VidmeIE  from .vimeo import (      VimeoIE, -    VimeoChannelIE, -    VimeoUserIE,      VimeoAlbumIE, +    VimeoChannelIE,      VimeoGroupsIE, +    VimeoLikesIE,      VimeoReviewIE, +    VimeoUserIE,      VimeoWatchLaterIE,  )  from .vimple import VimpleIE diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 7d89f44ee..69f89320c 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -22,8 +22,7 @@ class ABCIE(InfoExtractor):      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id)          urls_info_json = self._search_regex( diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 54cec1c2f..8de9c11ea 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -8,8 +8,6 @@ from ..utils import (      determine_ext,      ExtractorError,      qualities, -    compat_urllib_parse_urlparse, -    compat_urllib_parse,      int_or_none,      parse_duration,      unified_strdate, diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 957d35979..c3d02f85e 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -86,11 +86,15 @@ class ArteTVPlus7IE(InfoExtractor):          info = self._download_json(json_url, video_id)          player_info = info['videoJsonPlayer'] +        upload_date_str = player_info.get('shootingDate') +        if not upload_date_str: +            upload_date_str = player_info.get('VDA', '').split(' ')[0] +          info_dict = {              'id': player_info['VID'],              'title': player_info['VTI'],              'description': player_info.get('VDE'), -            'upload_date': unified_strdate(player_info.get('VDA', '').split(' ')[0]), +            'upload_date': unified_strdate(upload_date_str),              'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),          } diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index c569aa4d2..c13446665 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -15,13 +15,23 @@ class BandcampIE(InfoExtractor):      _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>.*)'      _TESTS = [{          'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', -        'file': '1812978515.mp3',          'md5': 'c557841d5e50261777a6585648adf439',          'info_dict': { -            "title": "youtube-dl  \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", -            "duration": 9.8485, +            'id': '1812978515', +            'ext': 'mp3', +            'title': "youtube-dl  \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", +            'duration': 9.8485,          },          '_skip': 'There is a limit of 200 free downloads / month for the test song' +    }, { +        'url': 'http://benprunty.bandcamp.com/track/lanius-battle', +        'md5': '2b68e5851514c20efdff2afc5603b8b4', +        'info_dict': { +            'id': '2650410135', +            'ext': 'mp3', +            'title': 'Lanius (Battle)', +            'uploader': 'Ben Prunty Music', +        },      }]      def _real_extract(self, url): @@ -59,9 +69,9 @@ class BandcampIE(InfoExtractor):                  raise ExtractorError('No free songs found')          download_link = m_download.group(1) -        video_id = re.search( -            r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', -            webpage, re.MULTILINE | re.DOTALL).group('id') +        video_id = self._search_regex( +            r'var TralbumData = {.*?id: (?P<id>\d+),?$', +            webpage, 'video id', flags=re.MULTILINE | re.DOTALL)          download_webpage = self._download_webpage(download_link, video_id, 'Downloading free downloads page')          # We get the dictionary of the track from some javascript code diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 4e2960c62..2e277c8c3 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -26,6 +26,8 @@ class BRIE(InfoExtractor):                  'title': 'Wenn das Traditions-Theater wackelt',                  'description': 'Heimatsound-Festival 2014: Wenn das Traditions-Theater wackelt',                  'duration': 34, +                'uploader': 'BR', +                'upload_date': '20140802',              }          },          { @@ -66,8 +68,7 @@ class BRIE(InfoExtractor):      ]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        display_id = mobj.group('id') +        display_id = self._match_id(url)          page = self._download_webpage(url, display_id)          xml_url = self._search_regex(              r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL') diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 1bfc9f35b..2c0e5eea2 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -4,37 +4,61 @@ import re  import json  from .common import InfoExtractor +from ..utils import ( +    int_or_none, +    parse_age_limit, +)  class BreakIE(InfoExtractor): -    _VALID_URL = r'http://(?:www\.)?break\.com/video/([^/]+)' -    _TEST = { +    _VALID_URL = r'http://(?:www\.)?break\.com/video/(?:[^/]+/)*.+-(?P<id>\d+)' +    _TESTS = [{          'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056', -        'md5': 'a3513fb1547fba4fb6cfac1bffc6c46b', +        'md5': '33aa4ff477ecd124d18d7b5d23b87ce5',          'info_dict': {              'id': '2468056',              'ext': 'mp4',              'title': 'When Girls Act Like D-Bags',          } -    } +    }, { +        'url': 'http://www.break.com/video/ugc/baby-flex-2773063', +        'only_matching': True, +    }]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group(1).split("-")[-1] -        embed_url = 'http://www.break.com/embed/%s' % video_id -        webpage = self._download_webpage(embed_url, video_id) -        info_json = self._search_regex(r'var embedVars = ({.*})\s*?</script>', -            webpage, 'info json', flags=re.DOTALL) -        info = json.loads(info_json) -        video_url = info['videoUri'] +        video_id = self._match_id(url) +        webpage = self._download_webpage( +            'http://www.break.com/embed/%s' % video_id, video_id) +        info = json.loads(self._search_regex( +            r'var embedVars = ({.*})\s*?</script>', +            webpage, 'info json', flags=re.DOTALL)) +          youtube_id = info.get('youtubeId')          if youtube_id:              return self.url_result(youtube_id, 'Youtube') -        final_url = video_url + '?' + info['AuthToken'] +        formats = [{ +            'url': media['uri'] + '?' + info['AuthToken'], +            'tbr': media['bitRate'], +            'width': media['width'], +            'height': media['height'], +        } for media in info['media']] + +        if not formats: +            formats.append({ +                'url': info['videoUri'] +            }) + +        self._sort_formats(formats) + +        duration = int_or_none(info.get('videoLengthInSeconds')) +        age_limit = parse_age_limit(info.get('audienceRating')) +          return {              'id': video_id, -            'url': final_url,              'title': info['contentName'],              'thumbnail': info['thumbUri'], +            'duration': duration, +            'age_limit': age_limit, +            'formats': formats,          } diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index 65c12136a..d4227e6eb 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -35,7 +35,6 @@ class CliphunterIE(InfoExtractor):              'title': 'Fun Jynx Maze solo',              'thumbnail': 're:^https?://.*\.jpg$',              'age_limit': 18, -            'duration': 1317,          }      } @@ -86,14 +85,11 @@ class CliphunterIE(InfoExtractor):          thumbnail = self._search_regex(              r"var\s+mov_thumb\s*=\s*'([^']+)';",              webpage, 'thumbnail', fatal=False) -        duration = int_or_none(self._search_regex( -            r'pl_dur\s*=\s*([0-9]+)', webpage, 'duration', fatal=False))          return {              'id': video_id,              'title': video_title,              'formats': formats, -            'duration': duration,              'age_limit': self._rta_search(webpage),              'thumbnail': thumbnail,          } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 60cab6f4e..450c7dfd6 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1,6 +1,7 @@  from __future__ import unicode_literals  import base64 +import datetime  import hashlib  import json  import netrc @@ -21,6 +22,7 @@ from ..utils import (      clean_html,      compiled_regex_type,      ExtractorError, +    float_or_none,      int_or_none,      RegexNotFoundError,      sanitize_filename, @@ -136,6 +138,8 @@ class InfoExtractor(object):      Unless mentioned otherwise, the fields should be Unicode strings. +    Unless mentioned otherwise, None is equivalent to absence of information. +      Subclasses of this one should re-define the _real_initialize() and      _real_extract() methods and define a _VALID_URL regexp.      Probably, they should also be added to the list of extractors. @@ -165,6 +169,14 @@ class InfoExtractor(object):          return cls._VALID_URL_RE.match(url) is not None      @classmethod +    def _match_id(cls, url): +        if '_VALID_URL_RE' not in cls.__dict__: +            cls._VALID_URL_RE = re.compile(cls._VALID_URL) +        m = cls._VALID_URL_RE.match(url) +        assert m +        return m.group('id') + +    @classmethod      def working(cls):          """Getter method for _WORKING."""          return cls._WORKING @@ -324,7 +336,11 @@ class InfoExtractor(object):          try:              return json.loads(json_string)          except ValueError as ve: -            raise ExtractorError('Failed to download JSON', cause=ve) +            errmsg = '%s: Failed to parse JSON ' % video_id +            if fatal: +                raise ExtractorError(errmsg, cause=ve) +            else: +                self.report_warning(errmsg + str(ve))      def report_warning(self, msg, video_id=None):          idstr = '' if video_id is None else '%s: ' % video_id @@ -705,6 +721,34 @@ class InfoExtractor(object):          self._sort_formats(formats)          return formats +    def _live_title(self, name): +        """ Generate the title for a live video """ +        now = datetime.datetime.now() +        now_str = now.strftime("%Y-%m-%d %H:%M") +        return name + ' ' + now_str + +    def _int(self, v, name, fatal=False, **kwargs): +        res = int_or_none(v, **kwargs) +        if 'get_attr' in kwargs: +            print(getattr(v, kwargs['get_attr'])) +        if res is None: +            msg = 'Failed to extract %s: Could not parse value %r' % (name, v) +            if fatal: +                raise ExtractorError(msg) +            else: +                self._downloader.report_warning(msg) +        return res + +    def _float(self, v, name, fatal=False, **kwargs): +        res = float_or_none(v, **kwargs) +        if res is None: +            msg = 'Failed to extract %s: Could not parse value %r' % (name, v) +            if fatal: +                raise ExtractorError(msg) +            else: +                self._downloader.report_warning(msg) +        return res +  class SearchInfoExtractor(InfoExtractor):      """ diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 66a8f16d9..dbcf5d6a7 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -82,11 +82,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):      ]      def _real_extract(self, url): -        # Extract id and simplified title from URL -        mobj = re.match(self._VALID_URL, url) - -        video_id = mobj.group('id') - +        video_id = self._match_id(url)          url = 'http://www.dailymotion.com/video/%s' % video_id          # Retrieve video webpage to extract further information @@ -147,18 +143,23 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):              self._list_available_subtitles(video_id, webpage)              return -        view_count = self._search_regex( -            r'video_views_count[^>]+>\s+([\d\.,]+)', webpage, 'view count', fatal=False) -        if view_count is not None: -            view_count = str_to_int(view_count) +        view_count = str_to_int(self._search_regex( +            r'video_views_count[^>]+>\s+([\d\.,]+)', +            webpage, 'view count', fatal=False)) + +        title = self._og_search_title(webpage, default=None) +        if title is None: +            title = self._html_search_regex( +                r'(?s)<span\s+id="video_title"[^>]*>(.*?)</span>', webpage, +                'title')          return { -            'id':       video_id, +            'id': video_id,              'formats': formats,              'uploader': info['owner.screenname'], -            'upload_date':  video_upload_date, -            'title':    self._og_search_title(webpage), -            'subtitles':    video_subtitles, +            'upload_date': video_upload_date, +            'title': title, +            'subtitles': video_subtitles,              'thumbnail': info['thumbnail_url'],              'age_limit': age_limit,              'view_count': view_count, diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py index 817a9bd61..5f24ac721 100644 --- a/youtube_dl/extractor/dropbox.py +++ b/youtube_dl/extractor/dropbox.py @@ -29,9 +29,8 @@ class DropboxIE(InfoExtractor):          video_id = mobj.group('id')          fn = compat_urllib_parse_unquote(url_basename(url))          title = os.path.splitext(fn)[0] -        video_url = ( -            re.sub(r'[?&]dl=0', '', url) + -            ('?' if '?' in url else '&') + 'dl=1') +        video_url = re.sub(r'[?&]dl=0', '', url) +        video_url += ('?' if '?' not in video_url else '&') + 'dl=1'          return {              'id': video_id, diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py index 4ba323148..2cba82532 100644 --- a/youtube_dl/extractor/eitb.py +++ b/youtube_dl/extractor/eitb.py @@ -1,4 +1,6 @@  # encoding: utf-8 +from __future__ import unicode_literals +  import re  from .common import InfoExtractor @@ -7,20 +9,20 @@ from ..utils import ExtractorError  class EitbIE(InfoExtractor): -    IE_NAME = u'eitb.tv' +    IE_NAME = 'eitb.tv'      _VALID_URL = r'https?://www\.eitb\.tv/(eu/bideoa|es/video)/[^/]+/(?P<playlist_id>\d+)/(?P<chapter_id>\d+)'      _TEST = { -        u'add_ie': ['Brightcove'], -        u'url': u'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/2677100210001/2743577154001/lasa-y-zabala-30-anos/', -        u'md5': u'edf4436247185adee3ea18ce64c47998', -        u'info_dict': { -            u'id': u'2743577154001', -            u'ext': u'mp4', -            u'title': u'60 minutos (Lasa y Zabala, 30 años)', +        'add_ie': ['Brightcove'], +        'url': 'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/2677100210001/2743577154001/lasa-y-zabala-30-anos/', +        'md5': 'edf4436247185adee3ea18ce64c47998', +        'info_dict': { +            'id': '2743577154001', +            'ext': 'mp4', +            'title': '60 minutos (Lasa y Zabala, 30 años)',              # All videos from eitb has this description in the brightcove info -            u'description': u'.', -            u'uploader': u'Euskal Telebista', +            'description': '.', +            'uploader': 'Euskal Telebista',          },      } @@ -30,7 +32,7 @@ class EitbIE(InfoExtractor):          webpage = self._download_webpage(url, chapter_id)          bc_url = BrightcoveIE._extract_brightcove_url(webpage)          if bc_url is None: -            raise ExtractorError(u'Could not extract the Brightcove url') +            raise ExtractorError('Could not extract the Brightcove url')          # The BrightcoveExperience object doesn't contain the video id, we set          # it manually          bc_url += '&%40videoPlayer={0}'.format(chapter_id) diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index 522aa3d63..bb231ecb1 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -14,11 +14,11 @@ class EpornerIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\d+)/(?P<display_id>[\w-]+)'      _TEST = {          'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/', -        'md5': '3b427ae4b9d60619106de3185c2987cd', +        'md5': '39d486f046212d8e1b911c52ab4691f8',          'info_dict': {              'id': '95008',              'display_id': 'Infamous-Tiffany-Teen-Strip-Tease-Video', -            'ext': 'flv', +            'ext': 'mp4',              'title': 'Infamous Tiffany Teen Strip Tease Video',              'duration': 194,              'view_count': int, diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 60e68d98a..3ad993751 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -35,7 +35,7 @@ class FacebookIE(InfoExtractor):              'id': '637842556329505',              'ext': 'mp4',              'duration': 38, -            'title': 'Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam fin...', +            'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam',          }      }, {          'note': 'Video without discernible title', diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 721e5fce0..d966e8403 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -21,7 +21,7 @@ class FunnyOrDieIE(InfoExtractor):          },      }, {          'url': 'http://www.funnyordie.com/embed/e402820827', -        'md5': 'ff4d83318f89776ed0250634cfaa8d36', +        'md5': '29f4c5e5a61ca39dfd7e8348a75d0aad',          'info_dict': {              'id': 'e402820827',              'ext': 'mp4', diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 367f930dd..c16da70f1 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -155,7 +155,6 @@ class GenericIE(InfoExtractor):          # funnyordie embed          {              'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns', -            'md5': '7cf780be104d40fea7bae52eed4a470e',              'info_dict': {                  'id': '18e820ec3f',                  'ext': 'mp4', @@ -180,13 +179,13 @@ class GenericIE(InfoExtractor):          # Embedded TED video          {              'url': 'http://en.support.wordpress.com/videos/ted-talks/', -            'md5': 'deeeabcc1085eb2ba205474e7235a3d5', +            'md5': '65fdff94098e4a607385a60c5177c638',              'info_dict': { -                'id': '981', +                'id': '1969',                  'ext': 'mp4', -                'title': 'My web playroom', -                'uploader': 'Ze Frank', -                'description': 'md5:ddb2a40ecd6b6a147e400e535874947b', +                'title': 'Hidden miracles of the natural world', +                'uploader': 'Louie Schwartzberg', +                'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',              }          },          # Embeded Ustream video @@ -226,21 +225,6 @@ class GenericIE(InfoExtractor):                  'skip_download': 'Requires rtmpdump'              }          }, -        # smotri embed -        { -            'url': 'http://rbctv.rbc.ru/archive/news/562949990879132.shtml', -            'md5': 'ec40048448e9284c9a1de77bb188108b', -            'info_dict': { -                'id': 'v27008541fad', -                'ext': 'mp4', -                'title': 'Крым и Севастополь вошли в состав России', -                'description': 'md5:fae01b61f68984c7bd2fa741e11c3175', -                'duration': 900, -                'upload_date': '20140318', -                'uploader': 'rbctv_2012_4', -                'uploader_id': 'rbctv_2012_4', -            }, -        },          # Condé Nast embed          {              'url': 'http://www.wired.com/2014/04/honda-asimo/', @@ -295,13 +279,13 @@ class GenericIE(InfoExtractor):          {              'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',              'info_dict': { -                'id': 'jpSGZsgga_I', +                'id': '4vAffPZIT44',                  'ext': 'mp4', -                'title': 'Asphalt 8: Airborne - Launch Trailer', +                'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',                  'uploader': 'Gameloft',                  'uploader_id': 'gameloft', -                'upload_date': '20130821', -                'description': 'md5:87bd95f13d8be3e7da87a5f2c443106a', +                'upload_date': '20140828', +                'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',              },              'params': {                  'skip_download': True, @@ -397,12 +381,6 @@ class GenericIE(InfoExtractor):          },      ] -    def report_download_webpage(self, video_id): -        """Report webpage download.""" -        if not self._downloader.params.get('test', False): -            self._downloader.report_warning('Falling back on generic information extractor.') -        super(GenericIE, self).report_download_webpage(video_id) -      def report_following_redirect(self, new_url):          """Report information extraction."""          self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) @@ -502,6 +480,7 @@ class GenericIE(InfoExtractor):          url, smuggled_data = unsmuggle_url(url)          force_videoid = None +        is_intentional = smuggled_data and smuggled_data.get('to_generic')          if smuggled_data and 'force_videoid' in smuggled_data:              force_videoid = smuggled_data['force_videoid']              video_id = force_videoid @@ -544,6 +523,9 @@ class GenericIE(InfoExtractor):                      'upload_date': upload_date,                  } +        if not self._downloader.params.get('test', False) and not is_intentional: +            self._downloader.report_warning('Falling back on generic information extractor.') +          try:              webpage = self._download_webpage(url, video_id)          except ValueError: @@ -657,6 +639,16 @@ class GenericIE(InfoExtractor):              return _playlist_from_matches(                  matches, lambda m: unescapeHTML(m[1])) +        # Look for embedded Dailymotion playlist player (#3822) +        m = re.search( +            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage) +        if m: +            playlists = re.findall( +                r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url'))) +            if playlists: +                return _playlist_from_matches( +                    playlists, lambda p: '//dailymotion.com/playlist/%s' % p) +          # Look for embedded Wistia player          match = re.search(              r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py new file mode 100644 index 000000000..77c3ad4fc --- /dev/null +++ b/youtube_dl/extractor/globo.py @@ -0,0 +1,398 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import random +import math + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    float_or_none, +    compat_str, +    compat_chr, +    compat_ord, +) + + +class GloboIE(InfoExtractor): +    _VALID_URL = 'https?://.+?\.globo\.com/(?P<id>.+)' + +    _API_URL_TEMPLATE = 'http://api.globovideos.com/videos/%s/playlist' +    _SECURITY_URL_TEMPLATE = 'http://security.video.globo.com/videos/%s/hash?player=flash&version=2.9.9.50&resource_id=%s' + +    _VIDEOID_REGEXES = [ +        r'\bdata-video-id="(\d+)"', +        r'\bdata-player-videosids="(\d+)"', +        r'<div[^>]+\bid="(\d+)"', +    ] + +    _RESIGN_EXPIRATION = 86400 + +    _TESTS = [ +        { +            'url': 'http://globotv.globo.com/sportv/futebol-nacional/v/os-gols-de-atletico-mg-3-x-2-santos-pela-24a-rodada-do-brasileirao/3654973/', +            'md5': '03ebf41cb7ade43581608b7d9b71fab0', +            'info_dict': { +                'id': '3654973', +                'ext': 'mp4', +                'title': 'Os gols de Atlético-MG 3 x 2 Santos pela 24ª rodada do Brasileirão', +                'duration': 251.585, +                'uploader': 'SporTV', +                'uploader_id': 698, +                'like_count': int, +            } +        }, +        { +            'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', +            'md5': 'b3ccc801f75cd04a914d51dadb83a78d', +            'info_dict': { +                'id': '3607726', +                'ext': 'mp4', +                'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa', +                'duration': 103.204, +                'uploader': 'Globo.com', +                'uploader_id': 265, +                'like_count': int, +            } +        }, +        { +            'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html', +            'md5': '307fdeae4390ccfe6ba1aa198cf6e72b', +            'info_dict': { +                'id': '3652183', +                'ext': 'mp4', +                'title': 'Receita Federal explica como vai fiscalizar bagagens de quem retorna ao Brasil de avião', +                'duration': 110.711, +                'uploader': 'Rede Globo', +                'uploader_id': 196, +                'like_count': int, +            } +        }, +    ] + +    class MD5(): +        HEX_FORMAT_LOWERCASE = 0 +        HEX_FORMAT_UPPERCASE = 1 +        BASE64_PAD_CHARACTER_DEFAULT_COMPLIANCE = '' +        BASE64_PAD_CHARACTER_RFC_COMPLIANCE = '=' +        PADDING = '=0xFF01DD' +        hexcase = 0 +        b64pad = '' + +        def __init__(self): +            pass + +        class JSArray(list): +            def __getitem__(self, y): +                try: +                    return list.__getitem__(self, y) +                except IndexError: +                    return 0 + +            def __setitem__(self, i, y): +                try: +                    return list.__setitem__(self, i, y) +                except IndexError: +                    self.extend([0] * (i - len(self) + 1)) +                    self[-1] = y + +        @classmethod +        def hex_md5(cls, param1): +            return cls.rstr2hex(cls.rstr_md5(cls.str2rstr_utf8(param1))) + +        @classmethod +        def b64_md5(cls, param1, param2=None): +            return cls.rstr2b64(cls.rstr_md5(cls.str2rstr_utf8(param1, param2))) + +        @classmethod +        def any_md5(cls, param1, param2): +            return cls.rstr2any(cls.rstr_md5(cls.str2rstr_utf8(param1)), param2) + +        @classmethod +        def rstr_md5(cls, param1): +            return cls.binl2rstr(cls.binl_md5(cls.rstr2binl(param1), len(param1) * 8)) + +        @classmethod +        def rstr2hex(cls, param1): +            _loc_2 = '0123456789ABCDEF' if cls.hexcase else '0123456789abcdef' +            _loc_3 = '' +            for _loc_5 in range(0, len(param1)): +                _loc_4 = compat_ord(param1[_loc_5]) +                _loc_3 += _loc_2[_loc_4 >> 4 & 15] + _loc_2[_loc_4 & 15] +            return _loc_3 + +        @classmethod +        def rstr2b64(cls, param1): +            _loc_2 = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_' +            _loc_3 = '' +            _loc_4 = len(param1) +            for _loc_5 in range(0, _loc_4, 3): +                _loc_6_1 = compat_ord(param1[_loc_5]) << 16 +                _loc_6_2 = compat_ord(param1[_loc_5 + 1]) << 8 if _loc_5 + 1 < _loc_4 else 0 +                _loc_6_3 = compat_ord(param1[_loc_5 + 2]) if _loc_5 + 2 < _loc_4 else 0 +                _loc_6 = _loc_6_1 | _loc_6_2 | _loc_6_3 +                for _loc_7 in range(0, 4): +                    if _loc_5 * 8 + _loc_7 * 6 > len(param1) * 8: +                        _loc_3 += cls.b64pad +                    else: +                        _loc_3 += _loc_2[_loc_6 >> 6 * (3 - _loc_7) & 63] +            return _loc_3 + +        @staticmethod +        def rstr2any(param1, param2): +            _loc_3 = len(param2) +            _loc_4 = [] +            _loc_9 = [0] * ((len(param1) >> 2) + 1) +            for _loc_5 in range(0, len(_loc_9)): +                _loc_9[_loc_5] = compat_ord(param1[_loc_5 * 2]) << 8 | compat_ord(param1[_loc_5 * 2 + 1]) + +            while len(_loc_9) > 0: +                _loc_8 = [] +                _loc_7 = 0 +                for _loc_5 in range(0, len(_loc_9)): +                    _loc_7 = (_loc_7 << 16) + _loc_9[_loc_5] +                    _loc_6 = math.floor(_loc_7 / _loc_3) +                    _loc_7 -= _loc_6 * _loc_3 +                    if len(_loc_8) > 0 or _loc_6 > 0: +                        _loc_8[len(_loc_8)] = _loc_6 + +                _loc_4[len(_loc_4)] = _loc_7 +                _loc_9 = _loc_8 + +            _loc_10 = '' +            _loc_5 = len(_loc_4) - 1 +            while _loc_5 >= 0: +                _loc_10 += param2[_loc_4[_loc_5]] +                _loc_5 -= 1 + +            return _loc_10 + +        @classmethod +        def str2rstr_utf8(cls, param1, param2=None): +            _loc_3 = '' +            _loc_4 = -1 +            if not param2: +                param2 = cls.PADDING +            param1 = param1 + param2[1:9] +            while True: +                _loc_4 += 1 +                if _loc_4 >= len(param1): +                    break +                _loc_5 = compat_ord(param1[_loc_4]) +                _loc_6 = compat_ord(param1[_loc_4 + 1]) if _loc_4 + 1 < len(param1) else 0 +                if 55296 <= _loc_5 <= 56319 and 56320 <= _loc_6 <= 57343: +                    _loc_5 = 65536 + ((_loc_5 & 1023) << 10) + (_loc_6 & 1023) +                    _loc_4 += 1 +                if _loc_5 <= 127: +                    _loc_3 += compat_chr(_loc_5) +                    continue +                if _loc_5 <= 2047: +                    _loc_3 += compat_chr(192 | _loc_5 >> 6 & 31) + compat_chr(128 | _loc_5 & 63) +                    continue +                if _loc_5 <= 65535: +                    _loc_3 += compat_chr(224 | _loc_5 >> 12 & 15) + compat_chr(128 | _loc_5 >> 6 & 63) + compat_chr( +                        128 | _loc_5 & 63) +                    continue +                if _loc_5 <= 2097151: +                    _loc_3 += compat_chr(240 | _loc_5 >> 18 & 7) + compat_chr(128 | _loc_5 >> 12 & 63) + compat_chr( +                        128 | _loc_5 >> 6 & 63) + compat_chr(128 | _loc_5 & 63) +            return _loc_3 + +        @staticmethod +        def rstr2binl(param1): +            _loc_2 = [0] * ((len(param1) >> 2) + 1) +            for _loc_3 in range(0, len(_loc_2)): +                _loc_2[_loc_3] = 0 +            for _loc_3 in range(0, len(param1) * 8, 8): +                _loc_2[_loc_3 >> 5] |= (compat_ord(param1[_loc_3 // 8]) & 255) << _loc_3 % 32 +            return _loc_2 + +        @staticmethod +        def binl2rstr(param1): +            _loc_2 = '' +            for _loc_3 in range(0, len(param1) * 32, 8): +                _loc_2 += compat_chr(param1[_loc_3 >> 5] >> _loc_3 % 32 & 255) +            return _loc_2 + +        @classmethod +        def binl_md5(cls, param1, param2): +            param1 = cls.JSArray(param1) +            param1[param2 >> 5] |= 128 << param2 % 32 +            param1[(param2 + 64 >> 9 << 4) + 14] = param2 +            _loc_3 = 1732584193 +            _loc_4 = -271733879 +            _loc_5 = -1732584194 +            _loc_6 = 271733878 +            for _loc_7 in range(0, len(param1), 16): +                _loc_8 = _loc_3 +                _loc_9 = _loc_4 +                _loc_10 = _loc_5 +                _loc_11 = _loc_6 +                _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 0], 7, -680876936) +                _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 1], 12, -389564586) +                _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 2], 17, 606105819) +                _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 3], 22, -1044525330) +                _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 4], 7, -176418897) +                _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 5], 12, 1200080426) +                _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 6], 17, -1473231341) +                _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 7], 22, -45705983) +                _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 8], 7, 1770035416) +                _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 9], 12, -1958414417) +                _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 10], 17, -42063) +                _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 11], 22, -1990404162) +                _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 12], 7, 1804603682) +                _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 13], 12, -40341101) +                _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 14], 17, -1502002290) +                _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 15], 22, 1236535329) +                _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 1], 5, -165796510) +                _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 6], 9, -1069501632) +                _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 11], 14, 643717713) +                _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 0], 20, -373897302) +                _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 5], 5, -701558691) +                _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 10], 9, 38016083) +                _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 15], 14, -660478335) +                _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 4], 20, -405537848) +                _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 9], 5, 568446438) +                _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 14], 9, -1019803690) +                _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 3], 14, -187363961) +                _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 8], 20, 1163531501) +                _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 13], 5, -1444681467) +                _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 2], 9, -51403784) +                _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 7], 14, 1735328473) +                _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 12], 20, -1926607734) +                _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 5], 4, -378558) +                _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 8], 11, -2022574463) +                _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 11], 16, 1839030562) +                _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 14], 23, -35309556) +                _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 1], 4, -1530992060) +                _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 4], 11, 1272893353) +                _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 7], 16, -155497632) +                _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 10], 23, -1094730640) +                _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 13], 4, 681279174) +                _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 0], 11, -358537222) +                _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 3], 16, -722521979) +                _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 6], 23, 76029189) +                _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 9], 4, -640364487) +                _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 12], 11, -421815835) +                _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 15], 16, 530742520) +                _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 2], 23, -995338651) +                _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 0], 6, -198630844) +                _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 7], 10, 1126891415) +                _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 14], 15, -1416354905) +                _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 5], 21, -57434055) +                _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 12], 6, 1700485571) +                _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 3], 10, -1894986606) +                _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 10], 15, -1051523) +                _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 1], 21, -2054922799) +                _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 8], 6, 1873313359) +                _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 15], 10, -30611744) +                _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 6], 15, -1560198380) +                _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 13], 21, 1309151649) +                _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 4], 6, -145523070) +                _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 11], 10, -1120210379) +                _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 2], 15, 718787259) +                _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 9], 21, -343485551) +                _loc_3 = cls.safe_add(_loc_3, _loc_8) +                _loc_4 = cls.safe_add(_loc_4, _loc_9) +                _loc_5 = cls.safe_add(_loc_5, _loc_10) +                _loc_6 = cls.safe_add(_loc_6, _loc_11) +            return [_loc_3, _loc_4, _loc_5, _loc_6] + +        @classmethod +        def md5_cmn(cls, param1, param2, param3, param4, param5, param6): +            return cls.safe_add( +                cls.bit_rol(cls.safe_add(cls.safe_add(param2, param1), cls.safe_add(param4, param6)), param5), param3) + +        @classmethod +        def md5_ff(cls, param1, param2, param3, param4, param5, param6, param7): +            return cls.md5_cmn(param2 & param3 | ~param2 & param4, param1, param2, param5, param6, param7) + +        @classmethod +        def md5_gg(cls, param1, param2, param3, param4, param5, param6, param7): +            return cls.md5_cmn(param2 & param4 | param3 & ~param4, param1, param2, param5, param6, param7) + +        @classmethod +        def md5_hh(cls, param1, param2, param3, param4, param5, param6, param7): +            return cls.md5_cmn(param2 ^ param3 ^ param4, param1, param2, param5, param6, param7) + +        @classmethod +        def md5_ii(cls, param1, param2, param3, param4, param5, param6, param7): +            return cls.md5_cmn(param3 ^ (param2 | ~param4), param1, param2, param5, param6, param7) + +        @classmethod +        def safe_add(cls, param1, param2): +            _loc_3 = (param1 & 65535) + (param2 & 65535) +            _loc_4 = (param1 >> 16) + (param2 >> 16) + (_loc_3 >> 16) +            return cls.lshift(_loc_4, 16) | _loc_3 & 65535 + +        @classmethod +        def bit_rol(cls, param1, param2): +            return cls.lshift(param1, param2) | (param1 & 0xFFFFFFFF) >> (32 - param2) + +        @staticmethod +        def lshift(value, count): +            r = (0xFFFFFFFF & value) << count +            return -(~(r - 1) & 0xFFFFFFFF) if r > 0x7FFFFFFF else r + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) +        video_id = self._search_regex(self._VIDEOID_REGEXES, webpage, 'video id') + +        video = self._download_json( +            self._API_URL_TEMPLATE % video_id, video_id)['videos'][0] + +        title = video['title'] +        duration = float_or_none(video['duration'], 1000) +        like_count = video['likes'] +        uploader = video['channel'] +        uploader_id = video['channel_id'] + +        formats = [] + +        for resource in video['resources']: +            resource_id = resource.get('_id') +            if not resource_id: +                continue + +            security = self._download_json( +                self._SECURITY_URL_TEMPLATE % (video_id, resource_id), +                video_id, 'Downloading security hash for %s' % resource_id) + +            security_hash = security.get('hash') +            if not security_hash: +                message = security.get('message') +                if message: +                    raise ExtractorError( +                        '%s returned error: %s' % (self.IE_NAME, message), expected=True) +                continue + +            hash_code = security_hash[:2] +            received_time = int(security_hash[2:12]) +            received_random = security_hash[12:22] +            received_md5 = security_hash[22:] + +            sign_time = received_time + self._RESIGN_EXPIRATION +            padding = '%010d' % random.randint(1, 10000000000) + +            signed_md5 = self.MD5.b64_md5(received_md5 + compat_str(sign_time) + padding) +            signed_hash = hash_code + compat_str(received_time) + received_random + compat_str(sign_time) + padding + signed_md5 + +            formats.append({ +                'url': '%s?h=%s&k=%s' % (resource['url'], signed_hash, 'flash'), +                'format_id': resource_id, +                'height': resource['height'] +            }) + +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': title, +            'duration': duration, +            'uploader': uploader, +            'uploader_id': uploader_id, +            'like_count': like_count, +            'formats': formats +        }
\ No newline at end of file diff --git a/youtube_dl/extractor/godtube.py b/youtube_dl/extractor/godtube.py index 73bd6d890..363dc6608 100644 --- a/youtube_dl/extractor/godtube.py +++ b/youtube_dl/extractor/godtube.py @@ -36,16 +36,16 @@ class GodTubeIE(InfoExtractor):              'http://www.godtube.com/resource/mediaplayer/%s.xml' % video_id.lower(),              video_id, 'Downloading player config XML') -        video_url = config.find('.//file').text -        uploader = config.find('.//author').text -        timestamp = parse_iso8601(config.find('.//date').text) -        duration = parse_duration(config.find('.//duration').text) -        thumbnail = config.find('.//image').text +        video_url = config.find('file').text +        uploader = config.find('author').text +        timestamp = parse_iso8601(config.find('date').text) +        duration = parse_duration(config.find('duration').text) +        thumbnail = config.find('image').text          media = self._download_xml(              'http://www.godtube.com/media/xml/?v=%s' % video_id, video_id, 'Downloading media XML') -        title = media.find('.//title').text +        title = media.find('title').text          return {              'id': video_id, diff --git a/youtube_dl/extractor/golem.py b/youtube_dl/extractor/golem.py new file mode 100644 index 000000000..53714f47f --- /dev/null +++ b/youtube_dl/extractor/golem.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    compat_urlparse, +    determine_ext, +) + + +class GolemIE(InfoExtractor): +    _VALID_URL = r'^https?://video\.golem\.de/.+?/(?P<id>.+?)/' +    _TEST = { +        'url': 'http://video.golem.de/handy/14095/iphone-6-und-6-plus-test.html', +        'md5': 'c1a2c0a3c863319651c7c992c5ee29bf', +        'info_dict': { +            'id': '14095', +            'format_id': 'high', +            'ext': 'mp4', +            'title': 'iPhone 6 und 6 Plus - Test', +            'duration': 300.44, +            'filesize': 65309548, +        } +    } + +    _PREFIX = 'http://video.golem.de' + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        config = self._download_xml( +            'https://video.golem.de/xml/{0}.xml'.format(video_id), video_id) + +        info = { +            'id': video_id, +            'title': config.findtext('./title', 'golem'), +            'duration': self._float(config.findtext('./playtime'), 'duration'), +        } + +        formats = [] +        for e in config: +            url = e.findtext('./url') +            if not url: +                continue + +            formats.append({ +                'format_id': e.tag, +                'url': compat_urlparse.urljoin(self._PREFIX, url), +                'height': self._int(e.get('height'), 'height'), +                'width': self._int(e.get('width'), 'width'), +                'filesize': self._int(e.findtext('filesize'), 'filesize'), +                'ext': determine_ext(e.findtext('./filename')), +            }) +        self._sort_formats(formats) +        info['formats'] = formats + +        thumbnails = [] +        for e in config.findall('.//teaser'): +            url = e.findtext('./url') +            if not url: +                continue +            thumbnails.append({ +                'url': compat_urlparse.urljoin(self._PREFIX, url), +                'width': self._int(e.get('width'), 'thumbnail width'), +                'height': self._int(e.get('height'), 'thumbnail height'), +            }) +        info['thumbnails'] = thumbnails + +        return info diff --git a/youtube_dl/extractor/gorillavid.py b/youtube_dl/extractor/gorillavid.py index ca5f7c417..45cca1d24 100644 --- a/youtube_dl/extractor/gorillavid.py +++ b/youtube_dl/extractor/gorillavid.py @@ -5,6 +5,7 @@ import re  from .common import InfoExtractor  from ..utils import ( +    ExtractorError,      determine_ext,      compat_urllib_parse,      compat_urllib_request, @@ -12,20 +13,22 @@ from ..utils import (  class GorillaVidIE(InfoExtractor): -    IE_DESC = 'GorillaVid.in and daclips.in' +    IE_DESC = 'GorillaVid.in, daclips.in and movpod.in'      _VALID_URL = r'''(?x)          https?://(?P<host>(?:www\.)? -            (?:daclips\.in|gorillavid\.in))/ +            (?:daclips\.in|gorillavid\.in|movpod\.in))/          (?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)?      ''' +    _FILE_NOT_FOUND_REGEX = r'>(?:404 - )?File Not Found<' +      _TESTS = [{          'url': 'http://gorillavid.in/06y9juieqpmi',          'md5': '5ae4a3580620380619678ee4875893ba',          'info_dict': {              'id': '06y9juieqpmi',              'ext': 'flv', -            'title': 'Rebecca Black My Moment Official Music Video Reaction', +            'title': 'Rebecca Black My Moment Official Music Video Reaction-6GK87Rc8bzQ',              'thumbnail': 're:http://.*\.jpg',          },      }, { @@ -46,6 +49,9 @@ class GorillaVidIE(InfoExtractor):              'title': 'Micro Pig piglets ready on 16th July 2009',              'thumbnail': 're:http://.*\.jpg',          }, +    }, { +        'url': 'http://movpod.in/0wguyyxi1yca', +        'only_matching': True,      }]      def _real_extract(self, url): @@ -54,6 +60,9 @@ class GorillaVidIE(InfoExtractor):          webpage = self._download_webpage('http://%s/%s' % (mobj.group('host'), video_id), video_id) +        if re.search(self._FILE_NOT_FOUND_REGEX, webpage) is not None: +            raise ExtractorError('Video %s does not exist' % video_id, expected=True) +          fields = dict(re.findall(r'''(?x)<input\s+              type="hidden"\s+              name="([^"]+)"\s+ @@ -69,14 +78,14 @@ class GorillaVidIE(InfoExtractor):              webpage = self._download_webpage(req, video_id, 'Downloading video page') -        title = self._search_regex(r'style="z-index: [0-9]+;">([0-9a-zA-Z ]+)(?:-.+)?</span>', webpage, 'title') -        thumbnail = self._search_regex(r'image:\'(http[^\']+)\',', webpage, 'thumbnail') -        url = self._search_regex(r'file: \'(http[^\']+)\',', webpage, 'file url') +        title = self._search_regex(r'style="z-index: [0-9]+;">([^<]+)</span>', webpage, 'title') +        video_url = self._search_regex(r'file\s*:\s*\'(http[^\']+)\',', webpage, 'file url') +        thumbnail = self._search_regex(r'image\s*:\s*\'(http[^\']+)\',', webpage, 'thumbnail', fatal=False)          formats = [{              'format_id': 'sd', -            'url': url, -            'ext': determine_ext(url), +            'url': video_url, +            'ext': determine_ext(video_url),              'quality': 1,          }] diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py new file mode 100644 index 000000000..f97b1e085 --- /dev/null +++ b/youtube_dl/extractor/heise.py @@ -0,0 +1,81 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    get_meta_content, +    parse_iso8601, +) + + +class HeiseIE(InfoExtractor): +    _VALID_URL = r'''(?x) +        https?://(?:www\.)?heise\.de/video/artikel/ +        .+?(?P<id>[0-9]+)\.html(?:$|[?#]) +    ''' +    _TEST = { +        'url': ( +            'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html' +        ), +        'md5': 'ffed432483e922e88545ad9f2f15d30e', +        'info_dict': { +            'id': '2404147', +            'ext': 'mp4', +            'title': ( +                "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / Peilsender Smartphone" +            ), +            'format_id': 'mp4_720', +            'timestamp': 1411812600, +            'upload_date': '20140927', +            'description': 'In uplink-Episode 3.3 geht es darum, wie man sich von Cloud-Anbietern emanzipieren kann, worauf man beim Kauf einer Tastatur achten sollte und was Smartphones über uns verraten.', +        } +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) +        json_url = self._search_regex( +            r'json_url:\s*"([^"]+)"', webpage, 'json URL') +        config = self._download_json(json_url, video_id) + +        info = { +            'id': video_id, +            'thumbnail': config.get('poster'), +            'timestamp': parse_iso8601(get_meta_content('date', webpage)), +            'description': self._og_search_description(webpage), +        } + +        title = get_meta_content('fulltitle', webpage) +        if title: +            info['title'] = title +        elif config.get('title'): +            info['title'] = config['title'] +        else: +            info['title'] = self._og_search_title(webpage) + +        formats = [] +        for t, rs in config['formats'].items(): +            if not rs or not hasattr(rs, 'items'): +                self._downloader.report_warning( +                    'formats: {0}: no resolutions'.format(t)) +                continue + +            for height_str, obj in rs.items(): +                format_id = '{0}_{1}'.format(t, height_str) + +                if not obj or not obj.get('url'): +                    self._downloader.report_warning( +                        'formats: {0}: no url'.format(format_id)) +                    continue + +                formats.append({ +                    'url': obj['url'], +                    'format_id': format_id, +                    'height': self._int(height_str, 'height'), +                }) + +        self._sort_formats(formats) +        info['formats'] = formats + +        return info diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index 12e9e61c4..c80185b53 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -89,7 +89,12 @@ class IGNIE(InfoExtractor):                  '<param name="flashvars"[^>]*value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]',                  webpage)              if multiple_urls: -                return [self.url_result(u, ie='IGN') for u in multiple_urls] +                entries = [self.url_result(u, ie='IGN') for u in multiple_urls] +                return { +                    '_type': 'playlist', +                    'id': name_or_id, +                    'entries': entries, +                }          video_id = self._find_video_id(webpage)          result = self._get_video_info(video_id) diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py index 4ddda2f1b..53f9a5f75 100644 --- a/youtube_dl/extractor/internetvideoarchive.py +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals +  import re  from .common import InfoExtractor @@ -12,12 +14,13 @@ class InternetVideoArchiveIE(InfoExtractor):      _VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?'      _TEST = { -        u'url': u'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247', -        u'file': u'452693.mp4', -        u'info_dict': { -            u'title': u'SKYFALL', -            u'description': u'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.', -            u'duration': 153, +        'url': 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247', +        'info_dict': { +            'id': '452693', +            'ext': 'mp4', +            'title': 'SKYFALL', +            'description': 'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.', +            'duration': 149,          },      } @@ -42,7 +45,7 @@ class InternetVideoArchiveIE(InfoExtractor):          url = self._build_url(query)          flashconfiguration = self._download_xml(url, video_id, -            u'Downloading flash configuration') +            'Downloading flash configuration')          file_url = flashconfiguration.find('file').text          file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx')          # Replace some of the parameters in the query to get the best quality @@ -51,7 +54,7 @@ class InternetVideoArchiveIE(InfoExtractor):              lambda m: self._clean_query(m.group()),              file_url)          info = self._download_xml(file_url, video_id, -            u'Downloading video info') +            'Downloading video info')          item = info.find('channel/item')          def _bp(p): diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py index a83dd249f..07ef682ee 100644 --- a/youtube_dl/extractor/izlesene.py +++ b/youtube_dl/extractor/izlesene.py @@ -63,7 +63,8 @@ class IzleseneIE(InfoExtractor):          title = self._og_search_title(webpage)          description = self._og_search_description(webpage) -        thumbnail = self._og_search_thumbnail(webpage) +        thumbnail = self._proto_relative_url( +            self._og_search_thumbnail(webpage), scheme='http:')          uploader = self._html_search_regex(              r"adduserUsername\s*=\s*'([^']+)';", diff --git a/youtube_dl/extractor/jpopsukitv.py b/youtube_dl/extractor/jpopsukitv.py index aad782578..122e2dd8c 100644 --- a/youtube_dl/extractor/jpopsukitv.py +++ b/youtube_dl/extractor/jpopsukitv.py @@ -1,8 +1,6 @@  # coding=utf-8  from __future__ import unicode_literals -import re -  from .common import InfoExtractor  from ..utils import (      int_or_none, @@ -12,14 +10,14 @@ from ..utils import (  class JpopsukiIE(InfoExtractor):      IE_NAME = 'jpopsuki.tv' -    _VALID_URL = r'https?://(?:www\.)?jpopsuki\.tv/video/(.*?)/(?P<id>\S+)' +    _VALID_URL = r'https?://(?:www\.)?jpopsuki\.tv/(?:category/)?video/[^/]+/(?P<id>\S+)'      _TEST = {          'url': 'http://www.jpopsuki.tv/video/ayumi-hamasaki---evolution/00be659d23b0b40508169cdee4545771',          'md5': '88018c0c1a9b1387940e90ec9e7e198e', -        'file': '00be659d23b0b40508169cdee4545771.mp4',          'info_dict': {              'id': '00be659d23b0b40508169cdee4545771', +            'ext': 'mp4',              'title': 'ayumi hamasaki - evolution',              'description': 'Release date: 2001.01.31\r\n浜崎あゆみ - evolution',              'thumbnail': 'http://www.jpopsuki.tv/cache/89722c74d2a2ebe58bcac65321c115b2.jpg', @@ -30,8 +28,7 @@ class JpopsukiIE(InfoExtractor):      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) @@ -47,11 +44,9 @@ class JpopsukiIE(InfoExtractor):          uploader_id = self._html_search_regex(              r'<li>from: <a href="/user/view/user/\S*?/uid/(\d*)',              webpage, 'video uploader_id', fatal=False) -        upload_date = self._html_search_regex( +        upload_date = unified_strdate(self._html_search_regex(              r'<li>uploaded: (.*?)</li>', webpage, 'video upload_date', -            fatal=False) -        if upload_date is not None: -            upload_date = unified_strdate(upload_date) +            fatal=False))          view_count_str = self._html_search_regex(              r'<li>Hits: ([0-9]+?)</li>', webpage, 'video view_count',              fatal=False) diff --git a/youtube_dl/extractor/jukebox.py b/youtube_dl/extractor/jukebox.py index 9b553b9fa..5aa32bf09 100644 --- a/youtube_dl/extractor/jukebox.py +++ b/youtube_dl/extractor/jukebox.py @@ -11,10 +11,9 @@ from ..utils import (  class JukeboxIE(InfoExtractor): -    _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<video_id>[a-z0-9\-]+)\.html' +    _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<id>[a-z0-9\-]+)\.html'      _TEST = {          'url': 'http://www.jukebox.es/kosheen/videoclip,pride,r303r.html', -        'md5': '1574e9b4d6438446d5b7dbcdf2786276',          'info_dict': {              'id': 'r303r',              'ext': 'flv', @@ -24,8 +23,7 @@ class JukeboxIE(InfoExtractor):      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('video_id') +        video_id = self._match_id(url)          html = self._download_webpage(url, video_id)          iframe_url = unescapeHTML(self._search_regex(r'<iframe .*src="([^"]*)"', html, 'iframe url')) diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py new file mode 100644 index 000000000..fca0bfef0 --- /dev/null +++ b/youtube_dl/extractor/lrt.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..utils import ( +    determine_ext, +    js_to_json, +    parse_duration, +    remove_end, +) + + +class LRTIE(InfoExtractor): +    IE_NAME = 'lrt.lt' +    _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)' +    _TEST = { +        'url': 'http://www.lrt.lt/mediateka/irasas/54391/', +        'info_dict': { +            'id': '54391', +            'ext': 'mp4', +            'title': 'Septynios Kauno dienos', +            'description': 'Kauno miesto ir apskrities naujienos', +            'duration': 1783, +        }, +        'params': { +            'skip_download': True,  # HLS download +        }, + +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) + +        title = remove_end(self._og_search_title(webpage), ' - LRT') +        thumbnail = self._og_search_thumbnail(webpage) +        description = self._og_search_description(webpage) +        duration = parse_duration(self._search_regex( +            r"'duration':\s*'([^']+)',", webpage, +            'duration', fatal=False, default=None)) + +        formats = [] +        for js in re.findall(r'(?s)config:\s*(\{.*?\})', webpage): +            data = json.loads(js_to_json(js)) +            if data['provider'] == 'rtmp': +                formats.append({ +                    'format_id': 'rtmp', +                    'ext': determine_ext(data['file']), +                    'url': data['streamer'], +                    'play_path': 'mp4:%s' % data['file'], +                    'preference': -1, +                }) +            else: +                formats.extend( +                    self._extract_m3u8_formats(data['file'], video_id, 'mp4')) + +        return { +            'id': video_id, +            'title': title, +            'formats': formats, +            'thumbnail': thumbnail, +            'description': description, +            'duration': duration, +        } diff --git a/youtube_dl/extractor/muenchentv.py b/youtube_dl/extractor/muenchentv.py index 3a938861b..c7f6beb9c 100644 --- a/youtube_dl/extractor/muenchentv.py +++ b/youtube_dl/extractor/muenchentv.py @@ -1,7 +1,6 @@  # coding: utf-8  from __future__ import unicode_literals -import datetime  import json  from .common import InfoExtractor @@ -23,6 +22,7 @@ class MuenchenTVIE(InfoExtractor):              'ext': 'mp4',              'title': 're:^münchen.tv-Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',              'is_live': True, +            'thumbnail': 're:^https?://.*\.jpg$'          },          'params': {              'skip_download': True, @@ -33,9 +33,7 @@ class MuenchenTVIE(InfoExtractor):          display_id = 'live'          webpage = self._download_webpage(url, display_id) -        now = datetime.datetime.now() -        now_str = now.strftime("%Y-%m-%d %H:%M") -        title = self._og_search_title(webpage) + ' ' + now_str +        title = self._live_title(self._og_search_title(webpage))          data_js = self._search_regex(              r'(?s)\nplaylist:\s*(\[.*?}\]),related:', @@ -73,5 +71,6 @@ class MuenchenTVIE(InfoExtractor):              'title': title,              'formats': formats,              'is_live': True, +            'thumbnail': thumbnail,          } diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py index 963c4587c..cc7c921c3 100644 --- a/youtube_dl/extractor/nfl.py +++ b/youtube_dl/extractor/nfl.py @@ -6,6 +6,7 @@ import re  from .common import InfoExtractor  from ..utils import (      ExtractorError, +    compat_urllib_parse_urlparse,      int_or_none,      remove_end,  ) @@ -13,76 +14,116 @@ from ..utils import (  class NFLIE(InfoExtractor):      IE_NAME = 'nfl.com' -    _VALID_URL = r'(?x)https?://(?:www\.)?nfl\.com/(?:videos/(?:.+)/|.*?\#video=)(?P<id>\d..[0-9]+)' -    _PLAYER_CONFIG_URL = 'http://www.nfl.com/static/content/static/config/video/config.json' -    _TEST = { -        'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights', -        # 'md5': '5eb8c40a727dda106d510e5d6ffa79e5',  # md5 checksum fluctuates -        'info_dict': { -            'id': '0ap3000000398478', -            'ext': 'mp4', -            'title': 'Week 3: Washington Redskins vs. Philadelphia Eagles highlights', -            'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478', -            'upload_date': '20140921', -            'timestamp': 1411337580, -            'thumbnail': 're:^https?://.*\.jpg$', +    _VALID_URL = r'''(?x)https?:// +        (?P<host>(?:www\.)?(?:nfl\.com|.*?\.clubs\.nfl\.com))/ +        (?:.+?/)* +        (?P<id>(?:\d[a-z]{2}\d{13}|\w{8}\-(?:\w{4}\-){3}\w{12}))''' +    _TESTS = [ +        { +            'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights', +            'md5': '394ef771ddcd1354f665b471d78ec4c6', +            'info_dict': { +                'id': '0ap3000000398478', +                'ext': 'mp4', +                'title': 'Week 3: Redskins vs. Eagles highlights', +                'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478', +                'upload_date': '20140921', +                'timestamp': 1411337580, +                'thumbnail': 're:^https?://.*\.jpg$', +            } +        }, +        { +            'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266', +            'md5': 'cf85bdb4bc49f6e9d3816d130c78279c', +            'info_dict': { +                'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266', +                'ext': 'mp4', +                'title': 'LIVE: Post Game vs. Browns', +                'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8', +                'upload_date': '20131229', +                'timestamp': 1388354455, +                'thumbnail': 're:^https?://.*\.jpg$', +            } +        } +    ] + +    @staticmethod +    def prepend_host(host, url): +        if not url.startswith('http'): +            if not url.startswith('/'): +                url = '/%s' % url +            url = 'http://{0:}{1:}'.format(host, url) +        return url + +    @staticmethod +    def format_from_stream(stream, protocol, host, path_prefix='', +                           preference=0, note=None): +        url = '{protocol:}://{host:}/{prefix:}{path:}'.format( +            protocol=protocol, +            host=host, +            prefix=path_prefix, +            path=stream.get('path'), +        ) +        return { +            'url': url, +            'vbr': int_or_none(stream.get('rate', 0), 1000), +            'preference': preference, +            'format_note': note,          } -    }      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id, host = mobj.group('id'), mobj.group('host') -        config = self._download_json(self._PLAYER_CONFIG_URL, video_id, -                                     note='Downloading player config') -        url_template = 'http://nfl.com{contentURLTemplate:s}'.format(**config) -        video_data = self._download_json(url_template.format(id=video_id), video_id) +        webpage = self._download_webpage(url, video_id) -        cdns = config.get('cdns') -        if not cdns: -            raise ExtractorError('Failed to get CDN data', expected=True) +        config_url = NFLIE.prepend_host(host, self._search_regex( +            r'(?:config|configURL)\s*:\s*"([^"]+)"', webpage, 'config URL')) +        config = self._download_json(config_url, video_id, +                                     note='Downloading player config') +        url_template = NFLIE.prepend_host( +            host, '{contentURLTemplate:}'.format(**config)) +        video_data = self._download_json( +            url_template.format(id=video_id), video_id)          formats = [] -        streams = video_data.get('cdnData', {}).get('bitrateInfo', []) -        for name, cdn in cdns.items(): -            # LimeLight streams don't seem to work -            if cdn.get('name') == 'LIMELIGHT': -                continue - -            protocol = cdn.get('protocol') -            host = remove_end(cdn.get('host', ''), '/') -            if not (protocol and host): -                continue - -            path_prefix = cdn.get('pathprefix', '') -            if path_prefix and not path_prefix.endswith('/'): -                path_prefix = '%s/' % path_prefix - -            get_url = lambda p: '{protocol:s}://{host:s}/{prefix:s}{path:}'.format( -                protocol=protocol, -                host=host, -                prefix=path_prefix, -                path=p, -            ) - -            if protocol == 'rtmp': -                preference = -2 -            elif 'prog' in name.lower(): -                preference = -1 -            else: -                preference = 0 - +        cdn_data = video_data.get('cdnData', {}) +        streams = cdn_data.get('bitrateInfo', []) +        if cdn_data.get('format') == 'EXTERNAL_HTTP_STREAM': +            parts = compat_urllib_parse_urlparse(cdn_data.get('uri')) +            protocol, host = parts.scheme, parts.netloc              for stream in streams: -                path = stream.get('path') -                if not path: +                formats.append( +                    NFLIE.format_from_stream(stream, protocol, host)) +        else: +            cdns = config.get('cdns') +            if not cdns: +                raise ExtractorError('Failed to get CDN data', expected=True) + +            for name, cdn in cdns.items(): +                # LimeLight streams don't seem to work +                if cdn.get('name') == 'LIMELIGHT':                      continue -                formats.append({ -                    'url': get_url(path), -                    'vbr': int_or_none(stream.get('rate', 0), 1000), -                    'preference': preference, -                    'format_note': name, -                }) +                protocol = cdn.get('protocol') +                host = remove_end(cdn.get('host', ''), '/') +                if not (protocol and host): +                    continue + +                prefix = cdn.get('pathprefix', '') +                if prefix and not prefix.endswith('/'): +                    prefix = '%s/' % prefix + +                preference = 0 +                if protocol == 'rtmp': +                    preference = -2 +                elif 'prog' in name.lower(): +                    preference = 1 + +                for stream in streams: +                    formats.append( +                        NFLIE.format_from_stream(stream, protocol, host, +                                                 prefix, preference, name))          self._sort_formats(formats) @@ -94,7 +135,7 @@ class NFLIE(InfoExtractor):          return {              'id': video_id, -            'title': video_data.get('storyHeadline'), +            'title': video_data.get('headline'),              'formats': formats,              'description': video_data.get('caption'),              'duration': video_data.get('duration'), diff --git a/youtube_dl/extractor/oktoberfesttv.py b/youtube_dl/extractor/oktoberfesttv.py new file mode 100644 index 000000000..4a41c0542 --- /dev/null +++ b/youtube_dl/extractor/oktoberfesttv.py @@ -0,0 +1,47 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class OktoberfestTVIE(InfoExtractor): +    _VALID_URL = r'https?://www\.oktoberfest-tv\.de/[^/]+/[^/]+/video/(?P<id>[^/?#]+)' + +    _TEST = { +        'url': 'http://www.oktoberfest-tv.de/de/kameras/video/hb-zelt', +        'info_dict': { +            'id': 'hb-zelt', +            'ext': 'mp4', +            'title': 're:^Live-Kamera: Hofbräuzelt [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', +            'thumbnail': 're:^https?://.*\.jpg$', +            'is_live': True, +        }, +        'params': { +            'skip_download': True, +        } +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) + +        title = self._live_title(self._html_search_regex( +            r'<h1><strong>.*?</strong>(.*?)</h1>', webpage, 'title')) + +        clip = self._search_regex( +            r"clip:\s*\{\s*url:\s*'([^']+)'", webpage, 'clip') +        ncurl = self._search_regex( +            r"netConnectionUrl:\s*'([^']+)'", webpage, 'rtmp base') +        video_url = ncurl + clip +        thumbnail = self._search_regex( +            r"canvas:\s*\{\s*backgroundImage:\s*'url\(([^)]+)\)'", webpage, +            'thumbnail', fatal=False) + +        return { +            'id': video_id, +            'title': title, +            'url': video_url, +            'ext': 'mp4', +            'is_live': True, +            'thumbnail': thumbnail, +        } diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 2adfde909..8f140d626 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -4,6 +4,7 @@ import re  from .common import InfoExtractor  from ..utils import ( +    unified_strdate,      US_RATINGS,  ) @@ -11,10 +12,10 @@ from ..utils import (  class PBSIE(InfoExtractor):      _VALID_URL = r'''(?x)https?://          (?: -            # Direct video URL -            video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? | -            # Article with embedded player -           (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+)/?(?:$|[?\#]) | +           # Direct video URL +           video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? | +           # Article with embedded player (or direct video) +           (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |             # Player             video\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/          ) @@ -65,10 +66,25 @@ class PBSIE(InfoExtractor):                  'duration': 6559,                  'thumbnail': 're:^https?://.*\.jpg$',              } +        }, +        { +            'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html', +            'md5': '908f3e5473a693b266b84e25e1cf9703', +            'info_dict': { +                'id': '2365160389', +                'display_id': 'killer-typhoon', +                'ext': 'mp4', +                'description': 'md5:c741d14e979fc53228c575894094f157', +                'title': 'Killer Typhoon', +                'duration': 3172, +                'thumbnail': 're:^https?://.*\.jpg$', +                'upload_date': '20140122', +            }          } +      ] -    def _extract_ids(self, url): +    def _extract_webpage(self, url):          mobj = re.match(self._VALID_URL, url)          presumptive_id = mobj.group('presumptive_id') @@ -76,15 +92,20 @@ class PBSIE(InfoExtractor):          if presumptive_id:              webpage = self._download_webpage(url, display_id) +            upload_date = unified_strdate(self._search_regex( +                r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"', +                webpage, 'upload date', default=None)) +              MEDIA_ID_REGEXES = [                  r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'",  # frontline video embed                  r'class="coveplayerid">([^<]+)<',                       # coveplayer +                r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>',  # jwplayer              ]              media_id = self._search_regex(                  MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None)              if media_id: -                return media_id, presumptive_id +                return media_id, presumptive_id, upload_date              url = self._search_regex(                  r'<iframe\s+(?:class|id)=["\']partnerPlayer["\'].*?\s+src=["\'](.*?)["\']>', @@ -104,10 +125,10 @@ class PBSIE(InfoExtractor):              video_id = mobj.group('id')              display_id = video_id -        return video_id, display_id +        return video_id, display_id, None      def _real_extract(self, url): -        video_id, display_id = self._extract_ids(url) +        video_id, display_id, upload_date = self._extract_webpage(url)          info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id          info = self._download_json(info_url, display_id) @@ -119,6 +140,7 @@ class PBSIE(InfoExtractor):          return {              'id': video_id, +            'display_id': display_id,              'title': info['title'],              'url': info['alternate_encoding']['url'],              'ext': 'mp4', @@ -126,4 +148,5 @@ class PBSIE(InfoExtractor):              'thumbnail': info.get('image_url'),              'duration': info.get('duration'),              'age_limit': age_limit, +            'upload_date': upload_date,          } diff --git a/youtube_dl/extractor/planetaplay.py b/youtube_dl/extractor/planetaplay.py new file mode 100644 index 000000000..596c621d7 --- /dev/null +++ b/youtube_dl/extractor/planetaplay.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class PlanetaPlayIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?planetaplay\.com/\?sng=(?P<id>[0-9]+)' +    _API_URL = 'http://planetaplay.com/action/playlist/?sng={0:}' +    _THUMBNAIL_URL = 'http://planetaplay.com/img/thumb/{thumb:}' +    _TEST = { +        'url': 'http://planetaplay.com/?sng=3586', +        'md5': '9d569dceb7251a4e01355d5aea60f9db', +        'info_dict': { +            'id': '3586', +            'ext': 'flv', +            'title': 'md5:e829428ee28b1deed00de90de49d1da1', +        } +    } + +    _SONG_FORMATS = { +        'lq': (0, 'http://www.planetaplay.com/videoplayback/{med_hash:}'), +        'hq': (1, 'http://www.planetaplay.com/videoplayback/hi/{med_hash:}'), +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        response = self._download_json( +            self._API_URL.format(video_id), video_id)['response'] +        try: +            data = response.get('data')[0] +        except IndexError: +            raise ExtractorError( +                '%s: failed to get the playlist' % self.IE_NAME, expected=True) + +        title = '{song_artists:} - {sng_name:}'.format(**data) +        thumbnail = self._THUMBNAIL_URL.format(**data) + +        formats = [] +        for format_id, (quality, url_template) in self._SONG_FORMATS.items(): +            formats.append({ +                'format_id': format_id, +                'url': url_template.format(**data), +                'quality': quality, +                'ext': 'flv', +            }) + +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': title, +            'formats': formats, +            'thumbnail': thumbnail, +        } diff --git a/youtube_dl/extractor/played.py b/youtube_dl/extractor/played.py new file mode 100644 index 000000000..645a1e06d --- /dev/null +++ b/youtube_dl/extractor/played.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import os.path + +from .common import InfoExtractor +from ..utils import ( +    compat_urllib_parse, +    compat_urllib_request, +) + + +class PlayedIE(InfoExtractor): +    IE_NAME = 'played.to' +    _VALID_URL = r'https?://(?:www\.)?played\.to/(?P<id>[a-zA-Z0-9_-]+)' + +    _TEST = { +        'url': 'http://played.to/j2f2sfiiukgt', +        'md5': 'c2bd75a368e82980e7257bf500c00637', +        'info_dict': { +            'id': 'j2f2sfiiukgt', +            'ext': 'flv', +            'title': 'youtube-dl_test_video.mp4', +        }, +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        orig_webpage = self._download_webpage(url, video_id) +        fields = re.findall( +            r'type="hidden" name="([^"]+)"\s+value="([^"]+)">', orig_webpage) +        data = dict(fields) + +        self._sleep(2, video_id) + +        post = compat_urllib_parse.urlencode(data) +        headers = { +            b'Content-Type': b'application/x-www-form-urlencoded', +        } +        req = compat_urllib_request.Request(url, post, headers) +        webpage = self._download_webpage( +            req, video_id, note='Downloading video page ...') + +        title = os.path.splitext(data['fname'])[0] + +        video_url = self._search_regex( +            r'file: "?(.+?)",', webpage, 'video URL') + +        return { +            'id': video_id, +            'title': title, +            'url': video_url, +        } diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 5b2a723c1..619496de7 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -144,7 +144,7 @@ class ProSiebenSat1IE(InfoExtractor):                  'id': '2156342',                  'ext': 'mp4',                  'title': 'Kurztrips zum Valentinstag', -                'description': 'md5:8ba6301e70351ae0bedf8da00f7ba528', +                'description': 'Romantischer Kurztrip zum Valentinstag? Wir verraten, was sich hier wirklich lohnt.',                  'duration': 307.24,              },              'params': { @@ -180,12 +180,10 @@ class ProSiebenSat1IE(InfoExtractor):      ]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) -        page = self._download_webpage(url, video_id, 'Downloading page') - -        clip_id = self._html_search_regex(self._CLIPID_REGEXES, page, 'clip id') +        clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id')          access_token = 'testclient'          client_name = 'kolibri-1.2.5' @@ -234,12 +232,12 @@ class ProSiebenSat1IE(InfoExtractor):          urls = self._download_json(url_api_url, clip_id, 'Downloading urls JSON') -        title = self._html_search_regex(self._TITLE_REGEXES, page, 'title') -        description = self._html_search_regex(self._DESCRIPTION_REGEXES, page, 'description', fatal=False) -        thumbnail = self._og_search_thumbnail(page) +        title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title') +        description = self._html_search_regex(self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False) +        thumbnail = self._og_search_thumbnail(webpage)          upload_date = unified_strdate(self._html_search_regex( -            self._UPLOAD_DATE_REGEXES, page, 'upload date', default=None)) +            self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None))          formats = [] diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 2007a0013..94602e89e 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -9,7 +9,6 @@ from ..utils import (      compat_urllib_parse,      unified_strdate,      str_to_int, -    int_or_none,  )  from ..aes import aes_decrypt_text @@ -40,31 +39,42 @@ class SpankwireIE(InfoExtractor):          req.add_header('Cookie', 'age_verified=1')          webpage = self._download_webpage(req, video_id) -        title = self._html_search_regex(r'<h1>([^<]+)', webpage, 'title') +        title = self._html_search_regex( +            r'<h1>([^<]+)', webpage, 'title')          description = self._html_search_regex( -            r'<div\s+id="descriptionContent">([^<]+)<', webpage, 'description', fatal=False) +            r'<div\s+id="descriptionContent">([^<]+)<', +            webpage, 'description', fatal=False)          thumbnail = self._html_search_regex( -            r'flashvars\.image_url = "([^"]+)', webpage, 'thumbnail', fatal=False) +            r'playerData\.screenShot\s*=\s*["\']([^"\']+)["\']', +            webpage, 'thumbnail', fatal=False)          uploader = self._html_search_regex( -            r'by:\s*<a [^>]*>(.+?)</a>', webpage, 'uploader', fatal=False) +            r'by:\s*<a [^>]*>(.+?)</a>', +            webpage, 'uploader', fatal=False)          uploader_id = self._html_search_regex( -            r'by:\s*<a href="/Profile\.aspx\?.*?UserId=(\d+).*?"', webpage, 'uploader id', fatal=False) -        upload_date = self._html_search_regex(r'</a> on (.+?) at \d+:\d+', webpage, 'upload date', fatal=False) -        if upload_date: -            upload_date = unified_strdate(upload_date) -         -        view_count = self._html_search_regex( -            r'<div id="viewsCounter"><span>([^<]+)</span> views</div>', webpage, 'view count', fatal=False) -        if view_count: -            view_count = str_to_int(view_count) -        comment_count = int_or_none(self._html_search_regex( -            r'<span id="spCommentCount">\s*(\d+)</span> Comments</div>', webpage, 'comment count', fatal=False)) +            r'by:\s*<a href="/Profile\.aspx\?.*?UserId=(\d+).*?"', +            webpage, 'uploader id', fatal=False) +        upload_date = unified_strdate(self._html_search_regex( +            r'</a> on (.+?) at \d+:\d+', +            webpage, 'upload date', fatal=False)) -        video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'flashvars\.quality_[0-9]{3}p = "([^"]+)', webpage))) +        view_count = str_to_int(self._html_search_regex( +            r'<div id="viewsCounter"><span>([\d,\.]+)</span> views</div>', +            webpage, 'view count', fatal=False)) +        comment_count = str_to_int(self._html_search_regex( +            r'Comments<span[^>]+>\s*\(([\d,\.]+)\)</span>', +            webpage, 'comment count', fatal=False)) + +        video_urls = list(map( +            compat_urllib_parse.unquote, +            re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*["\']([^"\']+)["\']', webpage)))          if webpage.find('flashvars\.encrypted = "true"') != -1: -            password = self._html_search_regex(r'flashvars\.video_title = "([^"]+)', webpage, 'password').replace('+', ' ') -            video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls)) +            password = self._html_search_regex( +                r'flashvars\.video_title = "([^"]+)', +                webpage, 'password').replace('+', ' ') +            video_urls = list(map( +                lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), +                video_urls))          formats = []          for video_url in video_urls: diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py index 185353bef..abb827783 100644 --- a/youtube_dl/extractor/sportdeutschland.py +++ b/youtube_dl/extractor/sportdeutschland.py @@ -17,11 +17,11 @@ class SportDeutschlandIE(InfoExtractor):          'info_dict': {              'id': 'live-li-ning-badminton-weltmeisterschaft-2014-kopenhagen',              'ext': 'mp4', -            'title': 'LIVE: Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen', +            'title': 're:Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen',              'categories': ['Badminton'],              'view_count': int,              'thumbnail': 're:^https?://.*\.jpg$', -            'description': 're:^Die Badminton-WM 2014 aus Kopenhagen LIVE', +            'description': 're:Die Badminton-WM 2014 aus Kopenhagen bei Sportdeutschland\.TV',              'timestamp': int,              'upload_date': 're:^201408[23][0-9]$',          }, diff --git a/youtube_dl/extractor/sunporno.py b/youtube_dl/extractor/sunporno.py index 7de3c9dd5..263f09b46 100644 --- a/youtube_dl/extractor/sunporno.py +++ b/youtube_dl/extractor/sunporno.py @@ -39,10 +39,10 @@ class SunPornoIE(InfoExtractor):              r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False)          duration = parse_duration(self._search_regex( -            r'<span>Duration: (\d+:\d+)</span>', webpage, 'duration', fatal=False)) +            r'Duration:\s*(\d+:\d+)\s*<', webpage, 'duration', fatal=False))          view_count = int_or_none(self._html_search_regex( -            r'<span class="views">(\d+)</span>', webpage, 'view count', fatal=False)) +            r'class="views">\s*(\d+)\s*<', webpage, 'view count', fatal=False))          comment_count = int_or_none(self._html_search_regex(              r'(\d+)</b> Comments?', webpage, 'comment count', fatal=False)) diff --git a/youtube_dl/extractor/tapely.py b/youtube_dl/extractor/tapely.py new file mode 100644 index 000000000..77e056242 --- /dev/null +++ b/youtube_dl/extractor/tapely.py @@ -0,0 +1,104 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    clean_html, +    compat_urllib_request, +    float_or_none, +    parse_iso8601, +) + + +class TapelyIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?tape\.ly/(?P<id>[A-Za-z0-9\-_]+)(?:/(?P<songnr>\d+))?' +    _API_URL = 'http://tape.ly/showtape?id={0:}' +    _S3_SONG_URL = 'http://mytape.s3.amazonaws.com/{0:}' +    _SOUNDCLOUD_SONG_URL = 'http://api.soundcloud.com{0:}' +    _TESTS = [ +        { +            'url': 'http://tape.ly/my-grief-as-told-by-water', +            'info_dict': { +                'id': 23952, +                'title': 'my grief as told by water', +                'thumbnail': 're:^https?://.*\.png$', +                'uploader_id': 16484, +                'timestamp': 1411848286, +                'description': 'For Robin and Ponkers, whom the tides of life have taken out to sea.', +            }, +            'playlist_count': 13, +        }, +        { +            'url': 'http://tape.ly/my-grief-as-told-by-water/1', +            'md5': '79031f459fdec6530663b854cbc5715c', +            'info_dict': { +                'id': 258464, +                'title': 'Dreaming Awake  (My Brightest Diamond)', +                'ext': 'm4a', +            }, +        }, +    ] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        display_id = mobj.group('id') + +        playlist_url = self._API_URL.format(display_id) +        request = compat_urllib_request.Request(playlist_url) +        request.add_header('X-Requested-With', 'XMLHttpRequest') +        request.add_header('Accept', 'application/json') + +        playlist = self._download_json(request, display_id) + +        tape = playlist['tape'] + +        entries = [] +        for s in tape['songs']: +            song = s['song'] +            entry = { +                'id': song['id'], +                'duration': float_or_none(song.get('songduration'), 1000), +                'title': song['title'], +            } +            if song['source'] == 'S3': +                entry.update({ +                    'url': self._S3_SONG_URL.format(song['filename']), +                }) +                entries.append(entry) +            elif song['source'] == 'YT': +                self.to_screen('YouTube video detected') +                yt_id = song['filename'].replace('/youtube/', '') +                entry.update(self.url_result(yt_id, 'Youtube', video_id=yt_id)) +                entries.append(entry) +            elif song['source'] == 'SC': +                self.to_screen('SoundCloud song detected') +                sc_url = self._SOUNDCLOUD_SONG_URL.format(song['filename']) +                entry.update(self.url_result(sc_url, 'Soundcloud')) +                entries.append(entry) +            else: +                self.report_warning('Unknown song source: %s' % song['source']) + +        if mobj.group('songnr'): +            songnr = int(mobj.group('songnr')) - 1 +            try: +                return entries[songnr] +            except IndexError: +                raise ExtractorError( +                    'No song with index: %s' % mobj.group('songnr'), +                    expected=True) + +        return { +            '_type': 'playlist', +            'id': tape['id'], +            'display_id': display_id, +            'title': tape['name'], +            'entries': entries, +            'thumbnail': tape.get('image_url'), +            'description': clean_html(tape.get('subtext')), +            'like_count': tape.get('likescount'), +            'uploader_id': tape.get('user_id'), +            'timestamp': parse_iso8601(tape.get('published_at')), +        } diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 1cca47771..d5e28efad 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -149,7 +149,7 @@ class TEDIE(SubtitlesInfoExtractor):              thumbnail = 'http://' + thumbnail          return {              'id': video_id, -            'title': talk_info['title'], +            'title': talk_info['title'].strip(),              'uploader': talk_info['speaker'],              'thumbnail': thumbnail,              'description': self._og_search_description(webpage), diff --git a/youtube_dl/extractor/thesixtyone.py b/youtube_dl/extractor/thesixtyone.py new file mode 100644 index 000000000..a77c6a2fc --- /dev/null +++ b/youtube_dl/extractor/thesixtyone.py @@ -0,0 +1,100 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import unified_strdate + + +class TheSixtyOneIE(InfoExtractor): +    _VALID_URL = r'''(?x)https?://(?:www\.)?thesixtyone\.com/ +        (?:.*?/)* +        (?: +            s| +            song/comments/list| +            song +        )/(?P<id>[A-Za-z0-9]+)/?$''' +    _SONG_URL_TEMPLATE = 'http://thesixtyone.com/s/{0:}' +    _SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}.thesixtyone.com/thesixtyone_production/audio/{0:}_stream' +    _THUMBNAIL_URL_TEMPLATE = '{photo_base_url:}_desktop' +    _TESTS = [ +        { +            'url': 'http://www.thesixtyone.com/s/SrE3zD7s1jt/', +            'md5': '821cc43b0530d3222e3e2b70bb4622ea', +            'info_dict': { +                'id': 'SrE3zD7s1jt', +                'ext': 'mp3', +                'title': 'CASIO - Unicorn War Mixtape', +                'thumbnail': 're:^https?://.*_desktop$', +                'upload_date': '20071217', +                'duration': 3208, +            } +        }, +        { +            'url': 'http://www.thesixtyone.com/song/comments/list/SrE3zD7s1jt', +            'only_matching': True, +        }, +        { +            'url': 'http://www.thesixtyone.com/s/ULoiyjuJWli#/s/SrE3zD7s1jt/', +            'only_matching': True, +        }, +        { +            'url': 'http://www.thesixtyone.com/#/s/SrE3zD7s1jt/', +            'only_matching': True, +        }, +        { +            'url': 'http://www.thesixtyone.com/song/SrE3zD7s1jt/', +            'only_matching': True, +        }, +    ] + +    _DECODE_MAP = { +        "x": "a", +        "m": "b", +        "w": "c", +        "q": "d", +        "n": "e", +        "p": "f", +        "a": "0", +        "h": "1", +        "e": "2", +        "u": "3", +        "s": "4", +        "i": "5", +        "o": "6", +        "y": "7", +        "r": "8", +        "c": "9" +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        song_id = mobj.group('id') + +        webpage = self._download_webpage( +            self._SONG_URL_TEMPLATE.format(song_id), song_id) + +        song_data = json.loads(self._search_regex( +            r'"%s":\s(\{.*?\})' % song_id, webpage, 'song_data')) +        keys = [self._DECODE_MAP.get(s, s) for s in song_data['key']] +        url = self._SONG_FILE_URL_TEMPLATE.format( +            "".join(reversed(keys)), **song_data) + +        formats = [{ +            'format_id': 'sd', +            'url': url, +            'ext': 'mp3', +        }] + +        return { +            'id': song_id, +            'title': '{artist:} - {name:}'.format(**song_data), +            'formats': formats, +            'comment_count': song_data.get('comments_count'), +            'duration': song_data.get('play_time'), +            'like_count': song_data.get('score'), +            'thumbnail': self._THUMBNAIL_URL_TEMPLATE.format(**song_data), +            'upload_date': unified_strdate(song_data.get('publish_date')), +        } diff --git a/youtube_dl/extractor/thvideo.py b/youtube_dl/extractor/thvideo.py index 607e947bb..496f15d80 100644 --- a/youtube_dl/extractor/thvideo.py +++ b/youtube_dl/extractor/thvideo.py @@ -26,8 +26,7 @@ class THVideoIE(InfoExtractor):      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url)          # extract download link from mobile player page          webpage_player = self._download_webpage( @@ -57,3 +56,29 @@ class THVideoIE(InfoExtractor):              'description': description,              'upload_date': upload_date          } + + +class THVideoPlaylistIE(InfoExtractor): +    _VALID_URL = r'http?://(?:www\.)?thvideo\.tv/mylist(?P<id>[0-9]+)' +    _TEST = { +        'url': 'http://thvideo.tv/mylist2', +        'info_dict': { +            'id': '2', +            'title': '幻想万華鏡', +        }, +        'playlist_mincount': 23, +    } + +    def _real_extract(self, url): +        playlist_id = self._match_id(url) + +        webpage = self._download_webpage(url, playlist_id) +        list_title = self._html_search_regex( +            r'<h1 class="show_title">(.*?)<b id', webpage, 'playlist title', +            fatal=False) + +        entries = [ +            self.url_result('http://thvideo.tv/v/th' + id, 'THVideo') +            for id in re.findall(r'<dd><a href="http://thvideo.tv/v/th(\d+)/" target=', webpage)] + +        return self.playlist_result(entries, playlist_id, list_title) diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py index dc8697850..27962b5fe 100644 --- a/youtube_dl/extractor/tvigle.py +++ b/youtube_dl/extractor/tvigle.py @@ -17,16 +17,16 @@ class TvigleIE(InfoExtractor):      _TESTS = [          { -            'url': 'http://www.tvigle.ru/video/brat-2/', -            'md5': '72cb7eab33e54314e1790da402d3c9c3', +            'url': 'http://www.tvigle.ru/video/brat/', +            'md5': 'ff4344a4894b0524441fb6f8218dc716',              'info_dict': { -                'id': '5119390', -                'display_id': 'brat-2', +                'id': '5118490', +                'display_id': 'brat',                  'ext': 'mp4', -                'title': 'Брат 2 ', -                'description': 'md5:5751f4fe345a58e1692585c361294bd8', -                'duration': 7356.369, -                'age_limit': 0, +                'title': 'Брат', +                'description': 'md5:d16ac7c0b47052ea51fddb92c4e413eb', +                'duration': 5722.6, +                'age_limit': 16,              },          },          { @@ -71,6 +71,7 @@ class TvigleIE(InfoExtractor):                      'format_id': '%s-%s' % (vcodec, quality),                      'vcodec': vcodec,                      'height': int(quality[:-1]), +                    'filesize': item['video_files_size'][vcodec][quality],                  })          self._sort_formats(formats) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index ebab8b86c..5b1a3ec78 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -5,7 +5,6 @@ import xml.etree.ElementTree  from .common import InfoExtractor  from ..utils import ( -    compat_HTTPError,      compat_urllib_request,      ExtractorError,  ) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 7d27d6c57..964470070 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -31,7 +31,7 @@ class VGTVIE(InfoExtractor):              'url': 'http://www.vgtv.no/#!/live/100764/opptak-vgtv-foelger-em-kvalifiseringen',              'info_dict': {                  'id': '100764', -                'ext': 'mp4', +                'ext': 'flv',                  'title': 'OPPTAK: VGTV følger EM-kvalifiseringen',                  'description': 'md5:3772d9c0dc2dff92a886b60039a7d4d3',                  'thumbnail': 're:^https?://.*\.jpg', @@ -50,7 +50,7 @@ class VGTVIE(InfoExtractor):              'url': 'http://www.vgtv.no/#!/live/100015/direkte-her-kan-du-se-laksen-live-fra-suldalslaagen',              'info_dict': {                  'id': '100015', -                'ext': 'mp4', +                'ext': 'flv',                  'title': 'DIREKTE: Her kan du se laksen live fra Suldalslågen!',                  'description': 'md5:9a60cc23fa349f761628924e56eeec2d',                  'thumbnail': 're:^https?://.*\.jpg', diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index bc01d7fbf..d2c36b58a 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -8,17 +8,19 @@ import itertools  from .common import InfoExtractor  from .subtitles import SubtitlesInfoExtractor  from ..utils import ( +    clean_html,      compat_HTTPError,      compat_urllib_parse,      compat_urllib_request, -    clean_html, -    get_element_by_attribute, +    compat_urlparse,      ExtractorError, +    get_element_by_attribute, +    InAdvancePagedList, +    int_or_none,      RegexNotFoundError,      std_headers,      unsmuggle_url,      urlencode_postdata, -    int_or_none,  ) @@ -89,6 +91,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):                  'uploader_id': 'openstreetmapus',                  'uploader': 'OpenStreetMap US',                  'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', +                'description': 'md5:380943ec71b89736ff4bf27183233d09',                  'duration': 1595,              },          }, @@ -103,6 +106,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):                  'uploader': 'The BLN & Business of Software',                  'uploader_id': 'theblnbusinessofsoftware',                  'duration': 3610, +                'description': None,              },          },          { @@ -117,6 +121,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):                  'uploader_id': 'user18948128',                  'uploader': 'Jaime Marquínez Ferrándiz',                  'duration': 10, +                'description': 'This is "youtube-dl password protected test video" by Jaime Marquínez Ferrándiz on Vimeo, the home for high quality videos and the people who love them.',              },              'params': {                  'videopassword': 'youtube-dl', @@ -203,6 +208,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):          # Extract ID from URL          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id') +        orig_url = url          if mobj.group('pro') or mobj.group('player'):              url = 'http://player.vimeo.com/video/' + video_id @@ -273,18 +279,23 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):                  _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1]          # Extract video description -        video_description = None -        try: -            video_description = get_element_by_attribute("class", "description_wrapper", webpage) -            if video_description: -                video_description = clean_html(video_description) -        except AssertionError as err: -            # On some pages like (http://player.vimeo.com/video/54469442) the -            # html tags are not closed, python 2.6 cannot handle it -            if err.args[0] == 'we should not get here!': -                pass -            else: -                raise + +        video_description = self._html_search_regex( +            r'(?s)<div\s+class="[^"]*description[^"]*"[^>]*>(.*?)</div>', +            webpage, 'description', default=None) +        if not video_description: +            video_description = self._html_search_meta( +                'description', webpage, default=None) +        if not video_description and mobj.group('pro'): +            orig_webpage = self._download_webpage( +                orig_url, video_id, +                note='Downloading webpage for description', +                fatal=False) +            if orig_webpage: +                video_description = self._html_search_meta( +                    'description', orig_webpage, default=None) +        if not video_description and not mobj.group('player'): +            self._downloader.report_warning('Cannot find video description')          # Extract video duration          video_duration = int_or_none(config["video"].get("duration")) @@ -529,3 +540,58 @@ class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE):      def _real_extract(self, url):          return self._extract_videos('watchlater', 'https://vimeo.com/home/watchlater') + + +class VimeoLikesIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes/?(?:$|[?#]|sort:)' +    IE_NAME = 'vimeo:likes' +    IE_DESC = 'Vimeo user likes' +    _TEST = { +        'url': 'https://vimeo.com/user755559/likes/', +        'playlist_mincount': 293, +        "info_dict": { +            "description": "See all the videos urza likes", +            "title": 'Videos urza likes', +        }, +    } + +    def _real_extract(self, url): +        user_id = self._match_id(url) +        webpage = self._download_webpage(url, user_id) +        page_count = self._int( +            self._search_regex( +                r'''(?x)<li><a\s+href="[^"]+"\s+data-page="([0-9]+)"> +                    .*?</a></li>\s*<li\s+class="pagination_next"> +                ''', webpage, 'page count'), +            'page count', fatal=True) +        PAGE_SIZE = 12 +        title = self._html_search_regex( +            r'(?s)<h1>(.+?)</h1>', webpage, 'title', fatal=False) +        description = self._html_search_meta('description', webpage) + +        def _get_page(idx): +            page_url = '%s//vimeo.com/user%s/likes/page:%d/sort:date' % ( +                self.http_scheme(), user_id, idx + 1) +            webpage = self._download_webpage( +                page_url, user_id, +                note='Downloading page %d/%d' % (idx + 1, page_count)) +            video_list = self._search_regex( +                r'(?s)<ol class="js-browse_list[^"]+"[^>]*>(.*?)</ol>', +                webpage, 'video content') +            paths = re.findall( +                r'<li[^>]*>\s*<a\s+href="([^"]+)"', video_list) +            for path in paths: +                yield { +                    '_type': 'url', +                    'url': compat_urlparse.urljoin(page_url, path), +                } + +        pl = InAdvancePagedList(_get_page, page_count, PAGE_SIZE) + +        return { +            '_type': 'playlist', +            'id': 'user%s_likes' % user_id, +            'title': title, +            'description': description, +            'entries': pl, +        } diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py index fb0600f1a..ec3c010ad 100644 --- a/youtube_dl/extractor/vuclip.py +++ b/youtube_dl/extractor/vuclip.py @@ -5,6 +5,7 @@ import re  from .common import InfoExtractor  from ..utils import (      compat_urllib_parse_urlparse, +    ExtractorError,      parse_duration,      qualities,  ) @@ -14,13 +15,12 @@ class VuClipIE(InfoExtractor):      _VALID_URL = r'http://(?:m\.)?vuclip\.com/w\?.*?cid=(?P<id>[0-9]+)'      _TEST = { -        'url': 'http://m.vuclip.com/w?cid=843902317&fid=63532&z=1007&nvar&frm=index.html&bu=4757321434', -        'md5': '92ac9d1ccefec4f0bb474661ab144fcf', +        'url': 'http://m.vuclip.com/w?cid=922692425&fid=70295&z=1010&nvar&frm=index.html',          'info_dict': { -            'id': '843902317', +            'id': '922692425',              'ext': '3gp', -            'title': 'Movie Trailer: Noah', -            'duration': 139, +            'title': 'The Toy Soldiers - Hollywood Movie Trailer', +            'duration': 180,          }      } @@ -37,16 +37,32 @@ class VuClipIE(InfoExtractor):              webpage = self._download_webpage(                  adfree_url, video_id, note='Download post-ad page') +        error_msg = self._html_search_regex( +            r'<p class="message">(.*?)</p>', webpage, 'error message', +            default=None) +        if error_msg: +            raise ExtractorError( +                '%s said: %s' % (self.IE_NAME, error_msg), expected=True) + +        # These clowns alternate between two page types          links_code = self._search_regex( -            r'(?s)<div class="social align_c".*?>(.*?)<hr\s*/?>', webpage, -            'links') +            r'''(?xs) +                (?: +                    <img\s+src="/im/play.gif".*?>| +                    <!--\ player\ end\ -->\s*</div><!--\ thumb\ end--> +                ) +                (.*?) +                (?: +                    <a\s+href="fblike|<div\s+class="social"> +                ) +            ''', webpage, 'links')          title = self._html_search_regex(              r'<title>(.*?)-\s*Vuclip</title>', webpage, 'title').strip()          quality_order = qualities(['Reg', 'Hi'])          formats = []          for url, q in re.findall( -                r'<a href="(?P<url>[^"]+)".*?>(?P<q>[^<]+)</a>', links_code): +                r'<a\s+href="(?P<url>[^"]+)".*?>(?:<button[^>]*>)?(?P<q>[^<]+)(?:</button>)?</a>', links_code):              format_id = compat_urllib_parse_urlparse(url).scheme + '-' + q              formats.append({                  'format_id': format_id, @@ -56,7 +72,7 @@ class VuClipIE(InfoExtractor):          self._sort_formats(formats)          duration = parse_duration(self._search_regex( -            r'\(([0-9:]+)\)</span></h1>', webpage, 'duration', fatal=False)) +            r'\(([0-9:]+)\)</span>', webpage, 'duration', fatal=False))          return {              'id': video_id, diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py index 4e89acd81..bda3870db 100644 --- a/youtube_dl/extractor/worldstarhiphop.py +++ b/youtube_dl/extractor/worldstarhiphop.py @@ -13,37 +13,35 @@ class WorldStarHipHopIE(InfoExtractor):          "info_dict": {              "id": "wshh6a7q1ny0G34ZwuIO",              "ext": "mp4", -            "title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!" +            "title": "KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!"          }      }      def _real_extract(self, url): -        m = re.match(self._VALID_URL, url) -        video_id = m.group('id') +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) -        webpage_src = self._download_webpage(url, video_id) - -        m_vevo_id = re.search(r'videoId=(.*?)&?', -                              webpage_src) +        m_vevo_id = re.search(r'videoId=(.*?)&?', webpage)          if m_vevo_id is not None:              return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo')          video_url = self._search_regex( -            r'so\.addVariable\("file","(.*?)"\)', webpage_src, 'video URL') +            r'so\.addVariable\("file","(.*?)"\)', webpage, 'video URL')          if 'youtube' in video_url:              return self.url_result(video_url, ie='Youtube')          video_title = self._html_search_regex( -            r"<title>(.*)</title>", webpage_src, 'title') +            r'(?s)<div class="content-heading">\s*<h1>(.*?)</h1>', +            webpage, 'title')          # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.          thumbnail = self._html_search_regex( -            r'rel="image_src" href="(.*)" />', webpage_src, 'thumbnail', +            r'rel="image_src" href="(.*)" />', webpage, 'thumbnail',              fatal=False)          if not thumbnail: -            _title = r"""candytitles.*>(.*)</span>""" -            mobj = re.search(_title, webpage_src) +            _title = r'candytitles.*>(.*)</span>' +            mobj = re.search(_title, webpage)              if mobj is not None:                  video_title = mobj.group(1) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 3ab6017cd..221341c13 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -38,16 +38,6 @@ class YahooIE(InfoExtractor):              },          },          { -            'url': 'https://movies.yahoo.com/video/world-loves-spider-man-190819223.html', -            'md5': '410b7104aa9893b765bc22787a22f3d9', -            'info_dict': { -                'id': '516ed8e2-2c4f-339f-a211-7a8b49d30845', -                'ext': 'mp4', -                'title': 'The World Loves Spider-Man', -                'description': '''People all over the world are celebrating the release of \"The Amazing Spider-Man 2.\" We're taking a look at the enthusiastic response Spider-Man has received from viewers all over the world.''', -            } -        }, -        {              'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed',              'md5': '60e8ac193d8fb71997caa8fce54c6460',              'info_dict': { diff --git a/youtube_dl/extractor/ynet.py b/youtube_dl/extractor/ynet.py index 24872861a..944d7da38 100644 --- a/youtube_dl/extractor/ynet.py +++ b/youtube_dl/extractor/ynet.py @@ -13,7 +13,7 @@ class YnetIE(InfoExtractor):      _TESTS = [          {              'url': 'http://hot.ynet.co.il/home/0,7340,L-11659-99244,00.html', -            'md5': '002b44ee2f33d50363a1c153bed524cf', +            'md5': '4b29cb57c3dddd57642b3f051f535b07',              'info_dict': {                  'id': 'L-11659-99244',                  'ext': 'flv', @@ -22,7 +22,7 @@ class YnetIE(InfoExtractor):              }          }, {              'url': 'http://hot.ynet.co.il/home/0,7340,L-8859-84418,00.html', -            'md5': '6455046ae1b48cf7e2b7cae285e53a16', +            'md5': '8194c2ea221e9a639cac96b6b0753dc5',              'info_dict': {                  'id': 'L-8859-84418',                  'ext': 'flv', @@ -33,9 +33,7 @@ class YnetIE(InfoExtractor):      ]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') -         +        video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id)          content = compat_urllib_parse.unquote_plus(self._og_search_video_url(webpage)) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 07ed7cbd1..48d47a245 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -2,7 +2,6 @@  from __future__ import unicode_literals -import json  import math  import random  import re diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 99198e380..9041cfa87 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -26,7 +26,7 @@ from ..utils import (      get_element_by_attribute,      ExtractorError,      int_or_none, -    PagedList, +    OnDemandPagedList,      unescapeHTML,      unified_strdate,      orderedSet, @@ -655,6 +655,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          # Get video webpage          url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id +        pref_cookies = [ +            c for c in self._downloader.cookiejar +            if c.domain == '.youtube.com' and c.name == 'PREF'] +        for pc in pref_cookies: +            if 'hl=' in pc.value: +                pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value) +            else: +                if pc.value: +                    pc.value += '&' +                pc.value += 'hl=en'          video_webpage = self._download_webpage(url, video_id)          # Attempt to extract SWF player URL @@ -1341,7 +1351,7 @@ class YoutubeUserIE(InfoExtractor):                      'id': video_id,                      'title': title,                  } -        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE) +        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)          return self.playlist_result(url_results, playlist_title=username) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 44dcb1e34..f651337ad 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -87,7 +87,7 @@ def parseOpts(overrideArguments=None):          for private_opt in ['-p', '--password', '-u', '--username', '--video-password']:              try:                  i = opts.index(private_opt) -                opts[i+1] = '<PRIVATE>' +                opts[i+1] = 'PRIVATE'              except ValueError:                  pass          return opts diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b644f4e92..d7ae5a90a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -673,6 +673,8 @@ class ExtractorError(Exception):              expected = True          if video_id is not None:              msg = video_id + ': ' + msg +        if cause: +            msg += u' (caused by %r)' % cause          if not expected:              msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'          super(ExtractorError, self).__init__(msg) @@ -799,6 +801,12 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):                  del req.headers['User-agent']              req.headers['User-agent'] = req.headers['Youtubedl-user-agent']              del req.headers['Youtubedl-user-agent'] + +        if sys.version_info < (2, 7) and '#' in req.get_full_url(): +            # Python 2.6 is brain-dead when it comes to fragments +            req._Request__original = req._Request__original.partition('#')[0] +            req._Request__r_type = req._Request__r_type.partition('#')[0] +          return req      def http_response(self, req, resp): @@ -884,7 +892,9 @@ def unified_strdate(date_str):          '%d/%m/%Y',          '%d/%m/%y',          '%Y/%m/%d %H:%M:%S', +        '%d/%m/%Y %H:%M:%S',          '%Y-%m-%d %H:%M:%S', +        '%Y-%m-%d %H:%M:%S.%f',          '%d.%m.%Y %H:%M',          '%d.%m.%Y %H.%M',          '%Y-%m-%dT%H:%M:%SZ', @@ -1384,14 +1394,16 @@ def check_executable(exe, args=[]):  class PagedList(object): -    def __init__(self, pagefunc, pagesize): -        self._pagefunc = pagefunc -        self._pagesize = pagesize -      def __len__(self):          # This is only useful for tests          return len(self.getslice()) + +class OnDemandPagedList(PagedList): +    def __init__(self, pagefunc, pagesize): +        self._pagefunc = pagefunc +        self._pagesize = pagesize +      def getslice(self, start=0, end=None):          res = []          for pagenum in itertools.count(start // self._pagesize): @@ -1430,6 +1442,35 @@ class PagedList(object):          return res +class InAdvancePagedList(PagedList): +    def __init__(self, pagefunc, pagecount, pagesize): +        self._pagefunc = pagefunc +        self._pagecount = pagecount +        self._pagesize = pagesize + +    def getslice(self, start=0, end=None): +        res = [] +        start_page = start // self._pagesize +        end_page = ( +            self._pagecount if end is None else (end // self._pagesize + 1)) +        skip_elems = start - start_page * self._pagesize +        only_more = None if end is None else end - start +        for pagenum in range(start_page, end_page): +            page = list(self._pagefunc(pagenum)) +            if skip_elems: +                page = page[skip_elems:] +                skip_elems = None +            if only_more is not None: +                if len(page) < only_more: +                    only_more -= len(page) +                else: +                    page = page[:only_more] +                    res.extend(page) +                    break +            res.extend(page) +        return res + +  def uppercase_escape(s):      unicode_escape = codecs.getdecoder('unicode_escape')      return re.sub( @@ -1534,33 +1575,37 @@ US_RATINGS = {  } +def parse_age_limit(s): +    if s is None: +        return None +    m = re.match(r'^(?P<age>\d{1,2})\+?$', s) +    return int(m.group('age')) if m else US_RATINGS.get(s, None) + +  def strip_jsonp(code):      return re.sub(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code)  def js_to_json(code):      def fix_kv(m): -        key = m.group(2) -        if key.startswith("'"): -            assert key.endswith("'") -            assert '"' not in key -            key = '"%s"' % key[1:-1] -        elif not key.startswith('"'): -            key = '"%s"' % key - -        value = m.group(4) -        if value.startswith("'"): -            assert value.endswith("'") -            assert '"' not in value -            value = '"%s"' % value[1:-1] - -        return m.group(1) + key + m.group(3) + value +        v = m.group(0) +        if v in ('true', 'false', 'null'): +            return v +        if v.startswith('"'): +            return v +        if v.startswith("'"): +            v = v[1:-1] +            v = re.sub(r"\\\\|\\'|\"", lambda m: { +                '\\\\': '\\\\', +                "\\'": "'", +                '"': '\\"', +            }[m.group(0)], v) +        return '"%s"' % v      res = re.sub(r'''(?x) -            ([{,]\s*) -            ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+) -            (:\s*) -            ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{) +        "(?:[^"\\]*(?:\\\\|\\")?)*"| +        '(?:[^'\\]*(?:\\\\|\\')?)*'| +        [a-zA-Z_][a-zA-Z_0-9]*          ''', fix_kv, code)      res = re.sub(r',(\s*\])', lambda m: m.group(1), res)      return res diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c17701d6a..4f0d486b9 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.09.25' +__version__ = '2014.10.05.2'  | 
