diff options
Diffstat (limited to 'youtube_dl')
102 files changed, 3653 insertions, 1839 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 9a8c7da05..50425b8d7 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -47,7 +47,9 @@ from .utils import (      DEFAULT_OUTTMPL,      determine_ext,      DownloadError, +    encode_compat_str,      encodeFilename, +    error_to_compat_str,      ExtractorError,      format_bytes,      formatSeconds, @@ -495,7 +497,7 @@ class YoutubeDL(object):                      tb = ''                      if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:                          tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info)) -                    tb += compat_str(traceback.format_exc()) +                    tb += encode_compat_str(traceback.format_exc())                  else:                      tb_data = traceback.format_list(traceback.extract_stack())                      tb = ''.join(tb_data) @@ -674,14 +676,14 @@ class YoutubeDL(object):                      return self.process_ie_result(ie_result, download, extra_info)                  else:                      return ie_result -            except ExtractorError as de:  # An error we somewhat expected -                self.report_error(compat_str(de), de.format_traceback()) +            except ExtractorError as e:  # An error we somewhat expected +                self.report_error(compat_str(e), e.format_traceback())                  break              except MaxDownloadsReached:                  raise              except Exception as e:                  if self.params.get('ignoreerrors', False): -                    self.report_error(compat_str(e), tb=compat_str(traceback.format_exc())) +                    self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))                      break                  else:                      raise @@ -1110,6 +1112,12 @@ class YoutubeDL(object):                                            'contain the video, try 
using '                                            '"-f %s+%s"' % (format_2, format_1))                          return +                    # Formats must be opposite (video+audio) +                    if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none': +                        self.report_error( +                            'Both formats %s and %s are video-only, you must specify "-f video+audio"' +                            % (format_1, format_2)) +                        return                      output_ext = (                          formats_info[0]['ext']                          if self.params.get('merge_output_format') is None @@ -1453,7 +1461,7 @@ class YoutubeDL(object):              if dn and not os.path.exists(dn):                  os.makedirs(dn)          except (OSError, IOError) as err: -            self.report_error('unable to create directory ' + compat_str(err)) +            self.report_error('unable to create directory ' + error_to_compat_str(err))              return          if self.params.get('writedescription', False): @@ -1504,7 +1512,7 @@ class YoutubeDL(object):                              sub_info['url'], info_dict['id'], note=False)                      except ExtractorError as err:                          self.report_warning('Unable to download subtitle for "%s": %s' % -                                            (sub_lang, compat_str(err.cause))) +                                            (sub_lang, error_to_compat_str(err.cause)))                          continue                  try:                      sub_filename = subtitles_filename(filename, sub_lang, sub_format) @@ -2033,4 +2041,4 @@ class YoutubeDL(object):                                     (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))                  except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:                      self.report_warning('Unable to 
download thumbnail "%s": %s' % -                                        (t['url'], compat_str(err))) +                                        (t['url'], error_to_compat_str(err))) diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index b8bf8daf8..beae8c4d0 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -5,9 +5,9 @@ import re  import sys  import time -from ..compat import compat_str  from ..utils import (      encodeFilename, +    error_to_compat_str,      decodeArgument,      format_bytes,      timeconvert, @@ -186,7 +186,7 @@ class FileDownloader(object):                  return              os.rename(encodeFilename(old_filename), encodeFilename(new_filename))          except (IOError, OSError) as err: -            self.report_error('unable to rename file: %s' % compat_str(err)) +            self.report_error('unable to rename file: %s' % error_to_compat_str(err))      def try_utime(self, filename, last_modified_hdr):          """Try to set the last-modified time of the given file.""" diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 6170cc155..aaf0c49c8 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -15,6 +15,7 @@ from ..compat import (  )  from ..utils import (      encodeFilename, +    fix_xml_ampersands,      sanitize_open,      struct_pack,      struct_unpack, @@ -288,7 +289,10 @@ class F4mFD(FragmentFD):          self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME)          urlh = self.ydl.urlopen(man_url)          man_url = urlh.geturl() -        manifest = urlh.read() +        # Some manifests may be malformed, e.g. 
prosiebensat1 generated manifests +        # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244 +        # and https://github.com/rg3/youtube-dl/issues/7823) +        manifest = fix_xml_ampersands(urlh.read().decode('utf-8', 'ignore')).strip()          doc = compat_etree_fromstring(manifest)          formats = [(int(f.attrib.get('bitrate', -1)), f) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index c1dd87550..971047ad4 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -15,7 +15,6 @@ from .adobetv import (      AdobeTVVideoIE,  )  from .adultswim import AdultSwimIE -from .aftenposten import AftenpostenIE  from .aftonbladet import AftonbladetIE  from .airmozilla import AirMozillaIE  from .aljazeera import AlJazeeraIE @@ -26,7 +25,10 @@ from .aol import AolIE  from .allocine import AllocineIE  from .aparat import AparatIE  from .appleconnect import AppleConnectIE -from .appletrailers import AppleTrailersIE +from .appletrailers import ( +    AppleTrailersIE, +    AppleTrailersSectionIE, +)  from .archiveorg import ArchiveOrgIE  from .ard import (      ARDIE, @@ -61,8 +63,11 @@ from .beatportpro import BeatportProIE  from .bet import BetIE  from .bild import BildIE  from .bilibili import BiliBiliIE +from .bleacherreport import ( +    BleacherReportIE, +    BleacherReportCMSIE, +)  from .blinkx import BlinkxIE -from .bliptv import BlipTVIE, BlipTVUserIE  from .bloomberg import BloombergIE  from .bpb import BpbIE  from .br import BRIE @@ -78,7 +83,6 @@ from .camdemy import (      CamdemyIE,      CamdemyFolderIE  ) -from .canal13cl import Canal13clIE  from .canalplus import CanalplusIE  from .canalc2 import Canalc2IE  from .cbs import CBSIE @@ -131,7 +135,12 @@ from .dailymotion import (  )  from .daum import DaumIE  from .dbtv import DBTVIE -from .dcn import DCNIE +from .dcn import ( +    DCNIE, +    DCNVideoIE, +    DCNLiveIE, +    DCNSeasonIE, +)  from .dctp import DctpTvIE  
from .deezer import DeezerPlaylistIE  from .democracynow import DemocracynowIE @@ -206,6 +215,7 @@ from .francetv import (  from .freesound import FreesoundIE  from .freespeech import FreespeechIE  from .freevideo import FreeVideoIE +from .funimation import FunimationIE  from .funnyordie import FunnyOrDieIE  from .gameinformer import GameInformerIE  from .gamekings import GamekingsIE @@ -231,9 +241,11 @@ from .globo import (  from .godtube import GodTubeIE  from .goldenmoustache import GoldenMoustacheIE  from .golem import GolemIE +from .googledrive import GoogleDriveIE  from .googleplus import GooglePlusIE  from .googlesearch import GoogleSearchIE  from .goshgay import GoshgayIE +from .gputechconf import GPUTechConfIE  from .groupon import GrouponIE  from .hark import HarkIE  from .hearthisat import HearThisAtIE @@ -246,12 +258,17 @@ from .history import HistoryIE  from .hitbox import HitboxIE, HitboxLiveIE  from .hornbunny import HornBunnyIE  from .hotnewhiphop import HotNewHipHopIE +from .hotstar import HotStarIE  from .howcast import HowcastIE  from .howstuffworks import HowStuffWorksIE  from .huffpost import HuffPostIE  from .hypem import HypemIE  from .iconosquare import IconosquareIE -from .ign import IGNIE, OneUPIE +from .ign import ( +    IGNIE, +    OneUPIE, +    PCMagIE, +)  from .imdb import (      ImdbIE,      ImdbListIE @@ -280,6 +297,7 @@ from .jadorecettepub import JadoreCettePubIE  from .jeuxvideo import JeuxVideoIE  from .jove import JoveIE  from .jukebox import JukeboxIE +from .jwplatform import JWPlatformIE  from .jpopsukitv import JpopsukiIE  from .kaltura import KalturaIE  from .kanalplay import KanalPlayIE @@ -334,6 +352,7 @@ from .lynda import (  from .m6 import M6IE  from .macgamestore import MacGameStoreIE  from .mailru import MailRuIE +from .makertv import MakerTVIE  from .malemotion import MalemotionIE  from .mdr import MDRIE  from .metacafe import MetacafeIE @@ -357,7 +376,6 @@ from .motherless import MotherlessIE  from .motorsport 
import MotorsportIE  from .movieclips import MovieClipsIE  from .moviezine import MoviezineIE -from .movshare import MovShareIE  from .mtv import (      MTVIE,      MTVServicesEmbeddedIE, @@ -423,7 +441,13 @@ from .noco import NocoIE  from .normalboots import NormalbootsIE  from .nosvideo import NosVideoIE  from .nova import NovaIE -from .novamov import NovaMovIE +from .novamov import ( +    NovaMovIE, +    WholeCloudIE, +    NowVideoIE, +    VideoWeedIE, +    CloudTimeIE, +)  from .nowness import (      NownessIE,      NownessPlaylistIE, @@ -433,7 +457,6 @@ from .nowtv import (      NowTVIE,      NowTVListIE,  ) -from .nowvideo import NowVideoIE  from .npo import (      NPOIE,      NPOLiveIE, @@ -514,7 +537,10 @@ from .radiode import RadioDeIE  from .radiojavan import RadioJavanIE  from .radiobremen import RadioBremenIE  from .radiofrance import RadioFranceIE -from .rai import RaiIE +from .rai import ( +    RaiTVIE, +    RaiIE, +)  from .rbmaradio import RBMARadioIE  from .rds import RDSIE  from .redtube import RedTubeIE @@ -580,10 +606,6 @@ from .snagfilms import (  )  from .snotr import SnotrIE  from .sohu import SohuIE -from .soompi import ( -    SoompiIE, -    SoompiShowIE, -)  from .soundcloud import (      SoundcloudIE,      SoundcloudSetIE, @@ -642,6 +664,7 @@ from .teachingchannel import TeachingChannelIE  from .teamcoco import TeamcocoIE  from .techtalks import TechTalksIE  from .ted import TEDIE +from .tele13 import Tele13IE  from .telebruxelles import TeleBruxellesIE  from .telecinco import TelecincoIE  from .telegraaf import TelegraafIE @@ -651,6 +674,7 @@ from .tenplay import TenPlayIE  from .testurl import TestURLIE  from .testtube import TestTubeIE  from .tf1 import TF1IE +from .theintercept import TheInterceptIE  from .theonion import TheOnionIE  from .theplatform import (      ThePlatformIE, @@ -670,6 +694,7 @@ from .tnaflix import (      EMPFlixIE,      MovieFapIE,  ) +from .toggle import ToggleIE  from .thvideo import (      THVideoIE,      
THVideoPlaylistIE @@ -683,7 +708,13 @@ from .tube8 import Tube8IE  from .tubitv import TubiTvIE  from .tudou import TudouIE  from .tumblr import TumblrIE -from .tunein import TuneInIE +from .tunein import ( +    TuneInClipIE, +    TuneInStationIE, +    TuneInProgramIE, +    TuneInTopicIE, +    TuneInShortenerIE, +)  from .turbo import TurboIE  from .tutv import TutvIE  from .tv2 import ( @@ -744,7 +775,6 @@ from .videofyme import VideofyMeIE  from .videomega import VideoMegaIE  from .videopremium import VideoPremiumIE  from .videott import VideoTtIE -from .videoweed import VideoWeedIE  from .vidme import VidmeIE  from .vidzi import VidziIE  from .vier import VierIE, VierVideosIE @@ -846,7 +876,7 @@ from .youtube import (      YoutubeTruncatedIDIE,      YoutubeTruncatedURLIE,      YoutubeUserIE, -    YoutubeUserPlaylistsIE, +    YoutubePlaylistsIE,      YoutubeWatchLaterIE,  )  from .zapiks import ZapiksIE diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index c0e5d1abf..6a29e587f 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -23,6 +23,7 @@ class ABCIE(InfoExtractor):              'title': 'Australia to help staff Ebola treatment centre in Sierra Leone',              'description': 'md5:809ad29c67a05f54eb41f2a105693a67',          }, +        'skip': 'this video has expired',      }, {          'url': 'http://www.abc.net.au/news/2015-08-17/warren-entsch-introduces-same-sex-marriage-bill/6702326',          'md5': 'db2a5369238b51f9811ad815b69dc086', @@ -36,6 +37,7 @@ class ABCIE(InfoExtractor):              'title': 'Marriage Equality: Warren Entsch introduces same sex marriage bill',          },          'add_ie': ['Youtube'], +        'skip': 'Not accessible from Travis CI server',      }, {          'url': 'http://www.abc.net.au/news/2015-10-23/nab-lifts-interest-rates-following-westpac-and-cba/6880080',          'md5': 'b96eee7c9edf4fc5a358a0252881cc1f', @@ -58,6 +60,9 @@ class ABCIE(InfoExtractor):              
r'inline(?P<type>Video|Audio|YouTube)Data\.push\((?P<json_data>[^)]+)\);',              webpage)          if mobj is None: +            expired = self._html_search_regex(r'(?s)class="expired-(?:video|audio)".+?<span>(.+?)</span>', webpage, 'expired', None) +            if expired: +                raise ExtractorError('%s said: %s' % (self.IE_NAME, expired), expected=True)              raise ExtractorError('Unable to extract video urls')          urls_info = self._parse_json( diff --git a/youtube_dl/extractor/abc7news.py b/youtube_dl/extractor/abc7news.py index c04949c21..122dc9099 100644 --- a/youtube_dl/extractor/abc7news.py +++ b/youtube_dl/extractor/abc7news.py @@ -44,7 +44,6 @@ class Abc7NewsIE(InfoExtractor):              'contentURL', webpage, 'm3u8 url', fatal=True)          formats = self._extract_m3u8_formats(m3u8, display_id, 'mp4') -        self._sort_formats(formats)          title = self._og_search_title(webpage).strip()          description = self._og_search_description(webpage).strip() diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 3ae618e71..bf21a6887 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -68,7 +68,7 @@ class AdultSwimIE(InfoExtractor):                  'md5': '3e346a2ab0087d687a05e1e7f3b3e529',                  'info_dict': {                      'id': 'sY3cMUR_TbuE4YmdjzbIcQ-0', -                    'ext': 'flv', +                    'ext': 'mp4',                      'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine',                      'description': 'Dr. Brule reports live from Wine Country with a special report on wines.  \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n',                  }, @@ -79,6 +79,10 @@ class AdultSwimIE(InfoExtractor):              'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine',              'description': 'Dr. 
Brule reports live from Wine Country with a special report on wines.  \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n',          }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }      }]      @staticmethod diff --git a/youtube_dl/extractor/aftenposten.py b/youtube_dl/extractor/aftenposten.py deleted file mode 100644 index 0c00acfb5..000000000 --- a/youtube_dl/extractor/aftenposten.py +++ /dev/null @@ -1,23 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class AftenpostenIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?aftenposten\.no/webtv/(?:#!/)?video/(?P<id>\d+)' -    _TEST = { -        'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', -        'md5': 'fd828cd29774a729bf4d4425fe192972', -        'info_dict': { -            'id': '21039', -            'ext': 'mov', -            'title': 'TRAILER: "Sweatshop" - I can´t take any more', -            'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238', -            'timestamp': 1416927969, -            'upload_date': '20141125', -        } -    } - -    def _real_extract(self, url): -        return self.url_result('xstream:ap:%s' % self._match_id(url), 'Xstream') diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py new file mode 100644 index 000000000..dcc3c97f1 --- /dev/null +++ b/youtube_dl/extractor/amp.py @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    int_or_none, +    parse_iso8601, +) + + +class AMPIE(InfoExtractor): +    # parse Akamai Adaptive Media Player feed +    def _extract_feed_info(self, url): +        item = self._download_json( +            url, None, 'Downloading Akamai AMP feed', +            'Unable to download Akamai AMP feed')['channel']['item'] + +        video_id = 
item['guid'] + +        def get_media_node(name, default=None): +            media_name = 'media-%s' % name +            media_group = item.get('media-group') or item +            return media_group.get(media_name) or item.get(media_name) or item.get(name, default) + +        thumbnails = [] +        media_thumbnail = get_media_node('thumbnail') +        if media_thumbnail: +            if isinstance(media_thumbnail, dict): +                media_thumbnail = [media_thumbnail] +            for thumbnail_data in media_thumbnail: +                thumbnail = thumbnail_data['@attributes'] +                thumbnails.append({ +                    'url': self._proto_relative_url(thumbnail['url'], 'http:'), +                    'width': int_or_none(thumbnail.get('width')), +                    'height': int_or_none(thumbnail.get('height')), +                }) + +        subtitles = {} +        media_subtitle = get_media_node('subTitle') +        if media_subtitle: +            if isinstance(media_subtitle, dict): +                media_subtitle = [media_subtitle] +            for subtitle_data in media_subtitle: +                subtitle = subtitle_data['@attributes'] +                lang = subtitle.get('lang') or 'en' +                subtitles[lang] = [{'url': subtitle['href']}] + +        formats = [] +        media_content = get_media_node('content') +        if isinstance(media_content, dict): +            media_content = [media_content] +        for media_data in media_content: +            media = media_data['@attributes'] +            media_type = media['type'] +            if media_type == 'video/f4m': +                f4m_formats = self._extract_f4m_formats( +                    media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', +                    video_id, f4m_id='hds', fatal=False) +                if f4m_formats: +                    formats.extend(f4m_formats) +            elif media_type == 'application/x-mpegURL': +                m3u8_formats = 
self._extract_m3u8_formats( +                    media['url'], video_id, 'mp4', m3u8_id='hls', fatal=False) +                if m3u8_formats: +                    formats.extend(m3u8_formats) +            else: +                formats.append({ +                    'format_id': media_data['media-category']['@attributes']['label'], +                    'url': media['url'], +                    'tbr': int_or_none(media.get('bitrate')), +                    'filesize': int_or_none(media.get('fileSize')), +                }) + +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': get_media_node('title'), +            'description': get_media_node('description'), +            'thumbnails': thumbnails, +            'timestamp': parse_iso8601(item.get('pubDate'), ' '), +            'duration': int_or_none(media_content[0].get('@attributes', {}).get('duration')), +            'formats': formats, +        } diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index f68dc3236..62ed0c918 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -11,6 +11,7 @@ from ..utils import (  class AppleTrailersIE(InfoExtractor): +    IE_NAME = 'appletrailers'      _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)'      _TESTS = [{          'url': 'http://trailers.apple.com/trailers/wb/manofsteel/', @@ -64,6 +65,12 @@ class AppleTrailersIE(InfoExtractor):              },          ]      }, { +        'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/', +        'info_dict': { +            'id': 'blackthorn', +        }, +        'playlist_mincount': 2, +    }, {          'url': 'http://trailers.apple.com/ca/metropole/autrui/',          'only_matching': True,      }] @@ -79,7 +86,7 @@ class AppleTrailersIE(InfoExtractor):          def fix_html(s):              s = 
re.sub(r'(?s)<script[^<]*?>.*?</script>', '', s) -            s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s) +            s = re.sub(r'<img ([^<]*?)/?>', r'<img \1/>', s)              # The ' in the onClick attributes are not escaped, it couldn't be parsed              # like: http://trailers.apple.com/trailers/wb/gravity/ @@ -96,6 +103,9 @@ class AppleTrailersIE(InfoExtractor):              trailer_info_json = self._search_regex(self._JSON_RE,                                                     on_click, 'trailer info')              trailer_info = json.loads(trailer_info_json) +            first_url = trailer_info.get('url') +            if not first_url: +                continue              title = trailer_info['title']              video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()              thumbnail = li.find('.//img').attrib['src'] @@ -107,7 +117,6 @@ class AppleTrailersIE(InfoExtractor):              if m:                  duration = 60 * int(m.group('minutes')) + int(m.group('seconds')) -            first_url = trailer_info['url']              trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower()              settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id)              settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json') @@ -144,3 +153,76 @@ class AppleTrailersIE(InfoExtractor):              'id': movie,              'entries': playlist,          } + + +class AppleTrailersSectionIE(InfoExtractor): +    IE_NAME = 'appletrailers:section' +    _SECTIONS = { +        'justadded': { +            'feed_path': 'just_added', +            'title': 'Just Added', +        }, +        'exclusive': { +            'feed_path': 'exclusive', +            'title': 'Exclusive', +        }, +        'justhd': { +            'feed_path': 'just_hd', +            'title': 'Just HD', +        }, +        'mostpopular': { +            'feed_path': 'most_pop', +         
   'title': 'Most Popular', +        }, +        'moviestudios': { +            'feed_path': 'studios', +            'title': 'Movie Studios', +        }, +    } +    _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/#section=(?P<id>%s)' % '|'.join(_SECTIONS) +    _TESTS = [{ +        'url': 'http://trailers.apple.com/#section=justadded', +        'info_dict': { +            'title': 'Just Added', +            'id': 'justadded', +        }, +        'playlist_mincount': 80, +    }, { +        'url': 'http://trailers.apple.com/#section=exclusive', +        'info_dict': { +            'title': 'Exclusive', +            'id': 'exclusive', +        }, +        'playlist_mincount': 80, +    }, { +        'url': 'http://trailers.apple.com/#section=justhd', +        'info_dict': { +            'title': 'Just HD', +            'id': 'justhd', +        }, +        'playlist_mincount': 80, +    }, { +        'url': 'http://trailers.apple.com/#section=mostpopular', +        'info_dict': { +            'title': 'Most Popular', +            'id': 'mostpopular', +        }, +        'playlist_mincount': 80, +    }, { +        'url': 'http://trailers.apple.com/#section=moviestudios', +        'info_dict': { +            'title': 'Movie Studios', +            'id': 'moviestudios', +        }, +        'playlist_mincount': 80, +    }] + +    def _real_extract(self, url): +        section = self._match_id(url) +        section_data = self._download_json( +            'http://trailers.apple.com/trailers/home/feeds/%s.json' % self._SECTIONS[section]['feed_path'], +            section) +        entries = [ +            self.url_result('http://trailers.apple.com' + e['location']) +            for e in section_data] +        return self.playlist_result(entries, section, self._SECTIONS[section]['title']) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 73be6d204..687eb9f82 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ 
-110,13 +110,19 @@ class ARDMediathekIE(InfoExtractor):                  server = stream.get('_server')                  for stream_url in stream_urls:                      ext = determine_ext(stream_url) +                    if quality != 'auto' and ext in ('f4m', 'm3u8'): +                        continue                      if ext == 'f4m': -                        formats.extend(self._extract_f4m_formats( +                        f4m_formats = self._extract_f4m_formats(                              stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', -                            video_id, preference=-1, f4m_id='hds')) +                            video_id, preference=-1, f4m_id='hds', fatal=False) +                        if f4m_formats: +                            formats.extend(f4m_formats)                      elif ext == 'm3u8': -                        formats.extend(self._extract_m3u8_formats( -                            stream_url, video_id, 'mp4', preference=1, m3u8_id='hls')) +                        m3u8_formats = self._extract_m3u8_formats( +                            stream_url, video_id, 'mp4', preference=1, m3u8_id='hls', fatal=False) +                        if m3u8_formats: +                            formats.extend(m3u8_formats)                      else:                          if server and server.startswith('rtmp'):                              f = { diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 2a00da3ee..10301a8ea 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -68,9 +68,13 @@ class ArteTVPlus7IE(InfoExtractor):      def _extract_url_info(cls, url):          mobj = re.match(cls._VALID_URL, url)          lang = mobj.group('lang') -        # This is not a real id, it can be for example AJT for the news -        # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal -        video_id = mobj.group('id') +        query = 
compat_parse_qs(compat_urllib_parse_urlparse(url).query) +        if 'vid' in query: +            video_id = query['vid'][0] +        else: +            # This is not a real id, it can be for example AJT for the news +            # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal +            video_id = mobj.group('id')          return video_id, lang      def _real_extract(self, url): @@ -79,9 +83,15 @@ class ArteTVPlus7IE(InfoExtractor):          return self._extract_from_webpage(webpage, video_id, lang)      def _extract_from_webpage(self, webpage, video_id, lang): +        patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']') +        ids = (video_id, '') +        # some pages contain multiple videos (like +        # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D), +        # so we first try to look for json URLs that contain the video id from +        # the 'vid' parameter. +        patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates]          json_url = self._html_search_regex( -            [r'arte_vp_url=["\'](.*?)["\']', r'data-url=["\']([^"]+)["\']'], -            webpage, 'json vp url', default=None) +            patterns, webpage, 'json vp url', default=None)          if not json_url:              iframe_url = self._html_search_regex(                  r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1', diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index 50e47ba0a..7ac3044c7 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -2,6 +2,8 @@ from __future__ import unicode_literals  import time  import hmac +import hashlib +import re  from .common import InfoExtractor  from ..compat import ( @@ -32,6 +34,19 @@ class AtresPlayerIE(InfoExtractor):                  'duration': 5527.6,                  'thumbnail': 're:^https?://.*\.jpg$',              }, +            'skip': 'This 
video is only available for registered users' +        }, +        { +            'url': 'http://www.atresplayer.com/television/especial/videoencuentros/temporada-1/capitulo-112-david-bustamante_2014121600375.html', +            'md5': '0d0e918533bbd4b263f2de4d197d4aac', +            'info_dict': { +                'id': 'capitulo-112-david-bustamante', +                'ext': 'flv', +                'title': 'David Bustamante', +                'description': 'md5:f33f1c0a05be57f6708d4dd83a3b81c6', +                'duration': 1439.0, +                'thumbnail': 're:^https?://.*\.jpg$', +            },          },          {              'url': 'http://www.atresplayer.com/television/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_2014122400174.html', @@ -50,6 +65,13 @@ class AtresPlayerIE(InfoExtractor):      _LOGIN_URL = 'https://servicios.atresplayer.com/j_spring_security_check' +    _ERRORS = { +        'UNPUBLISHED': 'We\'re sorry, but this video is not yet available.', +        'DELETED': 'This video has expired and is no longer available for online streaming.', +        'GEOUNPUBLISHED': 'We\'re sorry, but this video is not available in your region due to right restrictions.', +        # 'PREMIUM': 'PREMIUM', +    } +      def _real_initialize(self):          self._login() @@ -83,58 +105,81 @@ class AtresPlayerIE(InfoExtractor):          episode_id = self._search_regex(              r'episode="([^"]+)"', webpage, 'episode id') +        request = sanitized_Request( +            self._PLAYER_URL_TEMPLATE % episode_id, +            headers={'User-Agent': self._USER_AGENT}) +        player = self._download_json(request, episode_id, 'Downloading player JSON') + +        episode_type = player.get('typeOfEpisode') +        error_message = self._ERRORS.get(episode_type) +        if error_message: +            raise ExtractorError( +                '%s returned error: %s' % (self.IE_NAME, error_message), expected=True) + +       
 formats = [] +        video_url = player.get('urlVideo') +        if video_url: +            format_info = { +                'url': video_url, +                'format_id': 'http', +            } +            mobj = re.search(r'(?P<bitrate>\d+)K_(?P<width>\d+)x(?P<height>\d+)', video_url) +            if mobj: +                format_info.update({ +                    'width': int_or_none(mobj.group('width')), +                    'height': int_or_none(mobj.group('height')), +                    'tbr': int_or_none(mobj.group('bitrate')), +                }) +            formats.append(format_info) + +        m3u8_url = player.get('urlVideoHls') +        if m3u8_url: +            m3u8_formats = self._extract_m3u8_formats( +                m3u8_url, episode_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) +            if m3u8_formats: +                formats.extend(m3u8_formats) +          timestamp = int_or_none(self._download_webpage(              self._TIME_API_URL,              video_id, 'Downloading timestamp', fatal=False), 1000, time.time())          timestamp_shifted = compat_str(timestamp + self._TIMESTAMP_SHIFT)          token = hmac.new(              self._MAGIC.encode('ascii'), -            (episode_id + timestamp_shifted).encode('utf-8') +            (episode_id + timestamp_shifted).encode('utf-8'), hashlib.md5          ).hexdigest() -        formats = [] -        for fmt in ['windows', 'android_tablet']: -            request = sanitized_Request( -                self._URL_VIDEO_TEMPLATE.format(fmt, episode_id, timestamp_shifted, token)) -            request.add_header('User-Agent', self._USER_AGENT) - -            fmt_json = self._download_json( -                request, video_id, 'Downloading %s video JSON' % fmt) - -            result = fmt_json.get('resultDes') -            if result.lower() != 'ok': -                raise ExtractorError( -                    '%s returned error: %s' % (self.IE_NAME, result), expected=True) - -            for 
format_id, video_url in fmt_json['resultObject'].items(): -                if format_id == 'token' or not video_url.startswith('http'): -                    continue -                if video_url.endswith('/Manifest'): -                    if 'geodeswowsmpra3player' in video_url: -                        f4m_path = video_url.split('smil:', 1)[-1].split('free_', 1)[0] -                        f4m_url = 'http://drg.antena3.com/{0}hds/es/sd.f4m'.format(f4m_path) -                        # this videos are protected by DRM, the f4m downloader doesn't support them -                        continue -                    else: -                        f4m_url = video_url[:-9] + '/manifest.f4m' -                    formats.extend(self._extract_f4m_formats(f4m_url, video_id)) -                else: -                    formats.append({ -                        'url': video_url, -                        'format_id': 'android-%s' % format_id, -                        'preference': 1, -                    }) -        self._sort_formats(formats) +        request = sanitized_Request( +            self._URL_VIDEO_TEMPLATE.format('windows', episode_id, timestamp_shifted, token), +            headers={'User-Agent': self._USER_AGENT}) -        player = self._download_json( -            self._PLAYER_URL_TEMPLATE % episode_id, -            episode_id) +        fmt_json = self._download_json( +            request, video_id, 'Downloading windows video JSON') + +        result = fmt_json.get('resultDes') +        if result.lower() != 'ok': +            raise ExtractorError( +                '%s returned error: %s' % (self.IE_NAME, result), expected=True) + +        for format_id, video_url in fmt_json['resultObject'].items(): +            if format_id == 'token' or not video_url.startswith('http'): +                continue +            if 'geodeswowsmpra3player' in video_url: +                f4m_path = video_url.split('smil:', 1)[-1].split('free_', 1)[0] +                f4m_url = 
'http://drg.antena3.com/{0}hds/es/sd.f4m'.format(f4m_path) +                # this videos are protected by DRM, the f4m downloader doesn't support them +                continue +            else: +                f4m_url = video_url[:-9] + '/manifest.f4m' +            f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False) +            if f4m_formats: +                formats.extend(f4m_formats) +        self._sort_formats(formats)          path_data = player.get('pathData')          episode = self._download_xml( -            self._EPISODE_URL_TEMPLATE % path_data, -            video_id, 'Downloading episode XML') +            self._EPISODE_URL_TEMPLATE % path_data, video_id, +            'Downloading episode XML')          duration = float_or_none(xpath_text(              episode, './media/asset/info/technical/contentDuration', 'duration')) diff --git a/youtube_dl/extractor/audimedia.py b/youtube_dl/extractor/audimedia.py index b0b089dee..4382a302b 100644 --- a/youtube_dl/extractor/audimedia.py +++ b/youtube_dl/extractor/audimedia.py @@ -15,7 +15,7 @@ class AudiMediaIE(InfoExtractor):          'url': 'https://audimedia.tv/en/vid/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test',          'md5': '79a8b71c46d49042609795ab59779b66',          'info_dict': { -            'id': '1564', +            'id': '1565',              'ext': 'mp4',              'title': '60 Seconds of Audi Sport 104/2015 - WEC Bahrain, Rookie Test',              'description': 'md5:60e5d30a78ced725f7b8d34370762941', diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py index 693ba22c6..3eed91279 100644 --- a/youtube_dl/extractor/audiomack.py +++ b/youtube_dl/extractor/audiomack.py @@ -56,7 +56,7 @@ class AudiomackIE(InfoExtractor):          # API is inconsistent with errors          if 'url' not in api_response or not api_response['url'] or 'error' in api_response: -            raise ExtractorError('Invalid url %s', url) +       
     raise ExtractorError('Invalid url %s' % url)          # Audiomack wraps a lot of soundcloud tracks in their branded wrapper          # if so, pass the work off to the soundcloud extractor diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index d89e34ba0..691aecc0d 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -733,6 +733,7 @@ class BBCIE(BBCCoUkIE):          # article with multiple videos embedded with playlist.sxml (e.g.          # http://www.bbc.com/sport/0/football/34475836)          playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage) +        playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))          if playlists:              entries = [                  self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp) diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index e63c2ac00..c8d921daf 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -34,15 +34,29 @@ class BeegIE(InfoExtractor):          video_id = self._match_id(url)          video = self._download_json( -            'http://beeg.com/api/v3/video/%s' % video_id, video_id) +            'http://beeg.com/api/v5/video/%s' % video_id, video_id) + +        def split(o, e): +            def cut(s, x): +                n.append(s[:x]) +                return s[x:] +            n = [] +            r = len(o) % e +            if r > 0: +                o = cut(o, r) +            while len(o) > e: +                o = cut(o, e) +            n.append(o) +            return n          def decrypt_key(key): -            # Reverse engineered from http://static.beeg.com/cpl/1067.js -            a = '8RPUUCS35ZWp3ADnKcSmpH71ZusrROo' +            # Reverse engineered from http://static.beeg.com/cpl/1105.js +            a = '5ShMcIQlssOd7zChAIOlmeTZDaUxULbJRnywYaiB'              e = compat_urllib_parse_unquote(key) -            return 
''.join([ -                compat_chr(compat_ord(e[n]) - compat_ord(a[n % len(a)]) % 25) +            o = ''.join([ +                compat_chr(compat_ord(e[n]) - compat_ord(a[n % len(a)]) % 21)                  for n in range(len(e))]) +            return ''.join(split(o, 3)[::-1])          def decrypt_url(encrypted_url):              encrypted_url = self._proto_relative_url( diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py new file mode 100644 index 000000000..38bda3af5 --- /dev/null +++ b/youtube_dl/extractor/bleacherreport.py @@ -0,0 +1,106 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .amp import AMPIE +from ..utils import ( +    ExtractorError, +    int_or_none, +    parse_iso8601, +) + + +class BleacherReportIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/articles/(?P<id>\d+)' +    _TESTS = [{ +        'url': 'http://bleacherreport.com/articles/2496438-fsu-stat-projections-is-jalen-ramsey-best-defensive-player-in-college-football', +        'md5': 'a3ffc3dc73afdbc2010f02d98f990f20', +        'info_dict': { +            'id': '2496438', +            'ext': 'mp4', +            'title': 'FSU Stat Projections: Is Jalen Ramsey Best Defensive Player in College Football?', +            'uploader_id': 3992341, +            'description': 'CFB, ACC, Florida State', +            'timestamp': 1434380212, +            'upload_date': '20150615', +            'uploader': 'Team Stream Now ', +        }, +        'add_ie': ['Ooyala'], +    }, { +        'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo', +        'md5': 'af5f90dc9c7ba1c19d0a3eac806bbf50', +        'info_dict': { +            'id': '2586817', +            'ext': 'mp4', +            'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo', +            'timestamp': 
1446839961, +            'uploader': 'Sean Fay', +            'description': 'md5:825e94e0f3521df52fa83b2ed198fa20', +            'uploader_id': 6466954, +            'upload_date': '20151011', +        }, +        'add_ie': ['Youtube'], +    }] + +    def _real_extract(self, url): +        article_id = self._match_id(url) + +        article_data = self._download_json('http://api.bleacherreport.com/api/v1/articles/%s' % article_id, article_id)['article'] + +        thumbnails = [] +        primary_photo = article_data.get('primaryPhoto') +        if primary_photo: +            thumbnails = [{ +                'url': primary_photo['url'], +                'width': primary_photo.get('width'), +                'height': primary_photo.get('height'), +            }] + +        info = { +            '_type': 'url_transparent', +            'id': article_id, +            'title': article_data['title'], +            'uploader': article_data.get('author', {}).get('name'), +            'uploader_id': article_data.get('authorId'), +            'timestamp': parse_iso8601(article_data.get('createdAt')), +            'thumbnails': thumbnails, +            'comment_count': int_or_none(article_data.get('commentsCount')), +            'view_count': int_or_none(article_data.get('hitCount')), +        } + +        video = article_data.get('video') +        if video: +            video_type = video['type'] +            if video_type == 'cms.bleacherreport.com': +                info['url'] = 'http://bleacherreport.com/video_embed?id=%s' % video['id'] +            elif video_type == 'ooyala.com': +                info['url'] = 'ooyala:%s' % video['id'] +            elif video_type == 'youtube.com': +                info['url'] = video['id'] +            elif video_type == 'vine.co': +                info['url'] = 'https://vine.co/v/%s' % video['id'] +            else: +                info['url'] = video_type + video['id'] +            return info +        else: +            raise 
ExtractorError('no video in the article', expected=True) + + +class BleacherReportCMSIE(AMPIE): +    _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36})' +    _TESTS = [{ +        'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1', +        'md5': '8c2c12e3af7805152675446c905d159b', +        'info_dict': { +            'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', +            'ext': 'flv', +            'title': 'Cena vs. Rollins Would Expose the Heavyweight Division', +            'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e', +        }, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        info = self._extract_feed_info('http://cms.bleacherreport.com/media/items/%s/akamai.json' % video_id) +        info['id'] = video_id +        return info diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py deleted file mode 100644 index 35375f7b1..000000000 --- a/youtube_dl/extractor/bliptv.py +++ /dev/null @@ -1,290 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - -from ..compat import compat_urlparse -from ..utils import ( -    clean_html, -    int_or_none, -    parse_iso8601, -    sanitized_Request, -    unescapeHTML, -    xpath_text, -    xpath_with_ns, -) - - -class BlipTVIE(InfoExtractor): -    _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P<id>\d+)|((?:play/|api\.swf#)(?P<lookup_id>[\da-zA-Z+_]+)))' - -    _TESTS = [ -        { -            'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352', -            'md5': '80baf1ec5c3d2019037c1c707d676b9f', -            'info_dict': { -                'id': '5779306', -                'ext': 'm4v', -                'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3', -                'description': 'md5:9bc31f227219cde65e47eeec8d2dc596', -            
    'timestamp': 1323138843, -                'upload_date': '20111206', -                'uploader': 'cbr', -                'uploader_id': '679425', -                'duration': 81, -            } -        }, -        { -            # https://github.com/rg3/youtube-dl/pull/2274 -            'note': 'Video with subtitles', -            'url': 'http://blip.tv/play/h6Uag5OEVgI.html', -            'md5': '309f9d25b820b086ca163ffac8031806', -            'info_dict': { -                'id': '6586561', -                'ext': 'mp4', -                'title': 'Red vs. Blue Season 11 Episode 1', -                'description': 'One-Zero-One', -                'timestamp': 1371261608, -                'upload_date': '20130615', -                'uploader': 'redvsblue', -                'uploader_id': '792887', -                'duration': 279, -            } -        }, -        { -            # https://bugzilla.redhat.com/show_bug.cgi?id=967465 -            'url': 'http://a.blip.tv/api.swf#h6Uag5KbVwI', -            'md5': '314e87b1ebe7a48fcbfdd51b791ce5a6', -            'info_dict': { -                'id': '6573122', -                'ext': 'mov', -                'upload_date': '20130520', -                'description': 'Two hapless space marines argue over what to do when they realize they have an astronomically huge problem on their hands.', -                'title': 'Red vs. 
Blue Season 11 Trailer', -                'timestamp': 1369029609, -                'uploader': 'redvsblue', -                'uploader_id': '792887', -            } -        }, -        { -            'url': 'http://blip.tv/play/gbk766dkj4Yn', -            'md5': 'fe0a33f022d49399a241e84a8ea8b8e3', -            'info_dict': { -                'id': '1749452', -                'ext': 'mp4', -                'upload_date': '20090208', -                'description': 'Witness the first appearance of the Nostalgia Critic character, as Doug reviews the movie Transformers.', -                'title': 'Nostalgia Critic: Transformers', -                'timestamp': 1234068723, -                'uploader': 'NostalgiaCritic', -                'uploader_id': '246467', -            } -        }, -        { -            # https://github.com/rg3/youtube-dl/pull/4404 -            'note': 'Audio only', -            'url': 'http://blip.tv/hilarios-productions/weekly-manga-recap-kingdom-7119982', -            'md5': '76c0a56f24e769ceaab21fbb6416a351', -            'info_dict': { -                'id': '7103299', -                'ext': 'flv', -                'title': 'Weekly Manga Recap: Kingdom', -                'description': 'And then Shin breaks the enemy line, and he's all like HWAH! And then he slices a guy and it's all like FWASHING! And... 
it's really hard to describe the best parts of this series without breaking down into sound effects, okay?', -                'timestamp': 1417660321, -                'upload_date': '20141204', -                'uploader': 'The Rollo T', -                'uploader_id': '407429', -                'duration': 7251, -                'vcodec': 'none', -            } -        }, -        { -            # missing duration -            'url': 'http://blip.tv/rss/flash/6700880', -            'info_dict': { -                'id': '6684191', -                'ext': 'm4v', -                'title': 'Cowboy Bebop: Gateway Shuffle Review', -                'description': 'md5:3acc480c0f9ae157f5fe88547ecaf3f8', -                'timestamp': 1386639757, -                'upload_date': '20131210', -                'uploader': 'sfdebris', -                'uploader_id': '706520', -            } -        } -    ] - -    @staticmethod -    def _extract_url(webpage): -        mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage) -        if mobj: -            return 'http://blip.tv/a/a-' + mobj.group(1) -        mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage) -        if mobj: -            return mobj.group(1) - -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        lookup_id = mobj.group('lookup_id') - -        # See https://github.com/rg3/youtube-dl/issues/857 and -        # https://github.com/rg3/youtube-dl/issues/4197 -        if lookup_id: -            urlh = self._request_webpage( -                'http://blip.tv/play/%s' % lookup_id, lookup_id, 'Resolving lookup id') -            url = compat_urlparse.urlparse(urlh.geturl()) -            qs = compat_urlparse.parse_qs(url.query) -            mobj = re.match(self._VALID_URL, qs['file'][0]) - -        video_id = mobj.group('id') - -        rss = 
self._download_xml('http://blip.tv/rss/flash/%s' % video_id, video_id, 'Downloading video RSS') - -        def _x(p): -            return xpath_with_ns(p, { -                'blip': 'http://blip.tv/dtd/blip/1.0', -                'media': 'http://search.yahoo.com/mrss/', -                'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', -            }) - -        item = rss.find('channel/item') - -        video_id = xpath_text(item, _x('blip:item_id'), 'video id') or lookup_id -        title = xpath_text(item, 'title', 'title', fatal=True) -        description = clean_html(xpath_text(item, _x('blip:puredescription'), 'description')) -        timestamp = parse_iso8601(xpath_text(item, _x('blip:datestamp'), 'timestamp')) -        uploader = xpath_text(item, _x('blip:user'), 'uploader') -        uploader_id = xpath_text(item, _x('blip:userid'), 'uploader id') -        duration = int_or_none(xpath_text(item, _x('blip:runtime'), 'duration')) -        media_thumbnail = item.find(_x('media:thumbnail')) -        thumbnail = (media_thumbnail.get('url') if media_thumbnail is not None -                     else xpath_text(item, 'image', 'thumbnail')) -        categories = [category.text for category in item.findall('category') if category is not None] - -        formats = [] -        subtitles_urls = {} - -        media_group = item.find(_x('media:group')) -        for media_content in media_group.findall(_x('media:content')): -            url = media_content.get('url') -            role = media_content.get(_x('blip:role')) -            msg = self._download_webpage( -                url + '?showplayer=20140425131715&referrer=http://blip.tv&mask=7&skin=flashvars&view=url', -                video_id, 'Resolving URL for %s' % role) -            real_url = compat_urlparse.parse_qs(msg.strip())['message'][0] - -            media_type = media_content.get('type') -            if media_type == 'text/srt' or url.endswith('.srt'): -                LANGS = { -                    
'english': 'en', -                } -                lang = role.rpartition('-')[-1].strip().lower() -                langcode = LANGS.get(lang, lang) -                subtitles_urls[langcode] = url -            elif media_type.startswith('video/'): -                formats.append({ -                    'url': real_url, -                    'format_id': role, -                    'format_note': media_type, -                    'vcodec': media_content.get(_x('blip:vcodec')) or 'none', -                    'acodec': media_content.get(_x('blip:acodec')), -                    'filesize': media_content.get('filesize'), -                    'width': int_or_none(media_content.get('width')), -                    'height': int_or_none(media_content.get('height')), -                }) -        self._check_formats(formats, video_id) -        self._sort_formats(formats) - -        subtitles = self.extract_subtitles(video_id, subtitles_urls) - -        return { -            'id': video_id, -            'title': title, -            'description': description, -            'timestamp': timestamp, -            'uploader': uploader, -            'uploader_id': uploader_id, -            'duration': duration, -            'thumbnail': thumbnail, -            'categories': categories, -            'formats': formats, -            'subtitles': subtitles, -        } - -    def _get_subtitles(self, video_id, subtitles_urls): -        subtitles = {} -        for lang, url in subtitles_urls.items(): -            # For some weird reason, blip.tv serves a video instead of subtitles -            # when we request with a common UA -            req = sanitized_Request(url) -            req.add_header('User-Agent', 'youtube-dl') -            subtitles[lang] = [{ -                # The extension is 'srt' but it's actually an 'ass' file -                'ext': 'ass', -                'data': self._download_webpage(req, None, note=False), -            }] -        return subtitles - - -class 
BlipTVUserIE(InfoExtractor): -    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?blip\.tv/)|bliptvuser:)(?!api\.swf)([^/]+)/*$' -    _PAGE_SIZE = 12 -    IE_NAME = 'blip.tv:user' -    _TEST = { -        'url': 'http://blip.tv/actone', -        'info_dict': { -            'id': 'actone', -            'title': 'Act One: The Series', -        }, -        'playlist_count': 5, -    } - -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        username = mobj.group(1) - -        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1' - -        page = self._download_webpage(url, username, 'Downloading user page') -        mobj = re.search(r'data-users-id="([^"]+)"', page) -        page_base = page_base % mobj.group(1) -        title = self._og_search_title(page) - -        # Download video ids using BlipTV Ajax calls. Result size per -        # query is limited (currently to 12 videos) so we need to query -        # page by page until there are no video ids - it means we got -        # all of them. - -        video_ids = [] -        pagenum = 1 - -        while True: -            url = page_base + "&page=" + str(pagenum) -            page = self._download_webpage( -                url, username, 'Downloading video ids from page %d' % pagenum) - -            # Extract video identifiers -            ids_in_page = [] - -            for mobj in re.finditer(r'href="/([^"]+)"', page): -                if mobj.group(1) not in ids_in_page: -                    ids_in_page.append(unescapeHTML(mobj.group(1))) - -            video_ids.extend(ids_in_page) - -            # A little optimization - if current page is not -            # "full", ie. does not contain PAGE_SIZE video ids then -            # we can assume that this page is the last one - there -            # are no more ids on further pages - no need to query -            # again. 
- -            if len(ids_in_page) < self._PAGE_SIZE: -                break - -            pagenum += 1 - -        urls = ['http://blip.tv/%s' % video_id for video_id in video_ids] -        url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls] -        return self.playlist_result( -            url_entries, playlist_title=title, playlist_id=username) diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 66e394e10..e66854538 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -1,18 +1,21 @@  # coding: utf-8  from __future__ import unicode_literals +import re +  from .common import InfoExtractor  from ..utils import (      ExtractorError,      int_or_none,      parse_duration, +    xpath_element, +    xpath_text,  )  class BRIE(InfoExtractor):      IE_DESC = 'Bayerischer Rundfunk Mediathek' -    _VALID_URL = r'https?://(?:www\.)?br\.de/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html' -    _BASE_URL = 'http://www.br.de' +    _VALID_URL = r'(?P<base_url>https?://(?:www\.)?br(?:-klassik)?\.de)/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html'      _TESTS = [          { @@ -22,7 +25,7 @@ class BRIE(InfoExtractor):                  'id': '48f656ef-287e-486f-be86-459122db22cc',                  'ext': 'mp4',                  'title': 'Die böse Überraschung', -                'description': 'Betriebliche Altersvorsorge: Die böse Überraschung', +                'description': 'md5:ce9ac81b466ce775b8018f6801b48ac9',                  'duration': 180,                  'uploader': 'Reinhard Weber',                  'upload_date': '20150422', @@ -30,23 +33,23 @@ class BRIE(InfoExtractor):          },          {              'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html', -            'md5': 'a44396d73ab6a68a69a568fae10705bb', +            'md5': 'af3a3a4aa43ff0ce6a89504c67f427ef',              'info_dict': {                  'id': 
'a4b83e34-123d-4b81-9f4e-c0d3121a4e05', -                'ext': 'mp4', +                'ext': 'flv',                  'title': 'Manfred Schreiber ist tot', -                'description': 'Abendschau kompakt: Manfred Schreiber ist tot', +                'description': 'md5:b454d867f2a9fc524ebe88c3f5092d97',                  'duration': 26,              }          },          { -            'url': 'http://www.br.de/radio/br-klassik/sendungen/allegro/premiere-urauffuehrung-the-land-2015-dance-festival-muenchen-100.html', +            'url': 'https://www.br-klassik.de/audio/peeping-tom-premierenkritik-dance-festival-muenchen-100.html',              'md5': '8b5b27c0b090f3b35eac4ab3f7a73d3d',              'info_dict': {                  'id': '74c603c9-26d3-48bb-b85b-079aeed66e0b',                  'ext': 'aac',                  'title': 'Kurzweilig und sehr bewegend', -                'description': '"The Land" von Peeping Tom: Kurzweilig und sehr bewegend', +                'description': 'md5:0351996e3283d64adeb38ede91fac54e',                  'duration': 296,              }          }, @@ -57,7 +60,7 @@ class BRIE(InfoExtractor):                  'id': '6ba73750-d405-45d3-861d-1ce8c524e059',                  'ext': 'mp4',                  'title': 'Umweltbewusster Häuslebauer', -                'description': 'Uwe Erdelt: Umweltbewusster Häuslebauer', +                'description': 'md5:d52dae9792d00226348c1dbb13c9bae2',                  'duration': 116,              }          }, @@ -68,7 +71,7 @@ class BRIE(InfoExtractor):                  'id': 'd982c9ce-8648-4753-b358-98abb8aec43d',                  'ext': 'mp4',                  'title': 'Folge 1 - Metaphysik', -                'description': 'Kant für Anfänger: Folge 1 - Metaphysik', +                'description': 'md5:bb659990e9e59905c3d41e369db1fbe3',                  'duration': 893,                  'uploader': 'Eva Maria Steimle',                  'upload_date': '20140117', @@ -77,28 +80,31 @@ class 
BRIE(InfoExtractor):      ]      def _real_extract(self, url): -        display_id = self._match_id(url) +        base_url, display_id = re.search(self._VALID_URL, url).groups()          page = self._download_webpage(url, display_id)          xml_url = self._search_regex(              r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL') -        xml = self._download_xml(self._BASE_URL + xml_url, None) +        xml = self._download_xml(base_url + xml_url, display_id)          medias = []          for xml_media in xml.findall('video') + xml.findall('audio'): +            media_id = xml_media.get('externalId')              media = { -                'id': xml_media.get('externalId'), -                'title': xml_media.find('title').text, -                'duration': parse_duration(xml_media.find('duration').text), -                'formats': self._extract_formats(xml_media.find('assets')), -                'thumbnails': self._extract_thumbnails(xml_media.find('teaserImage/variants')), -                'description': ' '.join(xml_media.find('shareTitle').text.splitlines()), -                'webpage_url': xml_media.find('permalink').text +                'id': media_id, +                'title': xpath_text(xml_media, 'title', 'title', True), +                'duration': parse_duration(xpath_text(xml_media, 'duration')), +                'formats': self._extract_formats(xpath_element( +                    xml_media, 'assets'), media_id), +                'thumbnails': self._extract_thumbnails(xpath_element( +                    xml_media, 'teaserImage/variants'), base_url), +                'description': xpath_text(xml_media, 'desc'), +                'webpage_url': xpath_text(xml_media, 'permalink'), +                'uploader': xpath_text(xml_media, 'author'),              } -            if xml_media.find('author').text: -                media['uploader'] = 
xml_media.find('author').text -            if xml_media.find('broadcastDate').text: -                media['upload_date'] = ''.join(reversed(xml_media.find('broadcastDate').text.split('.'))) +            broadcast_date = xpath_text(xml_media, 'broadcastDate') +            if broadcast_date: +                media['upload_date'] = ''.join(reversed(broadcast_date.split('.')))              medias.append(media)          if len(medias) > 1: @@ -109,35 +115,58 @@ class BRIE(InfoExtractor):              raise ExtractorError('No media entries found')          return medias[0] -    def _extract_formats(self, assets): - -        def text_or_none(asset, tag): -            elem = asset.find(tag) -            return None if elem is None else elem.text - -        formats = [{ -            'url': text_or_none(asset, 'downloadUrl'), -            'ext': text_or_none(asset, 'mediaType'), -            'format_id': asset.get('type'), -            'width': int_or_none(text_or_none(asset, 'frameWidth')), -            'height': int_or_none(text_or_none(asset, 'frameHeight')), -            'tbr': int_or_none(text_or_none(asset, 'bitrateVideo')), -            'abr': int_or_none(text_or_none(asset, 'bitrateAudio')), -            'vcodec': text_or_none(asset, 'codecVideo'), -            'acodec': text_or_none(asset, 'codecAudio'), -            'container': text_or_none(asset, 'mediaType'), -            'filesize': int_or_none(text_or_none(asset, 'size')), -        } for asset in assets.findall('asset') -            if asset.find('downloadUrl') is not None] - +    def _extract_formats(self, assets, media_id): +        formats = [] +        for asset in assets.findall('asset'): +            format_url = xpath_text(asset, ['downloadUrl', 'url']) +            asset_type = asset.get('type') +            if asset_type == 'HDS': +                f4m_formats = self._extract_f4m_formats( +                    format_url + '?hdcore=3.2.0', media_id, f4m_id='hds', fatal=False) +                if 
f4m_formats: +                    formats.extend(f4m_formats) +            elif asset_type == 'HLS': +                m3u8_formats = self._extract_m3u8_formats( +                    format_url, media_id, 'mp4', 'm3u8_native', m3u8_id='hds', fatal=False) +                if m3u8_formats: +                    formats.extend(m3u8_formats) +            else: +                format_info = { +                    'ext': xpath_text(asset, 'mediaType'), +                    'width': int_or_none(xpath_text(asset, 'frameWidth')), +                    'height': int_or_none(xpath_text(asset, 'frameHeight')), +                    'tbr': int_or_none(xpath_text(asset, 'bitrateVideo')), +                    'abr': int_or_none(xpath_text(asset, 'bitrateAudio')), +                    'vcodec': xpath_text(asset, 'codecVideo'), +                    'acodec': xpath_text(asset, 'codecAudio'), +                    'container': xpath_text(asset, 'mediaType'), +                    'filesize': int_or_none(xpath_text(asset, 'size')), +                } +                format_url = self._proto_relative_url(format_url) +                if format_url: +                    http_format_info = format_info.copy() +                    http_format_info.update({ +                        'url': format_url, +                        'format_id': 'http-%s' % asset_type, +                    }) +                    formats.append(http_format_info) +                server_prefix = xpath_text(asset, 'serverPrefix') +                if server_prefix: +                    rtmp_format_info = format_info.copy() +                    rtmp_format_info.update({ +                        'url': server_prefix, +                        'play_path': xpath_text(asset, 'fileName'), +                        'format_id': 'rtmp-%s' % asset_type, +                    }) +                    formats.append(rtmp_format_info)          self._sort_formats(formats)          return formats -    def _extract_thumbnails(self, 
variants): +    def _extract_thumbnails(self, variants, base_url):          thumbnails = [{ -            'url': self._BASE_URL + variant.find('url').text, -            'width': int_or_none(variant.find('width').text), -            'height': int_or_none(variant.find('height').text), -        } for variant in variants.findall('variant')] +            'url': base_url + xpath_text(variant, 'url'), +            'width': int_or_none(xpath_text(variant, 'width')), +            'height': int_or_none(xpath_text(variant, 'height')), +        } for variant in variants.findall('variant') if xpath_text(variant, 'url')]          thumbnails.sort(key=lambda x: x['width'] * x['height'], reverse=True)          return thumbnails diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index f5ebae1e6..03a4f446e 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -355,7 +355,7 @@ class BrightcoveLegacyIE(InfoExtractor):  class BrightcoveNewIE(InfoExtractor):      IE_NAME = 'brightcove:new' -    _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>\d+)' +    _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>(?:ref:)?\d+)'      _TESTS = [{          'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001',          'md5': 'c8100925723840d4b0d243f7025703be', @@ -387,14 +387,24 @@ class BrightcoveNewIE(InfoExtractor):          'params': {              'skip_download': True,          } +    }, { +        # ref: prefixed video id +        'url': 'http://players.brightcove.net/3910869709001/21519b5c-4b3b-4363-accb-bdc8f358f823_default/index.html?videoId=ref:7069442', +        'only_matching': True,      }]      @staticmethod +    def _extract_url(webpage): +        urls = 
BrightcoveNewIE._extract_urls(webpage) +        return urls[0] if urls else None + +    @staticmethod      def _extract_urls(webpage):          # Reference:          # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe -        # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript) +        # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript          # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/embed-in-page.html +        # 4. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player          entries = [] @@ -407,9 +417,10 @@ class BrightcoveNewIE(InfoExtractor):          for video_id, account_id, player_id, embed in re.findall(                  # According to examples from [3] it's unclear whether video id                  # may be optional and what to do when it is +                # According to [4] data-video-id may be prefixed with ref:                  r'''(?sx)                      <video[^>]+ -                        data-video-id=["\'](\d+)["\'][^>]*>.*? +                        data-video-id=["\']((?:ref:)?\d+)["\'][^>]*>.*?                      </video>.*?                      
<script[^>]+                          src=["\'](?:https?:)?//players\.brightcove\.net/ diff --git a/youtube_dl/extractor/canal13cl.py b/youtube_dl/extractor/canal13cl.py deleted file mode 100644 index 93241fefe..000000000 --- a/youtube_dl/extractor/canal13cl.py +++ /dev/null @@ -1,48 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class Canal13clIE(InfoExtractor): -    _VALID_URL = r'^http://(?:www\.)?13\.cl/(?:[^/?#]+/)*(?P<id>[^/?#]+)' -    _TEST = { -        'url': 'http://www.13.cl/t13/nacional/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', -        'md5': '4cb1fa38adcad8fea88487a078831755', -        'info_dict': { -            'id': '1403022125', -            'display_id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', -            'ext': 'mp4', -            'title': 'El "círculo de hierro" de Michelle Bachelet en su regreso a La Moneda', -            'description': '(Foto: Agencia Uno) En nueve días más, Michelle Bachelet va a asumir por segunda vez como presidenta de la República. 
Entre aquellos que la acompañarán hay caras que se repiten y otras que se consolidan en su entorno de colaboradores más cercanos.', -        } -    } - -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        display_id = mobj.group('id') - -        webpage = self._download_webpage(url, display_id) - -        title = self._html_search_meta( -            'twitter:title', webpage, 'title', fatal=True) -        description = self._html_search_meta( -            'twitter:description', webpage, 'description') -        url = self._html_search_regex( -            r'articuloVideo = \"(.*?)\"', webpage, 'url') -        real_id = self._search_regex( -            r'[^0-9]([0-9]{7,})[^0-9]', url, 'id', default=display_id) -        thumbnail = self._html_search_regex( -            r'articuloImagen = \"(.*?)\"', webpage, 'thumbnail') - -        return { -            'id': real_id, -            'display_id': display_id, -            'url': url, -            'title': title, -            'description': description, -            'ext': 'mp4', -            'thumbnail': thumbnail, -        } diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py index 0b67ba67d..242fba311 100644 --- a/youtube_dl/extractor/chaturbate.py +++ b/youtube_dl/extractor/chaturbate.py @@ -23,6 +23,8 @@ class ChaturbateIE(InfoExtractor):          'only_matching': True,      }] +    _ROOM_OFFLINE = 'Room is currently offline' +      def _real_extract(self, url):          video_id = self._match_id(url) @@ -34,9 +36,16 @@ class ChaturbateIE(InfoExtractor):          if not m3u8_url:              error = self._search_regex( -                r'<span[^>]+class=(["\'])desc_span\1[^>]*>(?P<error>[^<]+)</span>', -                webpage, 'error', group='error') -            raise ExtractorError(error, expected=True) +                [r'<span[^>]+class=(["\'])desc_span\1[^>]*>(?P<error>[^<]+)</span>', +                 
r'<div[^>]+id=(["\'])defchat\1[^>]*>\s*<p><strong>(?P<error>[^<]+)<'], +                webpage, 'error', group='error', default=None) +            if not error: +                if any(p not in webpage for p in ( +                        self._ROOM_OFFLINE, 'offline_tipping', 'tip_offline')): +                    error = self._ROOM_OFFLINE +            if error: +                raise ExtractorError(error, expected=True) +            raise ExtractorError('Unable to find stream URL')          formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index fd1770dac..6d9cd8abd 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -5,7 +5,6 @@ import re  from .common import InfoExtractor  from ..utils import ExtractorError -from .bliptv import BlipTVIE  from .screenwavemedia import ScreenwaveMediaIE @@ -34,18 +33,17 @@ class CinemassacreIE(InfoExtractor):              },          },          { -            # blip.tv embedded video +            # Youtube embedded video              'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/', -            'md5': 'ca9b3c8dd5a66f9375daeb5135f5a3de', +            'md5': 'df4cf8a1dcedaec79a73d96d83b99023',              'info_dict': { -                'id': '4065369', -                'ext': 'flv', +                'id': 'OEVzPCY2T-g', +                'ext': 'mp4',                  'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles',                  'upload_date': '20061207', -                'uploader': 'cinemassacre', -                'uploader_id': '250778', -                'timestamp': 1283233867, -                'description': 'md5:0a108c78d130676b207d0f6d029ecffd', +                'uploader': 'Cinemassacre', +                'uploader_id': 'JamesNintendoNerd', +                
'description': 'md5:784734696c2b8b7f4b8625cc799e07f6',              }          },          { @@ -89,8 +87,6 @@ class CinemassacreIE(InfoExtractor):              ],              webpage, 'player data URL', default=None, group='url')          if not playerdata_url: -            playerdata_url = BlipTVIE._extract_url(webpage) -        if not playerdata_url:              raise ExtractorError('Unable to find player data')          video_title = self._html_search_regex( diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index d46592cc5..2996b6b09 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -1,7 +1,7 @@  from __future__ import unicode_literals  from .common import InfoExtractor -from ..utils import determine_ext +from ..utils import int_or_none  _translation_table = { @@ -42,31 +42,26 @@ class CliphunterIE(InfoExtractor):          video_title = self._search_regex(              r'mediaTitle = "([^"]+)"', webpage, 'title') -        fmts = {} -        for fmt in ('mp4', 'flv'): -            fmt_list = self._parse_json(self._search_regex( -                r'var %sjson\s*=\s*(\[.*?\]);' % fmt, webpage, '%s formats' % fmt), video_id) -            for f in fmt_list: -                fmts[f['fname']] = _decode(f['sUrl']) - -        qualities = self._parse_json(self._search_regex( -            r'var player_btns\s*=\s*(.*?);\n', webpage, 'quality info'), video_id) +        gexo_files = self._parse_json( +            self._search_regex( +                r'var\s+gexoFiles\s*=\s*({.+?});', webpage, 'gexo files'), +            video_id)          formats = [] -        for fname, url in fmts.items(): -            f = { -                'url': url, -            } -            if fname in qualities: -                qual = qualities[fname] -                f.update({ -                    'format_id': '%s_%sp' % (determine_ext(url), qual['h']), -                    'width': qual['w'], -                    
'height': qual['h'], -                    'tbr': qual['br'], -                }) -            formats.append(f) - +        for format_id, f in gexo_files.items(): +            video_url = f.get('url') +            if not video_url: +                continue +            fmt = f.get('fmt') +            height = f.get('h') +            format_id = '%s_%sp' % (fmt, height) if fmt and height else format_id +            formats.append({ +                'url': _decode(video_url), +                'format_id': format_id, +                'width': int_or_none(f.get('w')), +                'height': int_or_none(height), +                'tbr': int_or_none(f.get('br')), +            })          self._sort_formats(formats)          thumbnail = self._search_regex( diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index 5dd69bff7..5c3908f72 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cnet.py @@ -1,15 +1,11 @@  # coding: utf-8  from __future__ import unicode_literals -import json +from .theplatform import ThePlatformIE +from ..utils import int_or_none -from .common import InfoExtractor -from ..utils import ( -    ExtractorError, -) - -class CNETIE(InfoExtractor): +class CNETIE(ThePlatformIE):      _VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/'      _TESTS = [{          'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', @@ -18,25 +14,20 @@ class CNETIE(InfoExtractor):              'ext': 'flv',              'title': 'Hands-on with Microsoft Windows 8.1 Update',              'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.', -            'thumbnail': 're:^http://.*/flmswindows8.jpg$',              'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861',              'uploader': 'Sarah Mitroff', +            'duration': 70,          }, -        'params': { -            'skip_download': 'requires rtmpdump', -        }      }, { 
         'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/',          'info_dict': {              'id': '56527b93-d25d-44e3-b738-f989ce2e49ba',              'ext': 'flv', +            'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)',              'description': 'Khail and Ashley wonder what other civic woes can be solved by self-tweeting objects, investigate a new kind of VR camera and watch an origami robot self-assemble, walk, climb, dig and dissolve. #TDPothole',              'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40',              'uploader': 'Ashley Esqueda', -            'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', -        }, -        'params': { -            'skip_download': True,  # requires rtmpdump +            'duration': 1482,          },      }] @@ -45,26 +36,13 @@ class CNETIE(InfoExtractor):          webpage = self._download_webpage(url, display_id)          data_json = self._html_search_regex( -            r"<div class=\"cnetVideoPlayer\"\s+.*?data-cnet-video-options='([^']+)'", +            r"data-cnet-video(?:-uvp)?-options='([^']+)'",              webpage, 'data json') -        data = json.loads(data_json) -        vdata = data['video'] -        if not vdata: -            vdata = data['videos'][0] -        if not vdata: -            raise ExtractorError('Cannot find video data') - -        mpx_account = data['config']['players']['default']['mpx_account'] -        vid = vdata['files'].get('rtmp', vdata['files']['hds']) -        tp_link = 'http://link.theplatform.com/s/%s/%s' % (mpx_account, vid) +        data = self._parse_json(data_json, display_id) +        vdata = data.get('video') or data['videos'][0]          video_id = vdata['id'] -        title = vdata.get('headline') -        if title is None: -            title = vdata.get('title') -        if title is None: -            raise 
ExtractorError('Cannot find title!') -        thumbnail = vdata.get('image', {}).get('path') +        title = vdata['title']          author = vdata.get('author')          if author:              uploader = '%s %s' % (author['firstName'], author['lastName']) @@ -73,13 +51,34 @@ class CNETIE(InfoExtractor):              uploader = None              uploader_id = None +        mpx_account = data['config']['uvpConfig']['default']['mpx_account'] + +        metadata = self.get_metadata('%s/%s' % (mpx_account, list(vdata['files'].values())[0]), video_id) +        description = vdata.get('description') or metadata.get('description') +        duration = int_or_none(vdata.get('duration')) or metadata.get('duration') + +        formats = [] +        subtitles = {} +        for (fkey, vid) in vdata['files'].items(): +            if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']: +                continue +            release_url = 'http://link.theplatform.com/s/%s/%s?format=SMIL&mbr=true' % (mpx_account, vid) +            if fkey == 'hds': +                release_url += '&manifest=f4m' +            tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % fkey) +            formats.extend(tp_formats) +            subtitles = self._merge_subtitles(subtitles, tp_subtitles) +        self._sort_formats(formats) +          return { -            '_type': 'url_transparent', -            'url': tp_link,              'id': video_id,              'display_id': display_id,              'title': title, +            'description': description, +            'thumbnail': metadata.get('thumbnail'), +            'duration': duration,              'uploader': uploader,              'uploader_id': uploader_id, -            'thumbnail': thumbnail, +            'subtitles': subtitles, +            'formats': formats,          } diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py index 
81f3d7697..2efa200b5 100644 --- a/youtube_dl/extractor/comcarcoff.py +++ b/youtube_dl/extractor/comcarcoff.py @@ -1,10 +1,12 @@  # encoding: utf-8  from __future__ import unicode_literals -import json -  from .common import InfoExtractor -from ..utils import parse_iso8601 +from ..utils import ( +    int_or_none, +    parse_duration, +    parse_iso8601, +)  class ComCarCoffIE(InfoExtractor): @@ -16,6 +18,7 @@ class ComCarCoffIE(InfoExtractor):              'ext': 'mp4',              'upload_date': '20141127',              'timestamp': 1417107600, +            'duration': 1232,              'title': 'Happy Thanksgiving Miranda',              'description': 'Jerry Seinfeld and his special guest Miranda Sings cruise around town in search of coffee, complaining and apologizing along the way.',              'thumbnail': 'http://ccc.crackle.com/images/s5e4_thumb.jpg', @@ -31,9 +34,10 @@ class ComCarCoffIE(InfoExtractor):              display_id = 'comediansincarsgettingcoffee.com'          webpage = self._download_webpage(url, display_id) -        full_data = json.loads(self._search_regex( -            r'<script type="application/json" id="videoData">(?P<json>.+?)</script>', -            webpage, 'full data json')) +        full_data = self._parse_json( +            self._search_regex( +                r'window\.app\s*=\s*({.+?});\n', webpage, 'full data json'), +            display_id)['videoData']          video_id = full_data['activeVideo']['video']          video_data = full_data.get('videos', {}).get(video_id) or full_data['singleshots'][video_id] @@ -45,12 +49,18 @@ class ComCarCoffIE(InfoExtractor):          formats = self._extract_m3u8_formats(              video_data['mediaUrl'], video_id, ext='mp4') +        timestamp = int_or_none(video_data.get('pubDateTime')) or parse_iso8601( +            video_data.get('pubDate')) +        duration = int_or_none(video_data.get('durationSeconds')) or parse_duration( +            video_data.get('duration')) +          return 
{              'id': video_id,              'display_id': display_id,              'title': video_data['title'],              'description': video_data.get('description'), -            'timestamp': parse_iso8601(video_data.get('pubDate')), +            'timestamp': timestamp, +            'duration': duration,              'thumbnails': thumbnails,              'formats': formats,              'webpage_url': 'http://comediansincarsgettingcoffee.com/%s' % (video_data.get('urlSlug', video_data.get('slug'))), diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 76f5b8b05..34a28c126 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -29,6 +29,7 @@ from ..utils import (      clean_html,      compiled_regex_type,      determine_ext, +    error_to_compat_str,      ExtractorError,      fix_xml_ampersands,      float_or_none, @@ -332,7 +333,8 @@ class InfoExtractor(object):                  return False              if errnote is None:                  errnote = 'Unable to download webpage' -            errmsg = '%s: %s' % (errnote, compat_str(err)) + +            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))              if fatal:                  raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)              else: @@ -622,7 +624,7 @@ class InfoExtractor(object):                  else:                      raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)              except (IOError, netrc.NetrcParseError) as err: -                self._downloader.report_warning('parsing .netrc: %s' % compat_str(err)) +                self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))          return (username, password) @@ -882,7 +884,7 @@ class InfoExtractor(object):              fatal=fatal)          if manifest is False: -            return manifest +            return []          formats = []          manifest_version = '1.0' @@ -953,7 +955,7 @@ 
class InfoExtractor(object):              errnote=errnote or 'Failed to download m3u8 information',              fatal=fatal)          if res is False: -            return res +            return []          m3u8_doc, urlh = res          m3u8_url = urlh.geturl()          last_info = None diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index 7b685d157..b3ee67018 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -58,18 +58,23 @@ class CSpanIE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url) +        video_type = None          webpage = self._download_webpage(url, video_id) -        matches = re.search(r'data-(prog|clip)id=\'([0-9]+)\'', webpage) -        if matches: +        # We first look for clipid, because clipprog always appears before +        patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')] +        results = list(filter(None, (re.search(p, webpage) for p in patterns))) +        if results: +            matches = results[0]              video_type, video_id = matches.groups() -            if video_type == 'prog': -                video_type = 'program' +            video_type = 'clip' if video_type == 'id' else 'program'          else:              senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)              if senate_isvp_url:                  title = self._og_search_title(webpage)                  surl = smuggle_url(senate_isvp_url, {'force_title': title})                  return self.url_result(surl, 'SenateISVP', video_id, title) +        if video_type is None or video_id is None: +            raise ExtractorError('unable to find video id and type')          def get_text_attr(d, attr):              return d.get(attr, {}).get('#text') diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index ab7f3aec4..0c5b6617f 100644 --- a/youtube_dl/extractor/dailymotion.py +++ 
b/youtube_dl/extractor/dailymotion.py @@ -7,10 +7,10 @@ import itertools  from .common import InfoExtractor -from ..compat import compat_str  from ..utils import ( -    ExtractorError,      determine_ext, +    error_to_compat_str, +    ExtractorError,      int_or_none,      parse_iso8601,      sanitized_Request, @@ -99,6 +99,11 @@ class DailymotionIE(DailymotionBaseInfoExtractor):          {              'url': 'http://www.dailymotion.com/video/xhza0o',              'only_matching': True, +        }, +        # with subtitles +        { +            'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news', +            'only_matching': True,          }      ] @@ -122,7 +127,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor):              webpage, 'comment count', fatal=False))          player_v5 = self._search_regex( -            [r'buildPlayer\(({.+?})\);', r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);'], +            [r'buildPlayer\(({.+?})\);\n',  # See https://github.com/rg3/youtube-dl/issues/7826 +             r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);', +             r'buildPlayer\(({.+?})\);'],              webpage, 'player v5', default=None)          if player_v5:              player = self._parse_json(player_v5, video_id) @@ -172,11 +179,13 @@ class DailymotionIE(DailymotionBaseInfoExtractor):              uploader_id = metadata.get('owner', {}).get('id')              subtitles = {} -            for subtitle_lang, subtitle in metadata.get('subtitles', {}).get('data', {}).items(): -                subtitles[subtitle_lang] = [{ -                    'ext': determine_ext(subtitle_url), -                    'url': subtitle_url, -                } for subtitle_url in subtitle.get('urls', [])] +            subtitles_data = metadata.get('subtitles', {}).get('data', {}) +            if subtitles_data and isinstance(subtitles_data, dict): +                for subtitle_lang, subtitle 
in subtitles_data.items(): +                    subtitles[subtitle_lang] = [{ +                        'ext': determine_ext(subtitle_url), +                        'url': subtitle_url, +                    } for subtitle_url in subtitle.get('urls', [])]              return {                  'id': video_id, @@ -269,7 +278,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):                  'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id,                  video_id, note=False)          except ExtractorError as err: -            self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err)) +            self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))              return {}          info = json.loads(sub_list)          if (info['total'] > 0): diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index 934da765e..9a94cf361 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -25,6 +25,18 @@ class DaumIE(InfoExtractor):              'duration': 3868,          },      }, { +        # Test for https://github.com/rg3/youtube-dl/issues/7949 +        'url': 'http://tvpot.daum.net/mypot/View.do?ownerid=M1O35s8HPOo0&clipid=73147290', +        'md5': 'c92d78bcee4424451f1667f275c1dc97', +        'info_dict': { +            'id': '73147290', +            'ext': 'mp4', +            'title': '싸이 - 나팔바지 [유희열의 스케치북] 299회 20151218', +            'description': '싸이 - 나팔바지', +            'upload_date': '20151219', +            'duration': 232, +        }, +    }, {          'url': 'http://tvpot.daum.net/v/vab4dyeDBysyBssyukBUjBz',          'only_matching': True,      }, { @@ -37,9 +49,11 @@ class DaumIE(InfoExtractor):          video_id = mobj.group('id')          canonical_url = 'http://tvpot.daum.net/v/%s' % video_id          webpage = self._download_webpage(canonical_url, video_id) +        og_url = 
self._og_search_url(webpage, default=None) or self._search_regex( +            r'<link[^>]+rel=(["\'])canonical\1[^>]+href=(["\'])(?P<url>.+?)\2', +            webpage, 'canonical url', group='url')          full_id = self._search_regex( -            r'src=["\']http://videofarm\.daum\.net/controller/video/viewer/Video\.html\?.*?vid=(.+?)[&"\']', -            webpage, 'full id') +            r'tvpot\.daum\.net/v/([^/]+)', og_url, 'full id')          query = compat_urllib_parse.urlencode({'vid': full_id})          info = self._download_xml(              'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id, diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index 9737cff14..0d140f12f 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -1,26 +1,89 @@  # coding: utf-8  from __future__ import unicode_literals +import re +import base64 +  from .common import InfoExtractor  from ..compat import compat_urllib_parse  from ..utils import (      int_or_none,      parse_iso8601,      sanitized_Request, +    smuggle_url, +    unsmuggle_url,  )  class DCNIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/.+|show/\d+/.+?)/(?P<id>\d+)' +    _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?' 
+ +    def _real_extract(self, url): +        show_id, video_id, season_id = re.match(self._VALID_URL, url).groups() +        if video_id and int(video_id) > 0: +            return self.url_result( +                'http://www.dcndigital.ae/media/%s' % video_id, 'DCNVideo') +        elif season_id and int(season_id) > 0: +            return self.url_result(smuggle_url( +                'http://www.dcndigital.ae/program/season/%s' % season_id, +                {'show_id': show_id}), 'DCNSeason') +        else: +            return self.url_result( +                'http://www.dcndigital.ae/program/%s' % show_id, 'DCNSeason') + + +class DCNBaseIE(InfoExtractor): +    def _extract_video_info(self, video_data, video_id, is_live): +        title = video_data.get('title_en') or video_data['title_ar'] +        img = video_data.get('img') +        thumbnail = 'http://admin.mangomolo.com/analytics/%s' % img if img else None +        duration = int_or_none(video_data.get('duration')) +        description = video_data.get('description_en') or video_data.get('description_ar') +        timestamp = parse_iso8601(video_data.get('create_time'), ' ') + +        return { +            'id': video_id, +            'title': self._live_title(title) if is_live else title, +            'description': description, +            'thumbnail': thumbnail, +            'duration': duration, +            'timestamp': timestamp, +            'is_live': is_live, +        } + +    def _extract_video_formats(self, webpage, video_id, entry_protocol): +        formats = [] +        m3u8_url = self._html_search_regex( +            r'file\s*:\s*"([^"]+)', webpage, 'm3u8 url', fatal=False) +        if m3u8_url: +            m3u8_formats = self._extract_m3u8_formats( +                m3u8_url, video_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=None) +            if m3u8_formats: +                formats.extend(m3u8_formats) + +        rtsp_url = self._search_regex( +            
r'<a[^>]+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False) +        if rtsp_url: +            formats.append({ +                'url': rtsp_url, +                'format_id': 'rtsp', +            }) + +        self._sort_formats(formats) +        return formats + + +class DCNVideoIE(DCNBaseIE): +    IE_NAME = 'dcn:video' +    _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/[^/]+|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)'      _TEST = { -        'url': 'http://www.dcndigital.ae/#/show/199074/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375/6887', +        'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375',          'info_dict':          {              'id': '17375',              'ext': 'mp4',              'title': 'رحلة العمر : الحلقة 1',              'description': 'md5:0156e935d870acb8ef0a66d24070c6d6', -            'thumbnail': 're:^https?://.*\.jpg$',              'duration': 2041,              'timestamp': 1227504126,              'upload_date': '20081124', @@ -37,46 +100,95 @@ class DCNIE(InfoExtractor):          request = sanitized_Request(              'http://admin.mangomolo.com/analytics/index.php/plus/video?id=%s' % video_id,              headers={'Origin': 'http://www.dcndigital.ae'}) - -        video = self._download_json(request, video_id) -        title = video.get('title_en') or video['title_ar'] +        video_data = self._download_json(request, video_id) +        info = self._extract_video_info(video_data, video_id, False)          webpage = self._download_webpage(              'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' 
+              compat_urllib_parse.urlencode({ -                'id': video['id'], -                'user_id': video['user_id'], -                'signature': video['signature'], +                'id': video_data['id'], +                'user_id': video_data['user_id'], +                'signature': video_data['signature'],                  'countries': 'Q0M=',                  'filter': 'DENY',              }), video_id) +        info['formats'] = self._extract_video_formats(webpage, video_id, 'm3u8_native') +        return info -        m3u8_url = self._html_search_regex(r'file:\s*"([^"]+)', webpage, 'm3u8 url') -        formats = self._extract_m3u8_formats( -            m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') -        rtsp_url = self._search_regex( -            r'<a[^>]+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False) -        if rtsp_url: -            formats.append({ -                'url': rtsp_url, -                'format_id': 'rtsp', +class DCNLiveIE(DCNBaseIE): +    IE_NAME = 'dcn:live' +    _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?live/(?P<id>\d+)' + +    def _real_extract(self, url): +        channel_id = self._match_id(url) + +        request = sanitized_Request( +            'http://admin.mangomolo.com/analytics/index.php/plus/getchanneldetails?channel_id=%s' % channel_id, +            headers={'Origin': 'http://www.dcndigital.ae'}) + +        channel_data = self._download_json(request, channel_id) +        info = self._extract_video_info(channel_data, channel_id, True) + +        webpage = self._download_webpage( +            'http://admin.mangomolo.com/analytics/index.php/customers/embed/index?' 
+ +            compat_urllib_parse.urlencode({ +                'id': base64.b64encode(channel_data['user_id'].encode()).decode(), +                'channelid': base64.b64encode(channel_data['id'].encode()).decode(), +                'signature': channel_data['signature'], +                'countries': 'Q0M=', +                'filter': 'DENY', +            }), channel_id) +        info['formats'] = self._extract_video_formats(webpage, channel_id, 'm3u8') +        return info + + +class DCNSeasonIE(InfoExtractor): +    IE_NAME = 'dcn:season' +    _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))' +    _TEST = { +        'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A', +        'info_dict': +        { +            'id': '7910', +            'title': 'محاضرات الشيخ الشعراوي', +        }, +        'playlist_mincount': 27, +    } + +    def _real_extract(self, url): +        url, smuggled_data = unsmuggle_url(url, {}) +        show_id, season_id = re.match(self._VALID_URL, url).groups() + +        data = {} +        if season_id: +            data['season'] = season_id +            show_id = smuggled_data.get('show_id') +            if show_id is None: +                request = sanitized_Request( +                    'http://admin.mangomolo.com/analytics/index.php/plus/season_info?id=%s' % season_id, +                    headers={'Origin': 'http://www.dcndigital.ae'}) +                season = self._download_json(request, season_id) +                show_id = season['id'] +        data['show_id'] = show_id +        request = sanitized_Request( +            'http://admin.mangomolo.com/analytics/index.php/plus/show', +            compat_urllib_parse.urlencode(data), +            { +                'Origin': 'http://www.dcndigital.ae', +                'Content-Type': 
'application/x-www-form-urlencoded'              }) -        self._sort_formats(formats) +        show = self._download_json(request, show_id) +        if not season_id: +            season_id = show['default_season'] +        for season in show['seasons']: +            if season['id'] == season_id: +                title = season.get('title_en') or season['title_ar'] -        img = video.get('img') -        thumbnail = 'http://admin.mangomolo.com/analytics/%s' % img if img else None -        duration = int_or_none(video.get('duration')) -        description = video.get('description_en') or video.get('description_ar') -        timestamp = parse_iso8601(video.get('create_time') or video.get('update_time'), ' ') +                entries = [] +                for video in show['videos']: +                    entries.append(self.url_result( +                        'http://www.dcndigital.ae/media/%s' % video['id'], 'DCNVideo')) -        return { -            'id': video_id, -            'title': title, -            'description': description, -            'thumbnail': thumbnail, -            'duration': duration, -            'timestamp': timestamp, -            'formats': formats, -        } +                return self.playlist_result(entries, season_id, title) diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index d836c1a6c..60ed438f8 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals  import itertools -from .common import InfoExtractor +from .amp import AMPIE  from ..compat import (      compat_HTTPError,      compat_urllib_parse, @@ -12,14 +12,11 @@ from ..compat import (  from ..utils import (      ExtractorError,      clean_html, -    determine_ext, -    int_or_none, -    parse_iso8601,      sanitized_Request,  ) -class DramaFeverBaseIE(InfoExtractor): +class DramaFeverBaseIE(AMPIE):      _LOGIN_URL = 
'https://www.dramafever.com/accounts/login/'      _NETRC_MACHINE = 'dramafever' @@ -80,60 +77,25 @@ class DramaFeverIE(DramaFeverBaseIE):              'timestamp': 1404336058,              'upload_date': '20140702',              'duration': 343, -        } +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        },      }      def _real_extract(self, url):          video_id = self._match_id(url).replace('/', '.')          try: -            feed = self._download_json( -                'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id, -                video_id, 'Downloading episode JSON')['channel']['item'] +            info = self._extract_feed_info( +                'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id)          except ExtractorError as e:              if isinstance(e.cause, compat_HTTPError):                  raise ExtractorError(                      'Currently unavailable in your country.', expected=True)              raise -        media_group = feed.get('media-group', {}) - -        formats = [] -        for media_content in media_group['media-content']: -            src = media_content.get('@attributes', {}).get('url') -            if not src: -                continue -            ext = determine_ext(src) -            if ext == 'f4m': -                formats.extend(self._extract_f4m_formats( -                    src, video_id, f4m_id='hds')) -            elif ext == 'm3u8': -                formats.extend(self._extract_m3u8_formats( -                    src, video_id, 'mp4', m3u8_id='hls')) -            else: -                formats.append({ -                    'url': src, -                }) -        self._sort_formats(formats) - -        title = media_group.get('media-title') -        description = media_group.get('media-description') -        duration = int_or_none(media_group['media-content'][0].get('@attributes', {}).get('duration')) -        
thumbnail = self._proto_relative_url( -            media_group.get('media-thumbnail', {}).get('@attributes', {}).get('url')) -        timestamp = parse_iso8601(feed.get('pubDate'), ' ') - -        subtitles = {} -        for media_subtitle in media_group.get('media-subTitle', []): -            lang = media_subtitle.get('@attributes', {}).get('lang') -            href = media_subtitle.get('@attributes', {}).get('href') -            if not lang or not href: -                continue -            subtitles[lang] = [{ -                'ext': 'ttml', -                'url': href, -            }] -          series_id, episode_number = video_id.split('.')          episode_info = self._download_json(              # We only need a single episode info, so restricting page size to one episode @@ -146,21 +108,12 @@ class DramaFeverIE(DramaFeverBaseIE):              if value:                  subfile = value[0].get('subfile') or value[0].get('new_subfile')                  if subfile and subfile != 'http://www.dramafever.com/st/': -                    subtitles.setdefault('English', []).append({ +                    info['subtitles'].setdefault('English', []).append({                          'ext': 'srt',                          'url': subfile,                      }) -        return { -            'id': video_id, -            'title': title, -            'description': description, -            'thumbnail': thumbnail, -            'timestamp': timestamp, -            'duration': duration, -            'formats': formats, -            'subtitles': subtitles, -        } +        return info  class DramaFeverSeriesIE(DramaFeverBaseIE): diff --git a/youtube_dl/extractor/ellentv.py b/youtube_dl/extractor/ellentv.py index 02c6a4615..476cce2d0 100644 --- a/youtube_dl/extractor/ellentv.py +++ b/youtube_dl/extractor/ellentv.py @@ -13,12 +13,12 @@ class EllenTVIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?(?:ellentv|ellentube)\.com/videos/(?P<id>[a-z0-9_-]+)'      _TEST = 
{          'url': 'http://www.ellentv.com/videos/0-ipq1gsai/', -        'md5': '8e3c576bf2e9bfff4d76565f56f94c9c', +        'md5': '4294cf98bc165f218aaa0b89e0fd8042',          'info_dict': {              'id': '0_ipq1gsai', -            'ext': 'mp4', +            'ext': 'mov',              'title': 'Fast Fingers of Fate', -            'description': 'md5:587e79fbbd0d73b148bc596d99ce48e6', +            'description': 'md5:3539013ddcbfa64b2a6d1b38d910868a',              'timestamp': 1428035648,              'upload_date': '20150403',              'uploader_id': 'batchUser', diff --git a/youtube_dl/extractor/esri.py b/youtube_dl/extractor/esri.py index bf5d2019f..d4205d7fb 100644 --- a/youtube_dl/extractor/esri.py +++ b/youtube_dl/extractor/esri.py @@ -61,7 +61,7 @@ class EsriVideoIE(InfoExtractor):              webpage, 'duration', fatal=False))          upload_date = unified_strdate(self._html_search_meta( -            'last-modified', webpage, 'upload date', fatal=None)) +            'last-modified', webpage, 'upload date', fatal=False))          return {              'id': video_id, diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 321eec59e..5e43f2359 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -7,11 +7,11 @@ import socket  from .common import InfoExtractor  from ..compat import (      compat_http_client, -    compat_str,      compat_urllib_error,      compat_urllib_parse_unquote,  )  from ..utils import ( +    error_to_compat_str,      ExtractorError,      limit_length,      sanitized_Request, @@ -74,7 +74,7 @@ class FacebookIE(InfoExtractor):              return          login_page_req = sanitized_Request(self._LOGIN_URL) -        login_page_req.add_header('Cookie', 'locale=en_US') +        self._set_cookie('facebook.com', 'locale', 'en_US')          login_page = self._download_webpage(login_page_req, None,                                              note='Downloading login page', 
                                             errnote='Unable to download login page') @@ -100,13 +100,25 @@ class FacebookIE(InfoExtractor):              login_results = self._download_webpage(request, None,                                                     note='Logging in', errnote='unable to fetch login page')              if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None: +                error = self._html_search_regex( +                    r'(?s)<div[^>]+class=(["\']).*?login_error_box.*?\1[^>]*><div[^>]*>.*?</div><div[^>]*>(?P<error>.+?)</div>', +                    login_results, 'login error', default=None, group='error') +                if error: +                    raise ExtractorError('Unable to login: %s' % error, expected=True)                  self._downloader.report_warning('unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')                  return +            fb_dtsg = self._search_regex( +                r'name="fb_dtsg" value="(.+?)"', login_results, 'fb_dtsg', default=None) +            h = self._search_regex( +                r'name="h"\s+(?:\w+="[^"]+"\s+)*?value="([^"]+)"', login_results, 'h', default=None) + +            if not fb_dtsg or not h: +                return +              check_form = { -                'fb_dtsg': self._search_regex(r'name="fb_dtsg" value="(.+?)"', login_results, 'fb_dtsg'), -                'h': self._search_regex( -                    r'name="h"\s+(?:\w+="[^"]+"\s+)*?value="([^"]+)"', login_results, 'h'), +                'fb_dtsg': fb_dtsg, +                'h': h,                  'name_action_selected': 'dont_save',              }              check_req = sanitized_Request(self._CHECKPOINT_URL, urlencode_postdata(check_form)) @@ -116,7 +128,7 @@ class FacebookIE(InfoExtractor):              if re.search(r'id="checkpointSubmitButton"', check_response) is not None:                  self._downloader.report_warning('Unable 
to confirm login, you have to login in your brower and authorize the login.')          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -            self._downloader.report_warning('unable to log in: %s' % compat_str(err)) +            self._downloader.report_warning('unable to log in: %s' % error_to_compat_str(err))              return      def _real_initialize(self): diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py index cebdd0193..6f9b003c2 100644 --- a/youtube_dl/extractor/faz.py +++ b/youtube_dl/extractor/faz.py @@ -2,6 +2,11 @@  from __future__ import unicode_literals  from .common import InfoExtractor +from ..utils import ( +    xpath_element, +    xpath_text, +    int_or_none, +)  class FazIE(InfoExtractor): @@ -37,31 +42,32 @@ class FazIE(InfoExtractor):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) +        description = self._og_search_description(webpage)          config_xml_url = self._search_regex( -            r'writeFLV\(\'(.+?)\',', webpage, 'config xml url') +            r'videoXMLURL\s*=\s*"([^"]+)', webpage, 'config xml url')          config = self._download_xml(              config_xml_url, video_id, 'Downloading config xml') -        encodings = config.find('ENCODINGS') +        encodings = xpath_element(config, 'ENCODINGS', 'encodings', True)          formats = []          for pref, code in enumerate(['LOW', 'HIGH', 'HQ']): -            encoding = encodings.find(code) -            if encoding is None: -                continue -            encoding_url = encoding.find('FILENAME').text -            formats.append({ -                'url': encoding_url, -                'format_id': code.lower(), -                'quality': pref, -            }) +            encoding = xpath_element(encodings, code) +            if encoding: +                encoding_url = xpath_text(encoding, 'FILENAME') +                if encoding_url: +      
              formats.append({ +                        'url': encoding_url, +                        'format_id': code.lower(), +                        'quality': pref, +                        'tbr': int_or_none(xpath_text(encoding, 'AVERAGEBITRATE')), +                    })          self._sort_formats(formats) -        descr = self._html_search_regex( -            r'<p class="Content Copy">(.*?)</p>', webpage, 'description', fatal=False)          return {              'id': video_id,              'title': self._og_search_title(webpage),              'formats': formats, -            'description': descr, -            'thumbnail': config.find('STILL/STILL_BIG').text, +            'description': description.strip() if description else None, +            'thumbnail': xpath_text(config, 'STILL/STILL_BIG'), +            'duration': int_or_none(xpath_text(config, 'DURATION')),          } diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py index 40ea27895..5f6e65dae 100644 --- a/youtube_dl/extractor/fktv.py +++ b/youtube_dl/extractor/fktv.py @@ -1,12 +1,10 @@  from __future__ import unicode_literals -import re -  from .common import InfoExtractor  from ..utils import (      clean_html,      determine_ext, -    ExtractorError, +    js_to_json,  ) @@ -32,24 +30,22 @@ class FKTVIE(InfoExtractor):              'http://fernsehkritik.tv/folge-%s/play' % episode, episode)          title = clean_html(self._html_search_regex(              '<h3>([^<]+)</h3>', webpage, 'title')) -        matches = re.search( -            r'(?s)<video(?:(?!poster)[^>])+(?:poster="([^"]+)")?[^>]*>(.*)</video>', -            webpage) -        if matches is None: -            raise ExtractorError('Unable to extract the video') - -        poster, sources = matches.groups() -        if poster is None: -            self.report_warning('unable to extract thumbnail') - -        urls = re.findall(r'<source[^>]+src="([^"]+)"', sources) -        formats = [{ -            'url': furl, -  
          'format_id': determine_ext(furl), -        } for furl in urls] +        thumbnail = self._search_regex(r'POSTER\s*=\s*"([^"]+)', webpage, 'thumbnail', fatal=False) +        sources = self._parse_json(self._search_regex(r'(?s)MEDIA\s*=\s*(\[.+?\]);', webpage, 'media'), episode, js_to_json) + +        formats = [] +        for source in sources: +            furl = source.get('src') +            if furl: +                formats.append({ +                    'url': furl, +                    'format_id': determine_ext(furl), +                }) +        self._sort_formats(formats) +          return {              'id': episode,              'title': title,              'formats': formats, -            'thumbnail': poster, +            'thumbnail': thumbnail,          } diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 91cd46e76..18f439df9 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -1,67 +1,93 @@  from __future__ import unicode_literals -import re -  from .common import InfoExtractor +from ..compat import compat_urllib_parse  from ..utils import (      ExtractorError, -    find_xpath_attr, -    sanitized_Request, +    int_or_none, +    qualities,  )  class FlickrIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.|secure\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*' +    _VALID_URL = r'https?://(?:www\.|secure\.)?flickr\.com/photos/[\w\-_@]+/(?P<id>\d+)'      _TEST = {          'url': 'http://www.flickr.com/photos/forestwander-nature-pictures/5645318632/in/photostream/', -        'md5': '6fdc01adbc89d72fc9c4f15b4a4ba87b', +        'md5': '164fe3fa6c22e18d448d4d5af2330f31',          'info_dict': {              'id': '5645318632', -            'ext': 'mp4', -            "description": "Waterfalls in the Springtime at Dark Hollow Waterfalls. These are located just off of Skyline Drive in Virginia. 
They are only about 6/10 of a mile hike but it is a pretty steep hill and a good climb back up.", -            "uploader_id": "forestwander-nature-pictures", -            "title": "Dark Hollow Waterfalls" +            'ext': 'mpg', +            'description': 'Waterfalls in the Springtime at Dark Hollow Waterfalls. These are located just off of Skyline Drive in Virginia. They are only about 6/10 of a mile hike but it is a pretty steep hill and a good climb back up.', +            'title': 'Dark Hollow Waterfalls', +            'duration': 19, +            'timestamp': 1303528740, +            'upload_date': '20110423', +            'uploader_id': '10922353@N03', +            'uploader': 'Forest Wander', +            'comment_count': int, +            'view_count': int, +            'tags': list,          }      } -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) +    _API_BASE_URL = 'https://api.flickr.com/services/rest?' -        video_id = mobj.group('id') -        video_uploader_id = mobj.group('uploader_id') -        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id -        req = sanitized_Request(webpage_url) -        req.add_header( -            'User-Agent', -            # it needs a more recent version -            'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20150101 Firefox/38.0 (Chrome)') -        webpage = self._download_webpage(req, video_id) +    def _call_api(self, method, video_id, api_key, note, secret=None): +        query = { +            'photo_id': video_id, +            'method': 'flickr.%s' % method, +            'api_key': api_key, +            'format': 'json', +            'nojsoncallback': 1, +        } +        if secret: +            query['secret'] = secret +        data = self._download_json(self._API_BASE_URL + compat_urllib_parse.urlencode(query), video_id, note) +        if data['stat'] != 'ok': +            raise ExtractorError(data['message']) +        return 
data -        secret = self._search_regex(r'secret"\s*:\s*"(\w+)"', webpage, 'secret') +    def _real_extract(self, url): +        video_id = self._match_id(url) -        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self' -        first_xml = self._download_xml(first_url, video_id, 'Downloading first data webpage') +        api_key = self._download_json( +            'https://www.flickr.com/hermes_error_beacon.gne', video_id, +            'Downloading api key')['site_key'] -        node_id = find_xpath_attr( -            first_xml, './/{http://video.yahoo.com/YEP/1.0/}Item', 'id', -            'id').text +        video_info = self._call_api( +            'photos.getInfo', video_id, api_key, 'Downloading video info')['photo'] +        if video_info['media'] == 'video': +            streams = self._call_api( +                'video.getStreamInfo', video_id, api_key, +                'Downloading streams info', video_info['secret'])['streams'] -        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1' -        second_xml = self._download_xml(second_url, video_id, 'Downloading second data webpage') +            preference = qualities( +                ['288p', 'iphone_wifi', '100', '300', '700', '360p', 'appletv', '720p', '1080p', 'orig']) -        self.report_extraction(video_id) +            formats = [] +            for stream in streams['stream']: +                stream_type = str(stream.get('type')) +                formats.append({ +                    'format_id': stream_type, +                    'url': stream['_content'], +                    'preference': preference(stream_type), +                }) +            self._sort_formats(formats) -        stream = second_xml.find('.//STREAM') -        if stream is None: -            raise 
ExtractorError('Unable to extract video url') -        video_url = stream.attrib['APP'] + stream.attrib['FULLPATH'] +            owner = video_info.get('owner', {}) -        return { -            'id': video_id, -            'url': video_url, -            'ext': 'mp4', -            'title': self._og_search_title(webpage), -            'description': self._og_search_description(webpage), -            'thumbnail': self._og_search_thumbnail(webpage), -            'uploader_id': video_uploader_id, -        } +            return { +                'id': video_id, +                'title': video_info['title']['_content'], +                'description': video_info.get('description', {}).get('_content'), +                'formats': formats, +                'timestamp': int_or_none(video_info.get('dateuploaded')), +                'duration': int_or_none(video_info.get('video', {}).get('duration')), +                'uploader_id': owner.get('nsid'), +                'uploader': owner.get('realname'), +                'comment_count': int_or_none(video_info.get('comments', {}).get('_content')), +                'view_count': int_or_none(video_info.get('views')), +                'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])] +            } +        else: +            raise ExtractorError('not a video', expected=True) diff --git a/youtube_dl/extractor/footyroom.py b/youtube_dl/extractor/footyroom.py index 4c7dbca40..370fd006f 100644 --- a/youtube_dl/extractor/footyroom.py +++ b/youtube_dl/extractor/footyroom.py @@ -13,6 +13,7 @@ class FootyRoomIE(InfoExtractor):              'title': 'Schalke 04 0 – 2 Real Madrid',          },          'playlist_count': 3, +        'skip': 'Video for this match is not available',      }, {          'url': 'http://footyroom.com/georgia-0-2-germany-2015-03/',          'info_dict': { diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 3a4a59135..318ac013d 100644 --- 
a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -2,14 +2,10 @@ from __future__ import unicode_literals  import re -from .common import InfoExtractor -from ..utils import ( -    parse_iso8601, -    int_or_none, -) +from .amp import AMPIE -class FoxNewsIE(InfoExtractor): +class FoxNewsIE(AMPIE):      IE_DESC = 'Fox News and Fox Business Video'      _VALID_URL = r'https?://(?P<host>video\.fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)'      _TESTS = [ @@ -20,10 +16,10 @@ class FoxNewsIE(InfoExtractor):                  'id': '3937480',                  'ext': 'flv',                  'title': 'Frozen in Time', -                'description': 'Doctors baffled by 16-year-old girl that is the size of a toddler', +                'description': '16-year-old girl is size of toddler',                  'duration': 265, -                'timestamp': 1304411491, -                'upload_date': '20110503', +                # 'timestamp': 1304411491, +                # 'upload_date': '20110503',                  'thumbnail': 're:^https?://.*\.jpg$',              },          }, @@ -34,10 +30,10 @@ class FoxNewsIE(InfoExtractor):                  'id': '3922535568001',                  'ext': 'mp4',                  'title': "Rep. 
Luis Gutierrez on if Obama's immigration plan is legal", -                'description': "Congressman discusses the president's executive action", +                'description': "Congressman discusses president's plan",                  'duration': 292, -                'timestamp': 1417662047, -                'upload_date': '20141204', +                # 'timestamp': 1417662047, +                # 'upload_date': '20141204',                  'thumbnail': 're:^https?://.*\.jpg$',              },          }, @@ -52,52 +48,9 @@ class FoxNewsIE(InfoExtractor):      ]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') -        host = mobj.group('host') +        host, video_id = re.match(self._VALID_URL, url).groups() -        video = self._download_json( -            'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id), video_id) - -        item = video['channel']['item'] -        title = item['title'] -        description = item['description'] -        timestamp = parse_iso8601(item['dc-date']) - -        media_group = item['media-group'] -        duration = None -        formats = [] -        for media in media_group['media-content']: -            attributes = media['@attributes'] -            video_url = attributes['url'] -            if video_url.endswith('.f4m'): -                formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id)) -            elif video_url.endswith('.m3u8'): -                formats.extend(self._extract_m3u8_formats(video_url, video_id, 'flv')) -            elif not video_url.endswith('.smil'): -                duration = int_or_none(attributes.get('duration')) -                formats.append({ -                    'url': video_url, -                    'format_id': media['media-category']['@attributes']['label'], -                    'preference': 1, -                    'vbr': 
int_or_none(attributes.get('bitrate')), -                    'filesize': int_or_none(attributes.get('fileSize')) -                }) -        self._sort_formats(formats) - -        media_thumbnail = media_group['media-thumbnail']['@attributes'] -        thumbnails = [{ -            'url': media_thumbnail['url'], -            'width': int_or_none(media_thumbnail.get('width')), -            'height': int_or_none(media_thumbnail.get('height')), -        }] if media_thumbnail else [] - -        return { -            'id': video_id, -            'title': title, -            'description': description, -            'duration': duration, -            'timestamp': timestamp, -            'formats': formats, -            'thumbnails': thumbnails, -        } +        info = self._extract_feed_info( +            'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id)) +        info['id'] = video_id +        return info diff --git a/youtube_dl/extractor/franceinter.py b/youtube_dl/extractor/franceinter.py index 6613ee17a..fdc51f44f 100644 --- a/youtube_dl/extractor/franceinter.py +++ b/youtube_dl/extractor/franceinter.py @@ -1,8 +1,6 @@  # coding: utf-8  from __future__ import unicode_literals -import re -  from .common import InfoExtractor  from ..utils import int_or_none @@ -23,8 +21,7 @@ class FranceInterIE(InfoExtractor):      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) @@ -33,7 +30,7 @@ class FranceInterIE(InfoExtractor):          video_url = 'http://www.franceinter.fr/' + path          title = self._html_search_regex( -            r'<span class="title">(.+?)</span>', webpage, 'title') +            r'<span class="title-diffusion">(.+?)</span>', webpage, 'title')          description = self._html_search_regex(              r'<span class="description">(.*?)</span>',              webpage, 
'description', fatal=False) diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py new file mode 100644 index 000000000..d1a95d87f --- /dev/null +++ b/youtube_dl/extractor/funimation.py @@ -0,0 +1,193 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    clean_html, +    determine_ext, +    encode_dict, +    int_or_none, +    sanitized_Request, +    ExtractorError, +    urlencode_postdata +) + + +class FunimationIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?funimation\.com/shows/[^/]+/videos/(?:official|promotional)/(?P<id>[^/?#&]+)' + +    _NETRC_MACHINE = 'funimation' + +    _TESTS = [{ +        'url': 'http://www.funimation.com/shows/air/videos/official/breeze', +        'info_dict': { +            'id': '658', +            'display_id': 'breeze', +            'ext': 'mp4', +            'title': 'Air - 1 - Breeze', +            'description': 'md5:1769f43cd5fc130ace8fd87232207892', +            'thumbnail': 're:https?://.*\.jpg', +        }, +    }, { +        'url': 'http://www.funimation.com/shows/hacksign/videos/official/role-play', +        'info_dict': { +            'id': '31128', +            'display_id': 'role-play', +            'ext': 'mp4', +            'title': '.hack//SIGN - 1 - Role Play', +            'description': 'md5:b602bdc15eef4c9bbb201bb6e6a4a2dd', +            'thumbnail': 're:https?://.*\.jpg', +        }, +    }, { +        'url': 'http://www.funimation.com/shows/attack-on-titan-junior-high/videos/promotional/broadcast-dub-preview', +        'info_dict': { +            'id': '9635', +            'display_id': 'broadcast-dub-preview', +            'ext': 'mp4', +            'title': 'Attack on Titan: Junior High - Broadcast Dub Preview', +            'description': 'md5:f8ec49c0aff702a7832cd81b8a44f803', +            'thumbnail': 're:https?://.*\.(?:jpg|png)', +        }, +    }] + +    def _login(self): +        (username, 
password) = self._get_login_info() +        if username is None: +            return +        data = urlencode_postdata(encode_dict({ +            'email_field': username, +            'password_field': password, +        })) +        login_request = sanitized_Request('http://www.funimation.com/login', data, headers={ +            'User-Agent': 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0', +            'Content-Type': 'application/x-www-form-urlencoded' +        }) +        login_page = self._download_webpage( +            login_request, None, 'Logging in as %s' % username) +        if any(p in login_page for p in ('funimation.com/logout', '>Log Out<')): +            return +        error = self._html_search_regex( +            r'(?s)<div[^>]+id=["\']errorMessages["\'][^>]*>(.+?)</div>', +            login_page, 'error messages', default=None) +        if error: +            raise ExtractorError('Unable to login: %s' % error, expected=True) +        raise ExtractorError('Unable to log in') + +    def _real_initialize(self): +        self._login() + +    def _real_extract(self, url): +        display_id = self._match_id(url) + +        errors = [] +        formats = [] + +        ERRORS_MAP = { +            'ERROR_MATURE_CONTENT_LOGGED_IN': 'matureContentLoggedIn', +            'ERROR_MATURE_CONTENT_LOGGED_OUT': 'matureContentLoggedOut', +            'ERROR_SUBSCRIPTION_LOGGED_OUT': 'subscriptionLoggedOut', +            'ERROR_VIDEO_EXPIRED': 'videoExpired', +            'ERROR_TERRITORY_UNAVAILABLE': 'territoryUnavailable', +            'SVODBASIC_SUBSCRIPTION_IN_PLAYER': 'basicSubscription', +            'SVODNON_SUBSCRIPTION_IN_PLAYER': 'nonSubscription', +            'ERROR_PLAYER_NOT_RESPONDING': 'playerNotResponding', +            'ERROR_UNABLE_TO_CONNECT_TO_CDN': 'unableToConnectToCDN', +            'ERROR_STREAM_NOT_FOUND': 'streamNotFound', +        } + +        USER_AGENTS = ( +            # PC UA is served with m3u8 that 
provides some bonus lower quality formats +            ('pc', 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0'), +            # Mobile UA allows to extract direct links and also does not fail when +            # PC UA fails with hulu error (e.g. +            # http://www.funimation.com/shows/hacksign/videos/official/role-play) +            ('mobile', 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.114 Mobile Safari/537.36'), +        ) + +        for kind, user_agent in USER_AGENTS: +            request = sanitized_Request(url) +            request.add_header('User-Agent', user_agent) +            webpage = self._download_webpage( +                request, display_id, 'Downloading %s webpage' % kind) + +            playlist = self._parse_json( +                self._search_regex( +                    r'var\s+playersData\s*=\s*(\[.+?\]);\n', +                    webpage, 'players data'), +                display_id)[0]['playlist'] + +            items = next(item['items'] for item in playlist if item.get('items')) +            item = next(item for item in items if item.get('itemAK') == display_id) + +            error_messages = {} +            video_error_messages = self._search_regex( +                r'var\s+videoErrorMessages\s*=\s*({.+?});\n', +                webpage, 'error messages', default=None) +            if video_error_messages: +                error_messages_json = self._parse_json(video_error_messages, display_id, fatal=False) +                if error_messages_json: +                    for _, error in error_messages_json.items(): +                        type_ = error.get('type') +                        description = error.get('description') +                        content = error.get('content') +                        if type_ == 'text' and description and content: +                            error_message = ERRORS_MAP.get(description) +                
            if error_message: +                                error_messages[error_message] = content + +            for video in item.get('videoSet', []): +                auth_token = video.get('authToken') +                if not auth_token: +                    continue +                funimation_id = video.get('FUNImationID') or video.get('videoId') +                preference = 1 if video.get('languageMode') == 'dub' else 0 +                if not auth_token.startswith('?'): +                    auth_token = '?%s' % auth_token +                for quality, height in (('sd', 480), ('hd', 720), ('hd1080', 1080)): +                    format_url = video.get('%sUrl' % quality) +                    if not format_url: +                        continue +                    if not format_url.startswith(('http', '//')): +                        errors.append(format_url) +                        continue +                    if determine_ext(format_url) == 'm3u8': +                        m3u8_formats = self._extract_m3u8_formats( +                            format_url + auth_token, display_id, 'mp4', entry_protocol='m3u8_native', +                            preference=preference, m3u8_id='%s-hls' % funimation_id, fatal=False) +                        if m3u8_formats: +                            formats.extend(m3u8_formats) +                    else: +                        tbr = int_or_none(self._search_regex( +                            r'-(\d+)[Kk]', format_url, 'tbr', default=None)) +                        formats.append({ +                            'url': format_url + auth_token, +                            'format_id': '%s-http-%dp' % (funimation_id, height), +                            'height': height, +                            'tbr': tbr, +                            'preference': preference, +                        }) + +        if not formats and errors: +            raise ExtractorError( +                '%s returned error: %s' +             
   % (self.IE_NAME, clean_html(error_messages.get(errors[0], errors[0]))), +                expected=True) + +        self._sort_formats(formats) + +        title = item['title'] +        artist = item.get('artist') +        if artist: +            title = '%s - %s' % (artist, title) +        description = self._og_search_description(webpage) or item.get('description') +        thumbnail = self._og_search_thumbnail(webpage) or item.get('posterUrl') +        video_id = item.get('itemId') or display_id + +        return { +            'id': video_id, +            'display_id': display_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index e3bdff2d8..3c3066e38 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -44,7 +44,6 @@ from .myvi import MyviIE  from .condenast import CondeNastIE  from .udn import UDNEmbedIE  from .senateisvp import SenateISVPIE -from .bliptv import BlipTVIE  from .svt import SVTIE  from .pornhub import PornHubIE  from .xhamster import XHamsterEmbedIE @@ -54,6 +53,9 @@ from .onionstudios import OnionStudiosIE  from .snagfilms import SnagFilmsEmbedIE  from .screenwavemedia import ScreenwaveMediaIE  from .mtv import MTVServicesEmbeddedIE +from .pladform import PladformIE +from .googledrive import GoogleDriveIE +from .jwplatform import JWPlatformIE  class GenericIE(InfoExtractor): @@ -1439,11 +1441,6 @@ class GenericIE(InfoExtractor):                  'id': match.group('id')              } -        # Look for embedded blip.tv player -        bliptv_url = BlipTVIE._extract_url(webpage) -        if bliptv_url: -            return self.url_result(bliptv_url, 'BlipTV') -          # Look for SVT player          svt_url = SVTIE._extract_url(webpage)          if svt_url: @@ -1741,10 +1738,9 @@ class GenericIE(InfoExtractor):              
return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')          # Look for Pladform embeds -        mobj = re.search( -            r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage) -        if mobj is not None: -            return self.url_result(mobj.group('url'), 'Pladform') +        pladform_url = PladformIE._extract_url(webpage) +        if pladform_url: +            return self.url_result(pladform_url)          # Look for Playwire embeds          mobj = re.search( @@ -1769,6 +1765,11 @@ class GenericIE(InfoExtractor):          if nbc_sports_url:              return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') +        # Look for Google Drive embeds +        google_drive_url = GoogleDriveIE._extract_url(webpage) +        if google_drive_url: +            return self.url_result(google_drive_url, 'GoogleDrive') +          # Look for UDN embeds          mobj = re.search(              r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage) @@ -1796,6 +1797,11 @@ class GenericIE(InfoExtractor):          if snagfilms_url:              return self.url_result(snagfilms_url) +        # Look for JWPlatform embeds +        jwplatform_url = JWPlatformIE._extract_url(webpage) +        if jwplatform_url: +            return self.url_result(jwplatform_url, 'JWPlatform') +          # Look for ScreenwaveMedia embeds          mobj = re.search(ScreenwaveMediaIE.EMBED_PATTERN, webpage)          if mobj is not None: diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py new file mode 100644 index 000000000..f354c9c7a --- /dev/null +++ b/youtube_dl/extractor/googledrive.py @@ -0,0 +1,88 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    int_or_none, +) + + +class GoogleDriveIE(InfoExtractor): +    _VALID_URL = 
r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28})' +    _TEST = { +        'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', +        'md5': '881f7700aec4f538571fa1e0eed4a7b6', +        'info_dict': { +            'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ', +            'ext': 'mp4', +            'title': 'Big Buck Bunny.mp4', +            'duration': 46, +        } +    } +    _FORMATS_EXT = { +        '5': 'flv', +        '6': 'flv', +        '13': '3gp', +        '17': '3gp', +        '18': 'mp4', +        '22': 'mp4', +        '34': 'flv', +        '35': 'flv', +        '36': '3gp', +        '37': 'mp4', +        '38': 'mp4', +        '43': 'webm', +        '44': 'webm', +        '45': 'webm', +        '46': 'webm', +        '59': 'mp4', +    } + +    @staticmethod +    def _extract_url(webpage): +        mobj = re.search( +            r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})', +            webpage) +        if mobj: +            return 'https://drive.google.com/file/d/%s' % mobj.group('id') + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage( +            'http://docs.google.com/file/d/%s' % video_id, video_id, encoding='unicode_escape') + +        reason = self._search_regex(r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) +        if reason: +            raise ExtractorError(reason) + +        title = self._search_regex(r'"title"\s*,\s*"([^"]+)', webpage, 'title') +        duration = int_or_none(self._search_regex( +            r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', default=None)) +        fmt_stream_map = self._search_regex( +            r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, 'fmt stream map').split(',') +        fmt_list = 
self._search_regex(r'"fmt_list"\s*,\s*"([^"]+)', webpage, 'fmt_list').split(',') + +        formats = [] +        for fmt, fmt_stream in zip(fmt_list, fmt_stream_map): +            fmt_id, fmt_url = fmt_stream.split('|') +            resolution = fmt.split('/')[1] +            width, height = resolution.split('x') +            formats.append({ +                'url': fmt_url, +                'format_id': fmt_id, +                'resolution': resolution, +                'width': int_or_none(width), +                'height': int_or_none(height), +                'ext': self._FORMATS_EXT[fmt_id], +            }) +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': title, +            'thumbnail': self._og_search_thumbnail(webpage), +            'duration': duration, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/gputechconf.py b/youtube_dl/extractor/gputechconf.py new file mode 100644 index 000000000..145b55bf3 --- /dev/null +++ b/youtube_dl/extractor/gputechconf.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    xpath_element, +    xpath_text, +    int_or_none, +    parse_duration, +) + + +class GPUTechConfIE(InfoExtractor): +    _VALID_URL = r'https?://on-demand\.gputechconf\.com/gtc/2015/video/S(?P<id>\d+)\.html' +    _TEST = { +        'url': 'http://on-demand.gputechconf.com/gtc/2015/video/S5156.html', +        'md5': 'a8862a00a0fd65b8b43acc5b8e33f798', +        'info_dict': { +            'id': '5156', +            'ext': 'mp4', +            'title': 'Coordinating More Than 3 Million CUDA Threads for Social Network Analysis', +            'duration': 1219, +        } +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) + +        root_path = self._search_regex(r'var\s+rootPath\s*=\s*"([^"]+)', 
webpage, 'root path', 'http://evt.dispeak.com/nvidia/events/gtc15/') +        xml_file_id = self._search_regex(r'var\s+xmlFileId\s*=\s*"([^"]+)', webpage, 'xml file id') + +        doc = self._download_xml('%sxml/%s.xml' % (root_path, xml_file_id), video_id) + +        metadata = xpath_element(doc, 'metadata') +        http_host = xpath_text(metadata, 'httpHost', 'http host', True) +        mbr_videos = xpath_element(metadata, 'MBRVideos') + +        formats = [] +        for mbr_video in mbr_videos.findall('MBRVideo'): +            stream_name = xpath_text(mbr_video, 'streamName') +            if stream_name: +                formats.append({ +                    'url': 'http://%s/%s' % (http_host, stream_name.replace('mp4:', '')), +                    'tbr': int_or_none(xpath_text(mbr_video, 'bitrate')), +                }) +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': xpath_text(metadata, 'title'), +            'duration': parse_duration(xpath_text(metadata, 'endTime')), +            'creator': xpath_text(metadata, 'speaker'), +            'formats': formats, +        } diff --git a/youtube_dl/extractor/hotstar.py b/youtube_dl/extractor/hotstar.py new file mode 100644 index 000000000..05d27e75d --- /dev/null +++ b/youtube_dl/extractor/hotstar.py @@ -0,0 +1,79 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    determine_ext, +    int_or_none, +) + + +class HotStarIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?hotstar\.com/.*?[/-](?P<id>\d{10})' +    _TEST = { +        'url': 'http://www.hotstar.com/on-air-with-aib--english-1000076273', +        'info_dict': { +            'id': '1000076273', +            'ext': 'mp4', +            'title': 'On Air With AIB - English', +            'description': 'md5:c957d8868e9bc793ccb813691cc4c434', +            'timestamp': 1447227000, +            
'upload_date': '20151111', +            'duration': 381, +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        } +    } + +    _GET_CONTENT_TEMPLATE = 'http://account.hotstar.com/AVS/besc?action=GetAggregatedContentDetails&channel=PCTV&contentId=%s' +    _GET_CDN_TEMPLATE = 'http://getcdn.hotstar.com/AVS/besc?action=GetCDN&asJson=Y&channel=%s&id=%s&type=%s' + +    def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', fatal=True): +        json_data = super(HotStarIE, self)._download_json(url_or_request, video_id, note, fatal=fatal) +        if json_data['resultCode'] != 'OK': +            if fatal: +                raise ExtractorError(json_data['errorDescription']) +            return None +        return json_data['resultObj'] + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        video_data = self._download_json( +            self._GET_CONTENT_TEMPLATE % video_id, +            video_id)['contentInfo'][0] + +        formats = [] +        # PCTV for extracting f4m manifest +        for f in ('TABLET',): +            format_data = self._download_json( +                self._GET_CDN_TEMPLATE % (f, video_id, 'VOD'), +                video_id, 'Downloading %s JSON metadata' % f, fatal=False) +            if format_data: +                format_url = format_data['src'] +                ext = determine_ext(format_url) +                if ext == 'm3u8': +                    m3u8_formats = self._extract_m3u8_formats(format_url, video_id, 'mp4', m3u8_id='hls', fatal=False) +                    if m3u8_formats: +                        formats.extend(m3u8_formats) +                elif ext == 'f4m': +                    # produce broken files +                    continue +                else: +                    formats.append({ +                        'url': format_url, +                        'width': int_or_none(format_data.get('width')), +              
          'height': int_or_none(format_data.get('height')), +                    }) +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': video_data['episodeTitle'], +            'description': video_data.get('description'), +            'duration': int_or_none(video_data.get('duration')), +            'timestamp': int_or_none(video_data.get('broadcastDate')), +            'formats': formats, +        } diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index bf2d2041b..a2e18c8a7 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -3,6 +3,10 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from ..utils import ( +    int_or_none, +    parse_iso8601, +)  class IGNIE(InfoExtractor): @@ -11,25 +15,24 @@ class IGNIE(InfoExtractor):      Some videos of it.ign.com are also supported      """ -    _VALID_URL = r'https?://.+?\.ign\.com/(?P<type>videos|show_videos|articles|(?:[^/]*/feature))(/.+)?/(?P<name_or_id>.+)' +    _VALID_URL = r'https?://.+?\.ign\.com/(?:[^/]+/)?(?P<type>videos|show_videos|articles|feature|(?:[^/]+/\d+/video))(/.+)?/(?P<name_or_id>.+)'      IE_NAME = 'ign.com' -    _CONFIG_URL_TEMPLATE = 'http://www.ign.com/videos/configs/id/%s.config' -    _DESCRIPTION_RE = [ -        r'<span class="page-object-description">(.+?)</span>', -        r'id="my_show_video">.*?<p>(.*?)</p>', -        r'<meta name="description" content="(.*?)"', -    ] +    _API_URL_TEMPLATE = 'http://apis.ign.com/video/v3/videos/%s' +    _EMBED_RE = r'<iframe[^>]+?["\']((?:https?:)?//.+?\.ign\.com.+?/embed.+?)["\']'      _TESTS = [          {              'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', -            'md5': 'eac8bdc1890980122c3b66f14bdd02e9', +            'md5': 'febda82c4bafecd2d44b6e1a18a595f8',              'info_dict': {                  'id': '8f862beef863986b2785559b9e1aa599',                  'ext': 'mp4',   
               'title': 'The Last of Us Review',                  'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c', +                'timestamp': 1370440800, +                'upload_date': '20130605', +                'uploader_id': 'cberidon@ign.com',              }          },          { @@ -44,6 +47,9 @@ class IGNIE(InfoExtractor):                          'ext': 'mp4',                          'title': 'GTA 5 Video Review',                          'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', +                        'timestamp': 1379339880, +                        'upload_date': '20130916', +                        'uploader_id': 'danieljkrupa@gmail.com',                      },                  },                  { @@ -52,6 +58,9 @@ class IGNIE(InfoExtractor):                          'ext': 'mp4',                          'title': '26 Twisted Moments from GTA 5 in Slow Motion',                          'description': 'The twisted beauty of GTA 5 in stunning slow motion.', +                        'timestamp': 1386878820, +                        'upload_date': '20131212', +                        'uploader_id': 'togilvie@ign.com',                      },                  },              ], @@ -66,12 +75,20 @@ class IGNIE(InfoExtractor):                  'id': '078fdd005f6d3c02f63d795faa1b984f',                  'ext': 'mp4',                  'title': 'Rewind Theater - Wild Trailer Gamescom 2014', -                'description': ( -                    'Giant skeletons, bloody hunts, and captivating' -                    ' natural beauty take our breath away.' 
-                ), +                'description': 'Brian and Jared explore Michel Ancel\'s captivating new preview.', +                'timestamp': 1408047180, +                'upload_date': '20140814', +                'uploader_id': 'jamesduggan1990@gmail.com',              },          }, +        { +            'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', +            'only_matching': True, +        }, +        { +            'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', +            'only_matching': True, +        },      ]      def _find_video_id(self, webpage): @@ -82,7 +99,7 @@ class IGNIE(InfoExtractor):              r'<object id="vid_(.+?)"',              r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"',          ] -        return self._search_regex(res_id, webpage, 'video id') +        return self._search_regex(res_id, webpage, 'video id', default=None)      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) @@ -91,7 +108,7 @@ class IGNIE(InfoExtractor):          webpage = self._download_webpage(url, name_or_id)          if page_type != 'video':              multiple_urls = re.findall( -                '<param name="flashvars"[^>]*value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]', +                r'<param name="flashvars"[^>]*value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]',                  webpage)              if multiple_urls:                  entries = [self.url_result(u, ie='IGN') for u in multiple_urls] @@ -102,22 +119,50 @@ class IGNIE(InfoExtractor):                  }          video_id = self._find_video_id(webpage) -        result = self._get_video_info(video_id) -        description = self._html_search_regex(self._DESCRIPTION_RE, -                                              webpage, 'video description', flags=re.DOTALL) -        result['description'] = description -        return result +  
      if not video_id: +            return self.url_result(self._search_regex(self._EMBED_RE, webpage, 'embed url')) +        return self._get_video_info(video_id)      def _get_video_info(self, video_id): -        config_url = self._CONFIG_URL_TEMPLATE % video_id -        config = self._download_json(config_url, video_id) -        media = config['playlist']['media'] +        api_data = self._download_json(self._API_URL_TEMPLATE % video_id, video_id) + +        formats = [] +        m3u8_url = api_data['refs'].get('m3uUrl') +        if m3u8_url: +            m3u8_formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) +            if m3u8_formats: +                formats.extend(m3u8_formats) +        f4m_url = api_data['refs'].get('f4mUrl') +        if f4m_url: +            f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False) +            if f4m_formats: +                formats.extend(f4m_formats) +        for asset in api_data['assets']: +            formats.append({ +                'url': asset['url'], +                'tbr': asset.get('actual_bitrate_kbps'), +                'fps': asset.get('frame_rate'), +                'height': int_or_none(asset.get('height')), +                'width': int_or_none(asset.get('width')), +            }) +        self._sort_formats(formats) + +        thumbnails = [{ +            'url': thumbnail['url'] +        } for thumbnail in api_data.get('thumbnails', [])] + +        metadata = api_data['metadata']          return { -            'id': media['metadata']['videoId'], -            'url': media['url'], -            'title': media['metadata']['title'], -            'thumbnail': media['poster'][0]['url'].replace('{size}', 'grande'), +            'id': api_data.get('videoId') or video_id, +            'title': metadata.get('longTitle') or metadata.get('name') or metadata.get['title'], +            'description': metadata.get('description'), +  
          'timestamp': parse_iso8601(metadata.get('publishDate')), +            'duration': int_or_none(metadata.get('duration')), +            'display_id': metadata.get('slug') or video_id, +            'uploader_id': metadata.get('creator'), +            'thumbnails': thumbnails, +            'formats': formats,          } @@ -125,16 +170,17 @@ class OneUPIE(IGNIE):      _VALID_URL = r'https?://gamevideos\.1up\.com/(?P<type>video)/id/(?P<name_or_id>.+)\.html'      IE_NAME = '1up.com' -    _DESCRIPTION_RE = r'<div id="vid_summary">(.+?)</div>' -      _TESTS = [{          'url': 'http://gamevideos.1up.com/video/id/34976.html', -        'md5': '68a54ce4ebc772e4b71e3123d413163d', +        'md5': 'c9cc69e07acb675c31a16719f909e347',          'info_dict': {              'id': '34976',              'ext': 'mp4',              'title': 'Sniper Elite V2 - Trailer', -            'description': 'md5:5d289b722f5a6d940ca3136e9dae89cf', +            'description': 'md5:bf0516c5ee32a3217aa703e9b1bc7826', +            'timestamp': 1313099220, +            'upload_date': '20110811', +            'uploader_id': 'IGN',          }      }] @@ -143,3 +189,36 @@ class OneUPIE(IGNIE):          result = super(OneUPIE, self)._real_extract(url)          result['id'] = mobj.group('name_or_id')          return result + + +class PCMagIE(IGNIE): +    _VALID_URL = r'https?://(?:www\.)?pcmag\.com/(?P<type>videos|article2)(/.+)?/(?P<name_or_id>.+)' +    IE_NAME = 'pcmag' + +    _EMBED_RE = r'iframe.setAttribute\("src",\s*__util.objToUrlString\("http://widgets\.ign\.com/video/embed/content.html?[^"]*url=([^"]+)["&]' + +    _TESTS = [{ +        'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data', +        'md5': '212d6154fd0361a2781075f1febbe9ad', +        'info_dict': { +            'id': 'ee10d774b508c9b8ec07e763b9125b91', +            'ext': 'mp4', +            'title': '010615_What\'s New Now: Is GoGo Snooping on Your Data?', +            
'description': 'md5:a7071ae64d2f68cc821c729d4ded6bb3', +            'timestamp': 1420571160, +            'upload_date': '20150106', +            'uploader_id': 'cozzipix@gmail.com', +        } +    }, { +        'url': 'http://www.pcmag.com/article2/0,2817,2470156,00.asp', +        'md5': '94130c1ca07ba0adb6088350681f16c1', +        'info_dict': { +            'id': '042e560ba94823d43afcb12ddf7142ca', +            'ext': 'mp4', +            'title': 'HTC\'s Weird New Re Camera - What\'s New Now', +            'description': 'md5:53433c45df96d2ea5d0fda18be2ca908', +            'timestamp': 1412953920, +            'upload_date': '20141010', +            'uploader_id': 'chris_snyder@pcmag.com', +        } +    }] diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index 70c8ca64e..85e9344aa 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -13,7 +13,7 @@ from ..utils import (  class ImgurIE(InfoExtractor): -    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!gallery)(?P<id>[a-zA-Z0-9]+)' +    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|topic/[^/]+)/)?(?P<id>[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z]+)?$'      _TESTS = [{          'url': 'https://i.imgur.com/A61SaA1.gifv', @@ -21,7 +21,7 @@ class ImgurIE(InfoExtractor):              'id': 'A61SaA1',              'ext': 'mp4',              'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', -            'description': 're:The origin of the Internet\'s most viral images$|The Internet\'s visual storytelling community\. 
Explore, share, and discuss the best visual stories the Internet has to offer\.$', +            'description': 'Imgur: The most awesome images on the Internet.',          },      }, {          'url': 'https://imgur.com/A61SaA1', @@ -29,8 +29,20 @@ class ImgurIE(InfoExtractor):              'id': 'A61SaA1',              'ext': 'mp4',              'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', -            'description': 're:The origin of the Internet\'s most viral images$|The Internet\'s visual storytelling community\. Explore, share, and discuss the best visual stories the Internet has to offer\.$', +            'description': 'Imgur: The most awesome images on the Internet.',          }, +    }, { +        'url': 'https://imgur.com/gallery/YcAQlkx', +        'info_dict': { +            'id': 'YcAQlkx', +            'ext': 'mp4', +            'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', +            'description': 'Imgur: The most awesome images on the Internet.' 
+ +        } +    }, { +        'url': 'http://imgur.com/topic/Funny/N8rOudd', +        'only_matching': True,      }]      def _real_extract(self, url): @@ -100,25 +112,38 @@ class ImgurIE(InfoExtractor):  class ImgurAlbumIE(InfoExtractor): -    _VALID_URL = r'https?://(?:i\.)?imgur\.com/gallery/(?P<id>[a-zA-Z0-9]+)' +    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:a|gallery|topic/[^/]+)/)?(?P<id>[a-zA-Z0-9]{5})(?:[/?#&]+)?$' -    _TEST = { +    _TESTS = [{          'url': 'http://imgur.com/gallery/Q95ko',          'info_dict': {              'id': 'Q95ko',          },          'playlist_count': 25, -    } +    }, { +        'url': 'http://imgur.com/a/j6Orj', +        'only_matching': True, +    }, { +        'url': 'http://imgur.com/topic/Aww/ll5Vk', +        'only_matching': True, +    }]      def _real_extract(self, url):          album_id = self._match_id(url)          album_images = self._download_json(              'http://imgur.com/gallery/%s/album_images/hit.json?all=true' % album_id, -            album_id)['data']['images'] - -        entries = [ -            self.url_result('http://imgur.com/%s' % image['hash']) -            for image in album_images if image.get('hash')] - -        return self.playlist_result(entries, album_id) +            album_id, fatal=False) + +        if album_images: +            data = album_images.get('data') +            if data and isinstance(data, dict): +                images = data.get('images') +                if images and isinstance(images, list): +                    entries = [ +                        self.url_result('http://imgur.com/%s' % image['hash']) +                        for image in images if image.get('hash')] +                    return self.playlist_result(entries, album_id) + +        # Fallback to single video +        return self.url_result('http://imgur.com/%s' % album_id, ImgurIE.ie_key()) diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index 71cfd12c5..016af2084 
100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -1,3 +1,5 @@ +# coding: utf-8 +  from __future__ import unicode_literals  import base64 @@ -5,8 +7,9 @@ import base64  from .common import InfoExtractor  from ..compat import (      compat_urllib_parse_unquote, -    compat_urlparse, +    compat_parse_qs,  ) +from ..utils import determine_ext  class InfoQIE(InfoExtractor): @@ -16,7 +19,7 @@ class InfoQIE(InfoExtractor):          'url': 'http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things',          'md5': 'b5ca0e0a8c1fed93b0e65e48e462f9a2',          'info_dict': { -            'id': '12-jan-pythonthings', +            'id': 'A-Few-of-My-Favorite-Python-Things',              'ext': 'mp4',              'description': 'Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.',              'title': 'A Few of My Favorite [Python] Things', @@ -24,40 +27,84 @@ class InfoQIE(InfoExtractor):      }, {          'url': 'http://www.infoq.com/fr/presentations/changez-avis-sur-javascript',          'only_matching': True, +    }, { +        'url': 'http://www.infoq.com/cn/presentations/openstack-continued-delivery', +        'md5': '4918d0cca1497f2244572caf626687ef', +        'info_dict': { +            'id': 'openstack-continued-delivery', +            'title': 'OpenStack持续交付之路', +            'ext': 'flv', +            'description': 'md5:308d981fb28fa42f49f9568322c683ff', +        },      }] -    def _real_extract(self, url): -        video_id = self._match_id(url) -        webpage = self._download_webpage(url, video_id) +    def _extract_bokecc_videos(self, webpage, video_id): +        # TODO: bokecc.com is a Chinese video cloud platform +        # It should have an independent extractor but I don't have other +        # examples using bokecc +        player_params_str = self._html_search_regex( +            
r'<script[^>]+src="http://p\.bokecc\.com/player\?([^"]+)', +            webpage, 'player params', default=None) -        video_title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title') -        video_description = self._html_search_meta('description', webpage, 'description') +        player_params = compat_parse_qs(player_params_str) + +        info_xml = self._download_xml( +            'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % ( +                player_params['siteid'][0], player_params['vid'][0]), video_id) + +        return [{ +            'format_id': 'bokecc', +            'url': quality.find('./copy').attrib['playurl'], +            'preference': int(quality.attrib['value']), +        } for quality in info_xml.findall('./video/quality')] +    def _extract_rtmp_videos(self, webpage):          # The server URL is hardcoded          video_url = 'rtmpe://video.infoq.com/cfx/st/'          # Extract video URL          encoded_id = self._search_regex( -            r"jsclassref\s*=\s*'([^']*)'", webpage, 'encoded id') +            r"jsclassref\s*=\s*'([^']*)'", webpage, 'encoded id', default=None) +          real_id = compat_urllib_parse_unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8'))          playpath = 'mp4:' + real_id -        video_filename = playpath.split('/')[-1] -        video_id, extension = video_filename.split('.') - -        http_base = self._search_regex( -            r'EXPRESSINSTALL_SWF\s*=\s*[^"]*"((?:https?:)?//[^/"]+/)', webpage, -            'HTTP base URL') - -        formats = [{ +        return [{              'format_id': 'rtmp',              'url': video_url, -            'ext': extension, +            'ext': determine_ext(playpath),              'play_path': playpath, -        }, { +        }] + +    def _extract_http_videos(self, webpage): +        http_video_url = self._search_regex(r'P\.s\s*=\s*\'([^\']+)\'', webpage, 'video URL') + +        policy = 
self._search_regex(r'InfoQConstants.scp\s*=\s*\'([^\']+)\'', webpage, 'policy') +        signature = self._search_regex(r'InfoQConstants.scs\s*=\s*\'([^\']+)\'', webpage, 'signature') +        key_pair_id = self._search_regex(r'InfoQConstants.sck\s*=\s*\'([^\']+)\'', webpage, 'key-pair-id') + +        return [{              'format_id': 'http', -            'url': compat_urlparse.urljoin(url, http_base) + real_id, +            'url': http_video_url, +            'http_headers': { +                'Cookie': 'CloudFront-Policy=%s; CloudFront-Signature=%s; CloudFront-Key-Pair-Id=%s' % ( +                    policy, signature, key_pair_id), +            },          }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) + +        video_title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title') +        video_description = self._html_search_meta('description', webpage, 'description') + +        if '/cn/' in url: +            # for China videos, HTTP video URL exists but always fails with 403 +            formats = self._extract_bokecc_videos(webpage, video_id) +        else: +            formats = self._extract_rtmp_videos(webpage) + self._extract_http_videos(webpage) +          self._sort_formats(formats)          return { diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index c158f2064..e5e16ca3b 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -47,7 +47,7 @@ class InstagramIE(InfoExtractor):  class InstagramUserIE(InfoExtractor): -    _VALID_URL = r'https://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])' +    _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])'      IE_DESC = 'Instagram user profile'      IE_NAME = 'instagram:user'      _TEST = { diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 2df1da3f0..66a70a181 100644 --- 
a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -16,7 +16,7 @@ class IqiyiIE(InfoExtractor):      IE_NAME = 'iqiyi'      IE_DESC = '爱奇艺' -    _VALID_URL = r'http://(?:www\.)iqiyi.com/v_.+?\.html' +    _VALID_URL = r'http://(?:[^.]+\.)?iqiyi\.com/.+\.html'      _TESTS = [{          'url': 'http://www.iqiyi.com/v_19rrojlavg.html', @@ -84,6 +84,15 @@ class IqiyiIE(InfoExtractor):          'params': {              'skip_download': True,          }, +    }, { +        'url': 'http://www.iqiyi.com/w_19rt6o8t9p.html', +        'only_matching': True, +    }, { +        'url': 'http://www.iqiyi.com/a_19rrhbc6kt.html', +        'only_matching': True, +    }, { +        'url': 'http://yule.iqiyi.com/pcb.html', +        'only_matching': True,      }]      _FORMATS_MAP = [ @@ -205,9 +214,8 @@ class IqiyiIE(InfoExtractor):      def get_enc_key(self, swf_url, video_id):          # TODO: automatic key extraction -        # last update at 2015-10-22 for Zombie::bite -        # '7223c67061dbea1259d0ceb44f44b6d62288f4f80c972170de5201d2321060270e05'[2:66][0::2] -        enc_key = '2c76de15dcb44bd28ff0927d50d31620' +        # last update at 2015-12-18 for Zombie::bite +        enc_key = '8b6b683780897eb8d9a48a02ccc4817d'[::-1]          return enc_key      def _real_extract(self, url): diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py new file mode 100644 index 000000000..a92adf2b3 --- /dev/null +++ b/youtube_dl/extractor/jwplatform.py @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class JWPlatformIE(InfoExtractor): +    _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' +    _TEST = { +        'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', +        'md5': 'fa8899fa601eb7c83a64e9d568bdf325', +        'info_dict': { +            'id': 
'nPripu9l', +            'ext': 'mov', +            'title': 'Big Buck Bunny Trailer', +            'description': 'Big Buck Bunny is a short animated film by the Blender Institute. It is made using free and open source software.', +            'upload_date': '20081127', +            'timestamp': 1227796140, +        } +    } + +    @staticmethod +    def _extract_url(webpage): +        mobj = re.search( +            r'<script[^>]+?src=["\'](?P<url>(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8})', +            webpage) +        if mobj: +            return mobj.group('url') + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id) +        video_data = json_data['playlist'][0] +        subtitles = {} +        for track in video_data['tracks']: +            if track['kind'] == 'captions': +                subtitles[track['label']] = [{'url': self._proto_relative_url(track['file'])}] + +        formats = [] +        for source in video_data['sources']: +            source_url = self._proto_relative_url(source['file']) +            source_type = source.get('type') or '' +            if source_type == 'application/vnd.apple.mpegurl': +                m3u8_formats = self._extract_m3u8_formats( +                    source_url, video_id, 'mp4', 'm3u8_native', fatal=False) +                if m3u8_formats: +                    formats.extend(m3u8_formats) +            elif source_type.startswith('audio'): +                formats.append({ +                    'url': source_url, +                    'vcodec': 'none', +                }) +            else: +                formats.append({ +                    'url': source_url, +                    'width': int_or_none(source.get('width')), +                    'height': int_or_none(source.get('height')), +                }) +        self._sort_formats(formats) + +        return { +    
        'id': video_id, +            'title': video_data['title'], +            'description': video_data.get('description'), +            'thumbnail': self._proto_relative_url(video_data.get('image')), +            'timestamp': int_or_none(video_data.get('pubdate')), +            'subtitles': subtitles, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 583b1a5ad..4807c8110 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -45,7 +45,7 @@ class KalturaIE(InfoExtractor):              'info_dict': {                  'id': '1_1jc2y3e4',                  'ext': 'mp4', -                'title': 'Track 4', +                'title': 'Straight from the Heart',                  'upload_date': '20131219',                  'uploader_id': 'mlundberg@wolfgangsvault.com',                  'description': 'The Allman Brothers Band, 12/16/1981', @@ -115,12 +115,9 @@ class KalturaIE(InfoExtractor):                  'version': '-1',              },              { -                'action': 'getContextData', -                'contextDataParams:objectType': 'KalturaEntryContextDataParams', -                'contextDataParams:referrer': 'http://www.kaltura.com/', -                'contextDataParams:streamerType': 'http', +                'action': 'getbyentryid',                  'entryId': video_id, -                'service': 'baseentry', +                'service': 'flavorAsset',              },          ]          return self._kaltura_api_call( @@ -133,7 +130,7 @@ class KalturaIE(InfoExtractor):          partner_id = mobj.group('partner_id_s') or mobj.group('partner_id') or mobj.group('partner_id_html5')          entry_id = mobj.group('id_s') or mobj.group('id') or mobj.group('id_html5') -        info, source_data = self._get_video_info(entry_id, partner_id) +        info, flavor_assets = self._get_video_info(entry_id, partner_id)          source_url = 
smuggled_data.get('source_url')          if source_url: @@ -144,7 +141,10 @@ class KalturaIE(InfoExtractor):              referrer = None          formats = [] -        for f in source_data['flavorAssets']: +        for f in flavor_assets: +            # Continue if asset is not ready +            if f['status'] != 2: +                continue              video_url = '%s/flavorId/%s' % (info['dataUrl'], f['id'])              if referrer:                  video_url += '?referrer=%s' % referrer @@ -160,6 +160,14 @@ class KalturaIE(InfoExtractor):                  'width': int_or_none(f.get('width')),                  'url': video_url,              }) +        m3u8_url = info['dataUrl'].replace('format/url', 'format/applehttp') +        if referrer: +            m3u8_url += '?referrer=%s' % referrer +        m3u8_formats = self._extract_m3u8_formats( +            m3u8_url, entry_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) +        if m3u8_formats: +            formats.extend(m3u8_formats) +          self._check_formats(formats, entry_id)          self._sort_formats(formats) diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 6d7733e41..688eb2308 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -1,27 +1,29 @@  from __future__ import unicode_literals  import re -import json  import itertools  from .common import InfoExtractor  from ..compat import (      compat_str, -    compat_urllib_parse_urlparse,      compat_urlparse,  )  from ..utils import ( -    ExtractorError,      find_xpath_attr, -    int_or_none, -    orderedSet, +    xpath_attr,      xpath_with_ns, +    xpath_text, +    orderedSet, +    int_or_none, +    float_or_none, +    parse_iso8601, +    determine_ext,  )  class LivestreamIE(InfoExtractor):      IE_NAME = 'livestream' -    _VALID_URL = r'https?://(?:new\.)?livestream\.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>[0-9]+)(?:/player)?)?/?(?:$|[?#])' +    _VALID_URL 
= r'https?://(?:new\.)?livestream\.com/(?:accounts/(?P<account_id>\d+)|(?P<account_name>[^/]+))/(?:events/(?P<event_id>\d+)|(?P<event_name>[^/]+))(?:/videos/(?P<id>\d+))?'      _TESTS = [{          'url': 'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370',          'md5': '53274c76ba7754fb0e8d072716f2292b', @@ -29,7 +31,9 @@ class LivestreamIE(InfoExtractor):              'id': '4719370',              'ext': 'mp4',              'title': 'Live from Webster Hall NYC', +            'timestamp': 1350008072,              'upload_date': '20121012', +            'duration': 5968.0,              'like_count': int,              'view_count': int,              'thumbnail': 're:^http://.*\.jpg$' @@ -55,39 +59,20 @@ class LivestreamIE(InfoExtractor):          'url': 'http://livestream.com/bsww/concacafbeachsoccercampeonato2015',          'only_matching': True,      }] +    _API_URL_TEMPLATE = 'http://livestream.com/api/accounts/%s/events/%s' + +    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): +        base_ele = find_xpath_attr( +            smil, self._xpath_ns('.//meta', namespace), 'name', 'httpBase') +        base = base_ele.get('content') if base_ele else 'http://livestreamvod-f.akamaihd.net/' -    def _parse_smil(self, video_id, smil_url):          formats = [] -        _SWITCH_XPATH = ( -            './/{http://www.w3.org/2001/SMIL20/Language}body/' -            '{http://www.w3.org/2001/SMIL20/Language}switch') -        smil_doc = self._download_xml( -            smil_url, video_id, -            note='Downloading SMIL information', -            errnote='Unable to download SMIL information', -            fatal=False) -        if smil_doc is False:  # Download failed -            return formats -        title_node = find_xpath_attr( -            smil_doc, './/{http://www.w3.org/2001/SMIL20/Language}meta', -            'name', 'title') -        if title_node is None: -            
self.report_warning('Cannot find SMIL id') -            switch_node = smil_doc.find(_SWITCH_XPATH) -        else: -            title_id = title_node.attrib['content'] -            switch_node = find_xpath_attr( -                smil_doc, _SWITCH_XPATH, 'id', title_id) -        if switch_node is None: -            raise ExtractorError('Cannot find switch node') -        video_nodes = switch_node.findall( -            '{http://www.w3.org/2001/SMIL20/Language}video') +        video_nodes = smil.findall(self._xpath_ns('.//video', namespace))          for vn in video_nodes: -            tbr = int_or_none(vn.attrib.get('system-bitrate')) +            tbr = int_or_none(vn.attrib.get('system-bitrate'), 1000)              furl = ( -                'http://livestream-f.akamaihd.net/%s?v=3.0.3&fp=WIN%%2014,0,0,145' % -                (vn.attrib['src'])) +                '%s%s?v=3.0.3&fp=WIN%%2014,0,0,145' % (base, vn.attrib['src']))              if 'clipBegin' in vn.attrib:                  furl += '&ssek=' + vn.attrib['clipBegin']              formats.append({ @@ -106,97 +91,151 @@ class LivestreamIE(InfoExtractor):              ('sd', 'progressive_url'),              ('hd', 'progressive_url_hd'),          ) -        formats = [{ -            'format_id': format_id, -            'url': video_data[key], -            'quality': i + 1, -        } for i, (format_id, key) in enumerate(FORMAT_KEYS) -            if video_data.get(key)] + +        formats = [] +        for format_id, key in FORMAT_KEYS: +            video_url = video_data.get(key) +            if video_url: +                ext = determine_ext(video_url) +                if ext == 'm3u8': +                    continue +                bitrate = int_or_none(self._search_regex( +                    r'(\d+)\.%s' % ext, video_url, 'bitrate', default=None)) +                formats.append({ +                    'url': video_url, +                    'format_id': format_id, +                    'tbr': bitrate, +           
         'ext': ext, +                })          smil_url = video_data.get('smil_url')          if smil_url: -            formats.extend(self._parse_smil(video_id, smil_url)) +            smil_formats = self._extract_smil_formats(smil_url, video_id) +            if smil_formats: +                formats.extend(smil_formats) + +        m3u8_url = video_data.get('m3u8_url') +        if m3u8_url: +            m3u8_formats = self._extract_m3u8_formats( +                m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) +            if m3u8_formats: +                formats.extend(m3u8_formats) + +        f4m_url = video_data.get('f4m_url') +        if f4m_url: +            f4m_formats = self._extract_f4m_formats( +                f4m_url, video_id, f4m_id='hds', fatal=False) +            if f4m_formats: +                formats.extend(f4m_formats)          self._sort_formats(formats) +        comments = [{ +            'author_id': comment.get('author_id'), +            'author': comment.get('author', {}).get('full_name'), +            'id': comment.get('id'), +            'text': comment['text'], +            'timestamp': parse_iso8601(comment.get('created_at')), +        } for comment in video_data.get('comments', {}).get('data', [])] +          return {              'id': video_id,              'formats': formats,              'title': video_data['caption'], +            'description': video_data.get('description'),              'thumbnail': video_data.get('thumbnail_url'), -            'upload_date': video_data['updated_at'].replace('-', '')[:8], +            'duration': float_or_none(video_data.get('duration'), 1000), +            'timestamp': parse_iso8601(video_data.get('publish_at')),              'like_count': video_data.get('likes', {}).get('total'), +            'comment_count': video_data.get('comments', {}).get('total'),              'view_count': video_data.get('views'), +            'comments': comments,          } -    def 
_extract_event(self, info): -        event_id = compat_str(info['id']) -        account = compat_str(info['owner_account_id']) -        root_url = ( -            'https://new.livestream.com/api/accounts/{account}/events/{event}/' -            'feed.json'.format(account=account, event=event_id)) - -        def _extract_videos(): -            last_video = None -            for i in itertools.count(1): -                if last_video is None: -                    info_url = root_url -                else: -                    info_url = '{root}?&id={id}&newer=-1&type=video'.format( -                        root=root_url, id=last_video) -                videos_info = self._download_json(info_url, event_id, 'Downloading page {0}'.format(i))['data'] -                videos_info = [v['data'] for v in videos_info if v['type'] == 'video'] -                if not videos_info: -                    break -                for v in videos_info: -                    yield self._extract_video_info(v) -                last_video = videos_info[-1]['id'] -        return self.playlist_result(_extract_videos(), event_id, info['full_name']) +    def _extract_stream_info(self, stream_info): +        broadcast_id = stream_info['broadcast_id'] +        is_live = stream_info.get('is_live') + +        formats = [] +        smil_url = stream_info.get('play_url') +        if smil_url: +            smil_formats = self._extract_smil_formats(smil_url, broadcast_id) +            if smil_formats: +                formats.extend(smil_formats) + +        entry_protocol = 'm3u8' if is_live else 'm3u8_native' +        m3u8_url = stream_info.get('m3u8_url') +        if m3u8_url: +            m3u8_formats = self._extract_m3u8_formats( +                m3u8_url, broadcast_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=False) +            if m3u8_formats: +                formats.extend(m3u8_formats) + +        rtsp_url = stream_info.get('rtsp_url') +        if rtsp_url: +            formats.append({ +     
           'url': rtsp_url, +                'format_id': 'rtsp', +            }) +        self._sort_formats(formats) + +        return { +            'id': broadcast_id, +            'formats': formats, +            'title': self._live_title(stream_info['stream_title']) if is_live else stream_info['stream_title'], +            'thumbnail': stream_info.get('thumbnail_url'), +            'is_live': is_live, +        } + +    def _extract_event(self, event_data): +        event_id = compat_str(event_data['id']) +        account_id = compat_str(event_data['owner_account_id']) +        feed_root_url = self._API_URL_TEMPLATE % (account_id, event_id) + '/feed.json' + +        stream_info = event_data.get('stream_info') +        if stream_info: +            return self._extract_stream_info(stream_info) + +        last_video = None +        entries = [] +        for i in itertools.count(1): +            if last_video is None: +                info_url = feed_root_url +            else: +                info_url = '{root}?&id={id}&newer=-1&type=video'.format( +                    root=feed_root_url, id=last_video) +            videos_info = self._download_json( +                info_url, event_id, 'Downloading page {0}'.format(i))['data'] +            videos_info = [v['data'] for v in videos_info if v['type'] == 'video'] +            if not videos_info: +                break +            for v in videos_info: +                entries.append(self.url_result( +                    'http://livestream.com/accounts/%s/events/%s/videos/%s' % (account_id, event_id, v['id']), +                    'Livestream', v['id'], v['caption'])) +            last_video = videos_info[-1]['id'] +        return self.playlist_result(entries, event_id, event_data['full_name'])      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id') -        event_name = mobj.group('event_name') -        webpage = self._download_webpage(url, video_id or 
event_name) - -        og_video = self._og_search_video_url( -            webpage, 'player url', fatal=False, default=None) -        if og_video is not None: -            query_str = compat_urllib_parse_urlparse(og_video).query -            query = compat_urlparse.parse_qs(query_str) -            if 'play_url' in query: -                api_url = query['play_url'][0].replace('.smil', '') -                info = json.loads(self._download_webpage( -                    api_url, video_id, 'Downloading video info')) -                return self._extract_video_info(info) - -        config_json = self._search_regex( -            r'window.config = ({.*?});', webpage, 'window config') -        info = json.loads(config_json)['event'] - -        def is_relevant(vdata, vid): -            result = vdata['type'] == 'video' -            if video_id is not None: -                result = result and compat_str(vdata['data']['id']) == vid -            return result - -        if video_id is None: -            # This is an event page: -            return self._extract_event(info) +        event = mobj.group('event_id') or mobj.group('event_name') +        account = mobj.group('account_id') or mobj.group('account_name') +        api_url = self._API_URL_TEMPLATE % (account, event) +        if video_id: +            video_data = self._download_json( +                api_url + '/videos/%s' % video_id, video_id) +            return self._extract_video_info(video_data)          else: -            videos = [self._extract_video_info(video_data['data']) -                      for video_data in info['feed']['data'] -                      if is_relevant(video_data, video_id)] -            if not videos: -                raise ExtractorError('Cannot find video %s' % video_id) -            return videos[0] +            event_data = self._download_json(api_url, video_id) +            return self._extract_event(event_data)  # The original version of Livestream uses a different system  class 
LivestreamOriginalIE(InfoExtractor):      IE_NAME = 'livestream:original'      _VALID_URL = r'''(?x)https?://original\.livestream\.com/ -        (?P<user>[^/]+)/(?P<type>video|folder) -        (?:\?.*?Id=|/)(?P<id>.*?)(&|$) +        (?P<user>[^/\?#]+)(?:/(?P<type>video|folder) +        (?:(?:\?.*?Id=|/)(?P<id>.*?)(&|$))?)?          '''      _TESTS = [{          'url': 'http://original.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb', @@ -204,6 +243,8 @@ class LivestreamOriginalIE(InfoExtractor):              'id': 'pla_8aa4a3f1-ba15-46a4-893b-902210e138fb',              'ext': 'mp4',              'title': 'Spark 1 (BitCoin) with Cameron Winklevoss & Tyler Winklevoss of Winklevoss Capital', +            'duration': 771.301, +            'view_count': int,          },      }, {          'url': 'https://original.livestream.com/newplay/folder?dirId=a07bf706-d0e4-4e75-a747-b021d84f2fd3', @@ -211,26 +252,62 @@ class LivestreamOriginalIE(InfoExtractor):              'id': 'a07bf706-d0e4-4e75-a747-b021d84f2fd3',          },          'playlist_mincount': 4, +    }, { +        # live stream +        'url': 'http://original.livestream.com/znsbahamas', +        'only_matching': True,      }] -    def _extract_video(self, user, video_id): -        api_url = 'http://x{0}x.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id={1}'.format(user, video_id) - +    def _extract_video_info(self, user, video_id): +        api_url = 'http://x%sx.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id=%s' % (user, video_id)          info = self._download_xml(api_url, video_id) -        # this url is used on mobile devices -        stream_url = 'http://x{0}x.api.channel.livestream.com/3.0/getstream.json?id={1}'.format(user, video_id) -        stream_info = self._download_json(stream_url, video_id) +          item = info.find('channel').find('item') -        ns = {'media': 'http://search.yahoo.com/mrss'} -        thumbnail_url = 
item.find(xpath_with_ns('media:thumbnail', ns)).attrib['url'] +        title = xpath_text(item, 'title') +        media_ns = {'media': 'http://search.yahoo.com/mrss'} +        thumbnail_url = xpath_attr( +            item, xpath_with_ns('media:thumbnail', media_ns), 'url') +        duration = float_or_none(xpath_attr( +            item, xpath_with_ns('media:content', media_ns), 'duration')) +        ls_ns = {'ls': 'http://api.channel.livestream.com/2.0'} +        view_count = int_or_none(xpath_text( +            item, xpath_with_ns('ls:viewsCount', ls_ns)))          return {              'id': video_id, -            'title': item.find('title').text, -            'url': stream_info['progressiveUrl'], +            'title': title,              'thumbnail': thumbnail_url, +            'duration': duration, +            'view_count': view_count,          } +    def _extract_video_formats(self, video_data, video_id, entry_protocol): +        formats = [] + +        progressive_url = video_data.get('progressiveUrl') +        if progressive_url: +            formats.append({ +                'url': progressive_url, +                'format_id': 'http', +            }) + +        m3u8_url = video_data.get('httpUrl') +        if m3u8_url: +            m3u8_formats = self._extract_m3u8_formats( +                m3u8_url, video_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=False) +            if m3u8_formats: +                formats.extend(m3u8_formats) + +        rtsp_url = video_data.get('rtspUrl') +        if rtsp_url: +            formats.append({ +                'url': rtsp_url, +                'format_id': 'rtsp', +            }) + +        self._sort_formats(formats) +        return formats +      def _extract_folder(self, url, folder_id):          webpage = self._download_webpage(url, folder_id)          paths = orderedSet(re.findall( @@ -239,24 +316,45 @@ class LivestreamOriginalIE(InfoExtractor):                  <a\s+href="(?=https?://livestre\.am/)             
 )([^"]+)"''', webpage)) -        return { -            '_type': 'playlist', -            'id': folder_id, -            'entries': [{ -                '_type': 'url', -                'url': compat_urlparse.urljoin(url, p), -            } for p in paths], -        } +        entries = [{ +            '_type': 'url', +            'url': compat_urlparse.urljoin(url, p), +        } for p in paths] + +        return self.playlist_result(entries, folder_id)      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        id = mobj.group('id')          user = mobj.group('user')          url_type = mobj.group('type') +        content_id = mobj.group('id')          if url_type == 'folder': -            return self._extract_folder(url, id) +            return self._extract_folder(url, content_id)          else: -            return self._extract_video(user, id) +            # this url is used on mobile devices +            stream_url = 'http://x%sx.api.channel.livestream.com/3.0/getstream.json' % user +            info = {} +            if content_id: +                stream_url += '?id=%s' % content_id +                info = self._extract_video_info(user, content_id) +            else: +                content_id = user +                webpage = self._download_webpage(url, content_id) +                info = { +                    'title': self._og_search_title(webpage), +                    'description': self._og_search_description(webpage), +                    'thumbnail': self._search_regex(r'channelLogo.src\s*=\s*"([^"]+)"', webpage, 'thumbnail', None), +                } +            video_data = self._download_json(stream_url, content_id) +            is_live = video_data.get('isLive') +            entry_protocol = 'm3u8' if is_live else 'm3u8_native' +            info.update({ +                'id': content_id, +                'title': self._live_title(info['title']) if is_live else info['title'], +                'formats': 
self._extract_video_formats(video_data, content_id, entry_protocol), +                'is_live': is_live, +            }) +            return info  # The server doesn't support HEAD request, the generic extractor can't detect diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py index e3236f7b5..863efd896 100644 --- a/youtube_dl/extractor/lrt.py +++ b/youtube_dl/extractor/lrt.py @@ -1,12 +1,9 @@  # coding: utf-8  from __future__ import unicode_literals -import re -  from .common import InfoExtractor  from ..utils import ( -    determine_ext, -    js_to_json, +    int_or_none,      parse_duration,      remove_end,  ) @@ -23,9 +20,11 @@ class LRTIE(InfoExtractor):              'title': 'Septynios Kauno dienos',              'description': 'md5:24d84534c7dc76581e59f5689462411a',              'duration': 1783, +            'view_count': int, +            'like_count': int,          },          'params': { -            'skip_download': True,  # HLS download +            'skip_download': True,  # m3u8 download          },      } @@ -34,29 +33,23 @@ class LRTIE(InfoExtractor):          webpage = self._download_webpage(url, video_id)          title = remove_end(self._og_search_title(webpage), ' - LRT') +        m3u8_url = self._search_regex( +            r'file\s*:\s*(["\'])(?P<url>.+?)\1\s*\+\s*location\.hash\.substring\(1\)', +            webpage, 'm3u8 url', group='url') +        formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') +          thumbnail = self._og_search_thumbnail(webpage)          description = self._og_search_description(webpage)          duration = parse_duration(self._search_regex( -            r"'duration':\s*'([^']+)',", webpage, -            'duration', fatal=False, default=None)) +            r'var\s+record_len\s*=\s*(["\'])(?P<duration>[0-9]+:[0-9]+:[0-9]+)\1', +            webpage, 'duration', default=None, group='duration')) -        formats = [] -        for js in re.findall(r'(?s)config:\s*(\{.*?\})', webpage): -  
          data = self._parse_json(js, video_id, transform_source=js_to_json) -            if 'provider' not in data: -                continue -            if data['provider'] == 'rtmp': -                formats.append({ -                    'format_id': 'rtmp', -                    'ext': determine_ext(data['file']), -                    'url': data['streamer'], -                    'play_path': 'mp4:%s' % data['file'], -                    'preference': -1, -                    'rtmp_real_time': True, -                }) -            else: -                formats.extend( -                    self._extract_m3u8_formats(data['file'], video_id, 'mp4')) +        view_count = int_or_none(self._html_search_regex( +            r'<div[^>]+class=(["\']).*?record-desc-seen.*?\1[^>]*>(?P<count>.+?)</div>', +            webpage, 'view count', fatal=False, group='count')) +        like_count = int_or_none(self._search_regex( +            r'<span[^>]+id=(["\'])flikesCount.*?\1>(?P<count>\d+)<', +            webpage, 'like count', fatal=False, group='count'))          return {              'id': video_id, @@ -65,4 +58,6 @@ class LRTIE(InfoExtractor):              'thumbnail': thumbnail,              'description': description,              'duration': duration, +            'view_count': view_count, +            'like_count': like_count,          } diff --git a/youtube_dl/extractor/makertv.py b/youtube_dl/extractor/makertv.py new file mode 100644 index 000000000..3c34d4604 --- /dev/null +++ b/youtube_dl/extractor/makertv.py @@ -0,0 +1,32 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class MakerTVIE(InfoExtractor): +    _VALID_URL = r'https?://(?:(?:www\.)?maker\.tv/(?:[^/]+/)*video|makerplayer.com/embed/maker)/(?P<id>[a-zA-Z0-9]{12})' +    _TEST = { +        'url': 'http://www.maker.tv/video/Fh3QgymL9gsc', +        'md5': 'ca237a53a8eb20b6dc5bd60564d4ab3e', +        'info_dict': { +            'id': 'Fh3QgymL9gsc', +     
       'ext': 'mp4', +            'title': 'Maze Runner: The Scorch Trials Official Movie Review', +            'description': 'md5:11ff3362d7ef1d679fdb649f6413975a', +            'upload_date': '20150918', +            'timestamp': 1442549540, +        } +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) +        jwplatform_id = self._search_regex(r'jw_?id="([^"]+)"', webpage, 'jwplatform id') + +        return { +            '_type': 'url_transparent', +            'id': video_id, +            'url': 'jwplatform:%s' % jwplatform_id, +            'ie_key': 'JWPlatform', +        } diff --git a/youtube_dl/extractor/movshare.py b/youtube_dl/extractor/movshare.py deleted file mode 100644 index 6101063f2..000000000 --- a/youtube_dl/extractor/movshare.py +++ /dev/null @@ -1,27 +0,0 @@ -from __future__ import unicode_literals - -from .novamov import NovaMovIE - - -class MovShareIE(NovaMovIE): -    IE_NAME = 'movshare' -    IE_DESC = 'MovShare' - -    _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'movshare\.(?:net|sx|ag)'} - -    _HOST = 'www.movshare.net' - -    _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' -    _TITLE_REGEX = r'<strong>Title:</strong> ([^<]+)</p>' -    _DESCRIPTION_REGEX = r'<strong>Description:</strong> ([^<]+)</p>' - -    _TEST = { -        'url': 'http://www.movshare.net/video/559e28be54d96', -        'md5': 'abd31a2132947262c50429e1d16c1bfd', -        'info_dict': { -            'id': '559e28be54d96', -            'ext': 'flv', -            'title': 'dissapeared image', -            'description': 'optical illusion  dissapeared image  magic illusion', -        } -    } diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index e683d24c4..340c922bd 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -3,14 +3,12 @@ from __future__ import unicode_literals  import re  from .common import 
InfoExtractor -from ..compat import ( -    compat_str, -    compat_HTTPError, -) +from ..compat import compat_HTTPError  from ..utils import (      ExtractorError,      find_xpath_attr,      lowercase_escape, +    smuggle_url,      unescapeHTML,  ) @@ -62,12 +60,13 @@ class NBCIE(InfoExtractor):          theplatform_url = unescapeHTML(lowercase_escape(self._html_search_regex(              [                  r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"', +                r'<iframe[^>]+src="((?:https?:)?//player\.theplatform\.com/[^"]+)"',                  r'"embedURL"\s*:\s*"([^"]+)"'              ],              webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/')))          if theplatform_url.startswith('//'):              theplatform_url = 'http:' + theplatform_url -        return self.url_result(theplatform_url) +        return self.url_result(smuggle_url(theplatform_url, {'source_url': url}))  class NBCSportsVPlayerIE(InfoExtractor): @@ -187,7 +186,7 @@ class NBCNewsIE(InfoExtractor):                  'title': info.find('headline').text,                  'ext': 'flv',                  'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text, -                'description': compat_str(info.find('caption').text), +                'description': info.find('caption').text,                  'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text,              }          else: diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 16213eed9..894c51399 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -88,10 +88,10 @@ class NDRIE(NDRBaseIE):              'embedURL', webpage, 'embed URL', fatal=True)          description = self._search_regex(              r'<p[^>]+itemprop="description">([^<]+)</p>', -            webpage, 'description', fatal=False) +            webpage, 'description', default=None) or 
self._og_search_description(webpage)          timestamp = parse_iso8601(              self._search_regex( -                r'<span itemprop="datePublished" content="([^"]+)">', +                r'<span[^>]+itemprop="(?:datePublished|uploadDate)"[^>]+content="([^"]+)"',                  webpage, 'upload date', fatal=False))          return {              '_type': 'url_transparent', diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index 76bd21e6d..d440313d5 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -9,6 +9,7 @@ from .common import InfoExtractor  from ..compat import (      compat_str,      compat_urllib_parse, +    compat_urlparse,  )  from ..utils import (      clean_html, @@ -82,14 +83,21 @@ class NocoIE(InfoExtractor):          if 'erreur' in login:              raise ExtractorError('Unable to login: %s' % clean_html(login['erreur']), expected=True) +    @staticmethod +    def _ts(): +        return int(time.time() * 1000) +      def _call_api(self, path, video_id, note, sub_lang=None): -        ts = compat_str(int(time.time() * 1000)) +        ts = compat_str(self._ts() + self._ts_offset)          tk = hashlib.md5((hashlib.md5(ts.encode('ascii')).hexdigest() + '#8S?uCraTedap6a').encode('ascii')).hexdigest()          url = self._API_URL_TEMPLATE % (path, ts, tk)          if sub_lang:              url += self._SUB_LANG_TEMPLATE % sub_lang -        resp = self._download_json(url, video_id, note) +        request = sanitized_Request(url) +        request.add_header('Referer', self._referer) + +        resp = self._download_json(request, video_id, note)          if isinstance(resp, dict) and resp.get('error'):              self._raise_error(resp['error'], resp['description']) @@ -102,8 +110,22 @@ class NocoIE(InfoExtractor):              expected=True)      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = 
self._match_id(url) + +        # Timestamp adjustment offset between server time and local time +        # must be calculated in order to use timestamps closest to server's +        # in all API requests (see https://github.com/rg3/youtube-dl/issues/7864) +        webpage = self._download_webpage(url, video_id) + +        player_url = self._search_regex( +            r'(["\'])(?P<player>https?://noco\.tv/(?:[^/]+/)+NocoPlayer.+?\.swf.*?)\1', +            webpage, 'noco player', group='player', +            default='http://noco.tv/cdata/js/player/NocoPlayer-v1.2.40.swf') + +        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(player_url).query) +        ts = int_or_none(qs.get('ts', [None])[0]) +        self._ts_offset = ts - self._ts() if ts else 0 +        self._referer = player_url          medias = self._call_api(              'shows/%s/medias' % video_id, @@ -155,8 +177,8 @@ class NocoIE(InfoExtractor):                          'format_id': format_id_extended,                          'width': int_or_none(fmt.get('res_width')),                          'height': int_or_none(fmt.get('res_lines')), -                        'abr': int_or_none(fmt.get('audiobitrate')), -                        'vbr': int_or_none(fmt.get('videobitrate')), +                        'abr': int_or_none(fmt.get('audiobitrate'), 1000), +                        'vbr': int_or_none(fmt.get('videobitrate'), 1000),                          'filesize': int_or_none(fmt.get('filesize')),                          'format_note': qualities[format_id].get('quality_name'),                          'quality': qualities[format_id].get('priority'), diff --git a/youtube_dl/extractor/novamov.py b/youtube_dl/extractor/novamov.py index 6163e8855..d68c1ad79 100644 --- a/youtube_dl/extractor/novamov.py +++ b/youtube_dl/extractor/novamov.py @@ -17,15 +17,16 @@ class NovaMovIE(InfoExtractor):      IE_NAME = 'novamov'      IE_DESC = 'NovaMov' -    _VALID_URL_TEMPLATE = 
r'http://(?:(?:www\.)?%(host)s/(?:file|video)/|(?:(?:embed|www)\.)%(host)s/embed\.php\?(?:.*?&)?v=)(?P<id>[a-z\d]{13})' +    _VALID_URL_TEMPLATE = r'http://(?:(?:www\.)?%(host)s/(?:file|video|mobile/#/videos)/|(?:(?:embed|www)\.)%(host)s/embed\.php\?(?:.*?&)?v=)(?P<id>[a-z\d]{13})'      _VALID_URL = _VALID_URL_TEMPLATE % {'host': 'novamov\.com'}      _HOST = 'www.novamov.com'      _FILE_DELETED_REGEX = r'This file no longer exists on our servers!</h2>' -    _FILEKEY_REGEX = r'flashvars\.filekey="(?P<filekey>[^"]+)";' +    _FILEKEY_REGEX = r'flashvars\.filekey=(?P<filekey>"?[^"]+"?);'      _TITLE_REGEX = r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>([^<]+)</h3>'      _DESCRIPTION_REGEX = r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>[^<]+</h3><p>([^<]+)</p>' +    _URL_TEMPLATE = 'http://%s/video/%s'      _TEST = {          'url': 'http://www.novamov.com/video/4rurhn9x446jj', @@ -39,20 +40,28 @@ class NovaMovIE(InfoExtractor):          'skip': '"Invalid token" errors abound (in web interface as well as youtube-dl, there is nothing we can do about it.)'      } +    def _check_existence(self, webpage, video_id): +        if re.search(self._FILE_DELETED_REGEX, webpage) is not None: +            raise ExtractorError('Video %s does not exist' % video_id, expected=True) +      def _real_extract(self, url):          video_id = self._match_id(url) -        url = 'http://%s/video/%s' % (self._HOST, video_id) +        url = self._URL_TEMPLATE % (self._HOST, video_id)          webpage = self._download_webpage(              url, video_id, 'Downloading video page') -        if re.search(self._FILE_DELETED_REGEX, webpage) is not None: -            raise ExtractorError('Video %s does not exist' % video_id, expected=True) +        self._check_existence(webpage, video_id)          def extract_filekey(default=NO_DEFAULT): -            return self._search_regex( +            filekey = self._search_regex(                  self._FILEKEY_REGEX, 
webpage, 'filekey', default=default) +            if filekey is not default and (filekey[0] != '"' or filekey[-1] != '"'): +                return self._search_regex( +                    r'var\s+%s\s*=\s*"([^"]+)"' % re.escape(filekey), webpage, 'filekey', default=default) +            else: +                return filekey          filekey = extract_filekey(default=None) @@ -69,6 +78,7 @@ class NovaMovIE(InfoExtractor):              request.add_header('Referer', post_url)              webpage = self._download_webpage(                  request, video_id, 'Downloading continue to the video page') +            self._check_existence(webpage, video_id)          filekey = extract_filekey() @@ -92,3 +102,89 @@ class NovaMovIE(InfoExtractor):              'title': title,              'description': description          } + + +class WholeCloudIE(NovaMovIE): +    IE_NAME = 'wholecloud' +    IE_DESC = 'WholeCloud' + +    _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': '(?:wholecloud\.net|movshare\.(?:net|sx|ag))'} + +    _HOST = 'www.wholecloud.net' + +    _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' +    _TITLE_REGEX = r'<strong>Title:</strong> ([^<]+)</p>' +    _DESCRIPTION_REGEX = r'<strong>Description:</strong> ([^<]+)</p>' + +    _TEST = { +        'url': 'http://www.wholecloud.net/video/559e28be54d96', +        'md5': 'abd31a2132947262c50429e1d16c1bfd', +        'info_dict': { +            'id': '559e28be54d96', +            'ext': 'flv', +            'title': 'dissapeared image', +            'description': 'optical illusion  dissapeared image  magic illusion', +        } +    } + + +class NowVideoIE(NovaMovIE): +    IE_NAME = 'nowvideo' +    IE_DESC = 'NowVideo' + +    _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'nowvideo\.(?:to|ch|ec|sx|eu|at|ag|co|li)'} + +    _HOST = 'www.nowvideo.to' + +    _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' +    _TITLE_REGEX = r'<h4>([^<]+)</h4>' +    _DESCRIPTION_REGEX 
= r'</h4>\s*<p>([^<]+)</p>' + +    _TEST = { +        'url': 'http://www.nowvideo.sx/video/f1d6fce9a968b', +        'md5': '12c82cad4f2084881d8bc60ee29df092', +        'info_dict': { +            'id': 'f1d6fce9a968b', +            'ext': 'flv', +            'title': 'youtubedl test video BaWjenozKc', +            'description': 'Description', +        }, +    } + + +class VideoWeedIE(NovaMovIE): +    IE_NAME = 'videoweed' +    IE_DESC = 'VideoWeed' + +    _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'videoweed\.(?:es|com)'} + +    _HOST = 'www.videoweed.es' + +    _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' +    _TITLE_REGEX = r'<h1 class="text_shadow">([^<]+)</h1>' +    _URL_TEMPLATE = 'http://%s/file/%s' + +    _TEST = { +        'url': 'http://www.videoweed.es/file/b42178afbea14', +        'md5': 'abd31a2132947262c50429e1d16c1bfd', +        'info_dict': { +            'id': 'b42178afbea14', +            'ext': 'flv', +            'title': 'optical illusion  dissapeared image magic illusion', +            'description': '' +        }, +    } + + +class CloudTimeIE(NovaMovIE): +    IE_NAME = 'cloudtime' +    IE_DESC = 'CloudTime' + +    _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'cloudtime\.to'} + +    _HOST = 'www.cloudtime.to' + +    _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' +    _TITLE_REGEX = r'<div[^>]+class=["\']video_det["\'][^>]*>\s*<strong>([^<]+)</strong>' + +    _TEST = None diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py index d480fb58c..446f5901c 100644 --- a/youtube_dl/extractor/nowness.py +++ b/youtube_dl/extractor/nowness.py @@ -1,7 +1,10 @@  # encoding: utf-8  from __future__ import unicode_literals -from .brightcove import BrightcoveLegacyIE +from .brightcove import ( +    BrightcoveLegacyIE, +    BrightcoveNewIE, +)  from .common import InfoExtractor  from ..compat import compat_str  from ..utils import ( @@ -23,9 +26,12 @@ class 
NownessBaseIE(InfoExtractor):                              note='Downloading player JavaScript',                              errnote='Unable to download player JavaScript')                          bc_url = BrightcoveLegacyIE._extract_brightcove_url(player_code) -                        if bc_url is None: -                            raise ExtractorError('Could not find player definition') -                        return self.url_result(bc_url, 'BrightcoveLegacy') +                        if bc_url: +                            return self.url_result(bc_url, BrightcoveLegacyIE.ie_key()) +                        bc_url = BrightcoveNewIE._extract_url(player_code) +                        if bc_url: +                            return self.url_result(bc_url, BrightcoveNewIE.ie_key()) +                        raise ExtractorError('Could not find player definition')                      elif source == 'vimeo':                          return self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo')                      elif source == 'youtube': diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index 67e34b294..fd107aca2 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -71,7 +71,7 @@ class NowTVBaseIE(InfoExtractor):  class NowTVIE(NowTVBaseIE): -    _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<show_id>[^/]+)/(?:list/[^/]+/)?(?P<id>[^/]+)/(?:player|preview)' +    _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<show_id>[^/]+)/(?:(?:list/[^/]+|jahr/\d{4}/\d{1,2})/)?(?P<id>[^/]+)/(?:player|preview)'      _TESTS = [{          # rtl @@ -190,6 +190,9 @@ class NowTVIE(NowTVBaseIE):      }, {          'url': 'http://www.nowtv.de/rtl2/echtzeit/list/aktuell/schnelles-geld-am-ende-der-welt/player',          'only_matching': True, +    }, { +        'url': 
'http://www.nowtv.de/rtl2/zuhause-im-glueck/jahr/2015/11/eine-erschuetternde-diagnose/player', +        'only_matching': True,      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/nowvideo.py b/youtube_dl/extractor/nowvideo.py deleted file mode 100644 index 57ee3d366..000000000 --- a/youtube_dl/extractor/nowvideo.py +++ /dev/null @@ -1,28 +0,0 @@ -from __future__ import unicode_literals - -from .novamov import NovaMovIE - - -class NowVideoIE(NovaMovIE): -    IE_NAME = 'nowvideo' -    IE_DESC = 'NowVideo' - -    _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'nowvideo\.(?:to|ch|ec|sx|eu|at|ag|co|li)'} - -    _HOST = 'www.nowvideo.to' - -    _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' -    _FILEKEY_REGEX = r'var fkzd="([^"]+)";' -    _TITLE_REGEX = r'<h4>([^<]+)</h4>' -    _DESCRIPTION_REGEX = r'</h4>\s*<p>([^<]+)</p>' - -    _TEST = { -        'url': 'http://www.nowvideo.ch/video/0mw0yow7b6dxa', -        'md5': 'f8fbbc8add72bd95b7850c6a02fc8817', -        'info_dict': { -            'id': '0mw0yow7b6dxa', -            'ext': 'flv', -            'title': 'youtubedl test video _BaW_jenozKc.mp4', -            'description': 'Description', -        } -    } diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 35067e271..8603fd692 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -27,6 +27,7 @@ class OoyalaBaseIE(InfoExtractor):              'duration': float_or_none(metadata.get('duration'), 1000),          } +        urls = []          formats = []          for supported_format in ('mp4', 'm3u8', 'hds', 'rtmp'):              auth_data = self._download_json( @@ -38,20 +39,28 @@ class OoyalaBaseIE(InfoExtractor):              if cur_auth_data['authorized']:                  for stream in cur_auth_data['streams']:                      url = base64.b64decode(stream['url']['data'].encode('ascii')).decode('utf-8') +                    if url in urls: +     
                   continue +                    urls.append(url)                      delivery_type = stream['delivery_type'] -                    if delivery_type == 'remote_asset': -                        video_info['url'] = url -                        return video_info -                    if delivery_type == 'hls': -                        formats.extend(self._extract_m3u8_formats(url, embed_code, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) -                    elif delivery_type == 'hds': -                        formats.extend(self._extract_f4m_formats(url, embed_code, -1, 'hds', fatal=False)) +                    if delivery_type == 'hls' or '.m3u8' in url: +                        m3u8_formats = self._extract_m3u8_formats(url, embed_code, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) +                        if m3u8_formats: +                            formats.extend(m3u8_formats) +                    elif delivery_type == 'hds' or '.f4m' in url: +                        f4m_formats = self._extract_f4m_formats(url, embed_code, f4m_id='hds', fatal=False) +                        if f4m_formats: +                            formats.extend(f4m_formats) +                    elif '.smil' in url: +                        smil_formats = self._extract_smil_formats(url, embed_code, fatal=False) +                        if smil_formats: +                            formats.extend(smil_formats)                      else:                          formats.append({                              'url': url,                              'ext': stream.get('delivery_type'),                              'vcodec': stream.get('video_codec'), -                            'format_id': '%s-%s-%sp' % (stream.get('profile'), delivery_type, stream.get('height')), +                            'format_id': delivery_type,                              'width': int_or_none(stream.get('width')),                              'height': int_or_none(stream.get('height')),         
                     'abr': int_or_none(stream.get('audio_bitrate')), diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index b787e2a73..97e8ffc97 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -15,16 +15,181 @@ from ..utils import (  class PBSIE(InfoExtractor): +    _STATIONS = ( +        (r'(?:video|www|player)\.pbs\.org', 'PBS: Public Broadcasting Service'),  # http://www.pbs.org/ +        (r'video\.aptv\.org', 'APT - Alabama Public Television (WBIQ)'),  # http://aptv.org/ +        (r'video\.gpb\.org', 'GPB/Georgia Public Broadcasting (WGTV)'),  # http://www.gpb.org/ +        (r'video\.mpbonline\.org', 'Mississippi Public Broadcasting (WMPN)'),  # http://www.mpbonline.org +        (r'video\.wnpt\.org', 'Nashville Public Television (WNPT)'),  # http://www.wnpt.org +        (r'video\.wfsu\.org', 'WFSU-TV (WFSU)'),  # http://wfsu.org/ +        (r'video\.wsre\.org', 'WSRE (WSRE)'),  # http://www.wsre.org +        (r'video\.wtcitv\.org', 'WTCI (WTCI)'),  # http://www.wtcitv.org +        (r'video\.pba\.org', 'WPBA/Channel 30 (WPBA)'),  # http://pba.org/ +        (r'video\.alaskapublic\.org', 'Alaska Public Media (KAKM)'),  # http://alaskapublic.org/kakm +        # (r'kuac\.org', 'KUAC (KUAC)'),  # http://kuac.org/kuac-tv/ +        # (r'ktoo\.org', '360 North (KTOO)'),  # http://www.ktoo.org/ +        # (r'azpm\.org', 'KUAT 6 (KUAT)'),  # http://www.azpm.org/ +        (r'video\.azpbs\.org', 'Arizona PBS (KAET)'),  # http://www.azpbs.org +        (r'portal\.knme\.org', 'KNME-TV/Channel 5 (KNME)'),  # http://www.newmexicopbs.org/ +        (r'video\.vegaspbs\.org', 'Vegas PBS (KLVX)'),  # http://vegaspbs.org/ +        (r'watch\.aetn\.org', 'AETN/ARKANSAS ETV NETWORK (KETS)'),  # http://www.aetn.org/ +        (r'video\.ket\.org', 'KET (WKLE)'),  # http://www.ket.org/ +        (r'video\.wkno\.org', 'WKNO/Channel 10 (WKNO)'),  # http://www.wkno.org/ +        (r'video\.lpb\.org', 'LPB/LOUISIANA PUBLIC BROADCASTING (WLPB)'),  
# http://www.lpb.org/ +        (r'videos\.oeta\.tv', 'OETA (KETA)'),  # http://www.oeta.tv +        (r'video\.optv\.org', 'Ozarks Public Television (KOZK)'),  # http://www.optv.org/ +        (r'watch\.wsiu\.org', 'WSIU Public Broadcasting (WSIU)'),  # http://www.wsiu.org/ +        (r'video\.keet\.org', 'KEET TV (KEET)'),  # http://www.keet.org +        (r'pbs\.kixe\.org', 'KIXE/Channel 9 (KIXE)'),  # http://kixe.org/ +        (r'video\.kpbs\.org', 'KPBS San Diego (KPBS)'),  # http://www.kpbs.org/ +        (r'video\.kqed\.org', 'KQED (KQED)'),  # http://www.kqed.org +        (r'vids\.kvie\.org', 'KVIE Public Television (KVIE)'),  # http://www.kvie.org +        (r'video\.pbssocal\.org', 'PBS SoCal/KOCE (KOCE)'),  # http://www.pbssocal.org/ +        (r'video\.valleypbs\.org', 'ValleyPBS (KVPT)'),  # http://www.valleypbs.org/ +        (r'video\.cptv\.org', 'CONNECTICUT PUBLIC TELEVISION (WEDH)'),  # http://cptv.org +        (r'watch\.knpb\.org', 'KNPB Channel 5 (KNPB)'),  # http://www.knpb.org/ +        (r'video\.soptv\.org', 'SOPTV (KSYS)'),  # http://www.soptv.org +        # (r'klcs\.org', 'KLCS/Channel 58 (KLCS)'),  # http://www.klcs.org +        # (r'krcb\.org', 'KRCB Television & Radio (KRCB)'),  # http://www.krcb.org +        # (r'kvcr\.org', 'KVCR TV/DT/FM :: Vision for the Future (KVCR)'),  # http://kvcr.org +        (r'video\.rmpbs\.org', 'Rocky Mountain PBS (KRMA)'),  # http://www.rmpbs.org +        (r'video\.kenw\.org', 'KENW-TV3 (KENW)'),  # http://www.kenw.org +        (r'video\.kued\.org', 'KUED Channel 7 (KUED)'),  # http://www.kued.org +        (r'video\.wyomingpbs\.org', 'Wyoming PBS (KCWC)'),  # http://www.wyomingpbs.org +        (r'video\.cpt12\.org', 'Colorado Public Television / KBDI 12 (KBDI)'),  # http://www.cpt12.org/ +        (r'video\.kbyueleven\.org', 'KBYU-TV (KBYU)'),  # http://www.kbyutv.org/ +        (r'video\.thirteen\.org', 'Thirteen/WNET New York (WNET)'),  # http://www.thirteen.org +        (r'video\.wgbh\.org', 'WGBH/Channel 2 
(WGBH)'),  # http://wgbh.org +        (r'video\.wgby\.org', 'WGBY (WGBY)'),  # http://www.wgby.org +        (r'watch\.njtvonline\.org', 'NJTV Public Media NJ (WNJT)'),  # http://www.njtvonline.org/ +        # (r'ripbs\.org', 'Rhode Island PBS (WSBE)'),  # http://www.ripbs.org/home/ +        (r'watch\.wliw\.org', 'WLIW21 (WLIW)'),  # http://www.wliw.org/ +        (r'video\.mpt\.tv', 'mpt/Maryland Public Television (WMPB)'),  # http://www.mpt.org +        (r'watch\.weta\.org', 'WETA Television and Radio (WETA)'),  # http://www.weta.org +        (r'video\.whyy\.org', 'WHYY (WHYY)'),  # http://www.whyy.org +        (r'video\.wlvt\.org', 'PBS 39 (WLVT)'),  # http://www.wlvt.org/ +        (r'video\.wvpt\.net', 'WVPT - Your Source for PBS and More! (WVPT)'),  # http://www.wvpt.net +        (r'video\.whut\.org', 'Howard University Television (WHUT)'),  # http://www.whut.org +        (r'video\.wedu\.org', 'WEDU PBS (WEDU)'),  # http://www.wedu.org +        (r'video\.wgcu\.org', 'WGCU Public Media (WGCU)'),  # http://www.wgcu.org/ +        # (r'wjct\.org', 'WJCT Public Broadcasting (WJCT)'),  # http://www.wjct.org +        (r'video\.wpbt2\.org', 'WPBT2 (WPBT)'),  # http://www.wpbt2.org +        (r'video\.wucftv\.org', 'WUCF TV (WUCF)'),  # http://wucftv.org +        (r'video\.wuft\.org', 'WUFT/Channel 5 (WUFT)'),  # http://www.wuft.org +        (r'watch\.wxel\.org', 'WXEL/Channel 42 (WXEL)'),  # http://www.wxel.org/home/ +        (r'video\.wlrn\.org', 'WLRN/Channel 17 (WLRN)'),  # http://www.wlrn.org/ +        (r'video\.wusf\.usf\.edu', 'WUSF Public Broadcasting (WUSF)'),  # http://wusf.org/ +        (r'video\.scetv\.org', 'ETV (WRLK)'),  # http://www.scetv.org +        (r'video\.unctv\.org', 'UNC-TV (WUNC)'),  # http://www.unctv.org/ +        # (r'pbsguam\.org', 'PBS Guam (KGTF)'),  # http://www.pbsguam.org/ +        (r'video\.pbshawaii\.org', 'PBS Hawaii - Oceanic Cable Channel 10 (KHET)'),  # http://www.pbshawaii.org/ +        (r'video\.idahoptv\.org', 'Idaho Public 
Television (KAID)'),  # http://idahoptv.org +        (r'video\.ksps\.org', 'KSPS (KSPS)'),  # http://www.ksps.org/home/ +        (r'watch\.opb\.org', 'OPB (KOPB)'),  # http://www.opb.org +        (r'watch\.nwptv\.org', 'KWSU/Channel 10 & KTNW/Channel 31 (KWSU)'),  # http://www.kwsu.org +        (r'video\.will\.illinois\.edu', 'WILL-TV (WILL)'),  # http://will.illinois.edu/ +        (r'video\.networkknowledge\.tv', 'Network Knowledge - WSEC/Springfield (WSEC)'),  # http://www.wsec.tv +        (r'video\.wttw\.com', 'WTTW11 (WTTW)'),  # http://www.wttw.com/ +        # (r'wtvp\.org', 'WTVP & WTVP.org, Public Media for Central Illinois (WTVP)'),  # http://www.wtvp.org/ +        (r'video\.iptv\.org', 'Iowa Public Television/IPTV (KDIN)'),  # http://www.iptv.org/ +        (r'video\.ninenet\.org', 'Nine Network (KETC)'),  # http://www.ninenet.org +        (r'video\.wfwa\.org', 'PBS39 Fort Wayne (WFWA)'),  # http://wfwa.org/ +        (r'video\.wfyi\.org', 'WFYI Indianapolis (WFYI)'),  # http://www.wfyi.org +        (r'video\.mptv\.org', 'Milwaukee Public Television (WMVS)'),  # http://www.mptv.org +        (r'video\.wnin\.org', 'WNIN (WNIN)'),  # http://www.wnin.org/ +        (r'video\.wnit\.org', 'WNIT Public Television (WNIT)'),  # http://www.wnit.org/ +        (r'video\.wpt\.org', 'WPT (WPNE)'),  # http://www.wpt.org/ +        (r'video\.wvut\.org', 'WVUT/Channel 22 (WVUT)'),  # http://wvut.org/ +        (r'video\.weiu\.net', 'WEIU/Channel 51 (WEIU)'),  # http://www.weiu.net +        (r'video\.wqpt\.org', 'WQPT-TV (WQPT)'),  # http://www.wqpt.org +        (r'video\.wycc\.org', 'WYCC PBS Chicago (WYCC)'),  # http://www.wycc.org +        # (r'lakeshorepublicmedia\.org', 'Lakeshore Public Television (WYIN)'),  # http://lakeshorepublicmedia.org/ +        (r'video\.wipb\.org', 'WIPB-TV (WIPB)'),  # http://wipb.org +        (r'video\.indianapublicmedia\.org', 'WTIU (WTIU)'),  # http://indianapublicmedia.org/tv/ +        (r'watch\.cetconnect\.org', 'CET  (WCET)'),  # 
http://www.cetconnect.org +        (r'video\.thinktv\.org', 'ThinkTVNetwork (WPTD)'),  # http://www.thinktv.org +        (r'video\.wbgu\.org', 'WBGU-TV (WBGU)'),  # http://wbgu.org +        (r'video\.wgvu\.org', 'WGVU TV (WGVU)'),  # http://www.wgvu.org/ +        (r'video\.netnebraska\.org', 'NET1 (KUON)'),  # http://netnebraska.org +        (r'video\.pioneer\.org', 'Pioneer Public Television (KWCM)'),  # http://www.pioneer.org +        (r'watch\.sdpb\.org', 'SDPB Television (KUSD)'),  # http://www.sdpb.org +        (r'video\.tpt\.org', 'TPT (KTCA)'),  # http://www.tpt.org +        (r'watch\.ksmq\.org', 'KSMQ (KSMQ)'),  # http://www.ksmq.org/ +        (r'watch\.kpts\.org', 'KPTS/Channel 8 (KPTS)'),  # http://www.kpts.org/ +        (r'watch\.ktwu\.org', 'KTWU/Channel 11 (KTWU)'),  # http://ktwu.org +        # (r'shptv\.org', 'Smoky Hills Public Television (KOOD)'),  # http://www.shptv.org +        # (r'kcpt\.org', 'KCPT Kansas City Public Television (KCPT)'),  # http://kcpt.org/ +        # (r'blueridgepbs\.org', 'Blue Ridge PBS (WBRA)'),  # http://www.blueridgepbs.org/ +        (r'watch\.easttennesseepbs\.org', 'East Tennessee PBS (WSJK)'),  # http://easttennesseepbs.org +        (r'video\.wcte\.tv', 'WCTE-TV (WCTE)'),  # http://www.wcte.org +        (r'video\.wljt\.org', 'WLJT, Channel 11 (WLJT)'),  # http://wljt.org/ +        (r'video\.wosu\.org', 'WOSU TV (WOSU)'),  # http://wosu.org/ +        (r'video\.woub\.org', 'WOUB/WOUC (WOUB)'),  # http://woub.org/tv/index.php?section=5 +        (r'video\.wvpublic\.org', 'WVPB (WVPB)'),  # http://wvpublic.org/ +        (r'video\.wkyupbs\.org', 'WKYU-PBS (WKYU)'),  # http://www.wkyupbs.org +        # (r'wyes\.org', 'WYES-TV/New Orleans (WYES)'),  # http://www.wyes.org +        (r'video\.kera\.org', 'KERA 13 (KERA)'),  # http://www.kera.org/ +        (r'video\.mpbn\.net', 'MPBN (WCBB)'),  # http://www.mpbn.net/ +        (r'video\.mountainlake\.org', 'Mountain Lake PBS (WCFE)'),  # http://www.mountainlake.org/ +        
(r'video\.nhptv\.org', 'NHPTV (WENH)'),  # http://nhptv.org/ +        (r'video\.vpt\.org', 'Vermont PBS (WETK)'),  # http://www.vpt.org +        (r'video\.witf\.org', 'witf (WITF)'),  # http://www.witf.org +        (r'watch\.wqed\.org', 'WQED Multimedia (WQED)'),  # http://www.wqed.org/ +        (r'video\.wmht\.org', 'WMHT Educational Telecommunications (WMHT)'),  # http://www.wmht.org/home/ +        (r'video\.deltabroadcasting\.org', 'Q-TV (WDCQ)'),  # http://www.deltabroadcasting.org +        (r'video\.dptv\.org', 'WTVS Detroit Public TV (WTVS)'),  # http://www.dptv.org/ +        (r'video\.wcmu\.org', 'CMU Public Television (WCMU)'),  # http://www.wcmu.org +        (r'video\.wkar\.org', 'WKAR-TV (WKAR)'),  # http://wkar.org/ +        (r'wnmuvideo\.nmu\.edu', 'WNMU-TV Public TV 13 (WNMU)'),  # http://wnmutv.nmu.edu +        (r'video\.wdse\.org', 'WDSE - WRPT (WDSE)'),  # http://www.wdse.org/ +        (r'video\.wgte\.org', 'WGTE TV (WGTE)'),  # http://www.wgte.org +        (r'video\.lptv\.org', 'Lakeland Public Television (KAWE)'),  # http://www.lakelandptv.org +        # (r'prairiepublic\.org', 'PRAIRIE PUBLIC (KFME)'),  # http://www.prairiepublic.org/ +        (r'video\.kmos\.org', 'KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS)'),  # http://www.kmos.org/ +        (r'watch\.montanapbs\.org', 'MontanaPBS (KUSM)'),  # http://montanapbs.org +        (r'video\.krwg\.org', 'KRWG/Channel 22 (KRWG)'),  # http://www.krwg.org +        (r'video\.kacvtv\.org', 'KACV (KACV)'),  # http://www.panhandlepbs.org/home/ +        (r'video\.kcostv\.org', 'KCOS/Channel 13 (KCOS)'),  # www.kcostv.org +        (r'video\.wcny\.org', 'WCNY/Channel 24 (WCNY)'),  # http://www.wcny.org +        (r'video\.wned\.org', 'WNED (WNED)'),  # http://www.wned.org/ +        (r'watch\.wpbstv\.org', 'WPBS (WPBS)'),  # http://www.wpbstv.org +        (r'video\.wskg\.org', 'WSKG Public TV (WSKG)'),  # http://wskg.org +        (r'video\.wxxi\.org', 'WXXI (WXXI)'),  # http://wxxi.org +        
(r'video\.wpsu\.org', 'WPSU (WPSU)'),  # http://www.wpsu.org +        # (r'wqln\.org', 'WQLN/Channel 54 (WQLN)'),  # http://www.wqln.org +        (r'on-demand\.wvia\.org', 'WVIA Public Media Studios (WVIA)'),  # http://www.wvia.org/ +        (r'video\.wtvi\.org', 'WTVI (WTVI)'),  # http://www.wtvi.org/ +        # (r'whro\.org', 'WHRO (WHRO)'),  # http://whro.org +        (r'video\.westernreservepublicmedia\.org', 'Western Reserve PBS (WNEO)'),  # http://www.WesternReservePublicMedia.org/ +        (r'video\.ideastream\.org', 'WVIZ/PBS ideastream (WVIZ)'),  # http://www.wviz.org/ +        (r'video\.kcts9\.org', 'KCTS 9 (KCTS)'),  # http://kcts9.org/ +        (r'video\.basinpbs\.org', 'Basin PBS (KPBT)'),  # http://www.basinpbs.org +        (r'video\.houstonpbs\.org', 'KUHT / Channel 8 (KUHT)'),  # http://www.houstonpublicmedia.org/ +        # (r'tamu\.edu', 'KAMU - TV (KAMU)'),  # http://KAMU.tamu.edu +        # (r'kedt\.org', 'KEDT/Channel 16 (KEDT)'),  # http://www.kedt.org +        (r'video\.klrn\.org', 'KLRN (KLRN)'),  # http://www.klrn.org +        (r'video\.klru\.tv', 'KLRU (KLRU)'),  # http://www.klru.org +        # (r'kmbh\.org', 'KMBH-TV (KMBH)'),  # http://www.kmbh.org +        # (r'knct\.org', 'KNCT (KNCT)'),  # http://www.knct.org +        # (r'ktxt\.org', 'KTTZ-TV (KTXT)'),  # http://www.ktxt.org +        (r'video\.wtjx\.org', 'WTJX Channel 12 (WTJX)'),  # http://www.wtjx.org/ +        (r'video\.ideastations\.org', 'WCVE PBS (WCVE)'),  # http://ideastations.org/ +        (r'video\.kbtc\.org', 'KBTC Public Television (KBTC)'),  # http://kbtc.org +    ) + +    IE_NAME = 'pbs' +    IE_DESC = 'Public Broadcasting Service (PBS) and member stations: %s' % ', '.join(list(zip(*_STATIONS))[1]) +      _VALID_URL = r'''(?x)https?://          (?:             # Direct video URL -           video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? | +           (?:%s)/(?:viralplayer|video)/(?P<id>[0-9]+)/? 
|             # Article with embedded player (or direct video)             (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |             # Player             (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/          ) -    ''' +    ''' % '|'.join(list(zip(*_STATIONS))[0])      _TESTS = [          { @@ -174,6 +339,10 @@ class PBSIE(InfoExtractor):          {              'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',              'only_matching': True, +        }, +        { +            'url': 'http://watch.knpb.org/video/2365616055/', +            'only_matching': True,          }      ]      _ERRORS = { @@ -204,6 +373,7 @@ class PBSIE(InfoExtractor):              MEDIA_ID_REGEXES = [                  r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'",  # frontline video embed                  r'class="coveplayerid">([^<]+)<',                       # coveplayer +                r'<section[^>]+data-coveid="(\d+)"',                    # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/                  r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>',  # jwplayer              ] diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 63cc764bb..514e9b433 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -31,9 +31,8 @@ class PeriscopeIE(InfoExtractor):      }]      def _call_api(self, method, value): -        attribute = 'token' if len(value) > 13 else 'broadcast_id'          return self._download_json( -            'https://api.periscope.tv/api/v2/%s?%s=%s' % (method, attribute, value), value) +            'https://api.periscope.tv/api/v2/%s?broadcast_id=%s' % (method, value), value)      def _real_extract(self, url):          token = self._match_id(url) diff --git a/youtube_dl/extractor/phoenix.py 
b/youtube_dl/extractor/phoenix.py index 46cebc0d7..6ce2ec19d 100644 --- a/youtube_dl/extractor/phoenix.py +++ b/youtube_dl/extractor/phoenix.py @@ -1,10 +1,9 @@  from __future__ import unicode_literals -from .common import InfoExtractor -from .zdf import extract_from_xml_url +from .zdf import ZDFIE -class PhoenixIE(InfoExtractor): +class PhoenixIE(ZDFIE):      _VALID_URL = r'''(?x)https?://(?:www\.)?phoenix\.de/content/          (?:              phoenix/die_sendungen/(?:[^/]+/)? @@ -41,5 +40,5 @@ class PhoenixIE(InfoExtractor):              r'<div class="phx_vod" id="phx_vod_([0-9]+)"',              webpage, 'internal video ID') -        api_url = 'http://www.phoenix.de/php/zdfplayer-v1.3/data/beitragsDetails.php?ak=web&id=%s' % internal_id -        return extract_from_xml_url(self, video_id, api_url) +        api_url = 'http://www.phoenix.de/php/mediaplayer/data/beitrags_details.php?ak=web&id=%s' % internal_id +        return self.extract_from_xml_url(video_id, api_url) diff --git a/youtube_dl/extractor/pladform.py b/youtube_dl/extractor/pladform.py index 551c8c9f0..bc559d1df 100644 --- a/youtube_dl/extractor/pladform.py +++ b/youtube_dl/extractor/pladform.py @@ -1,6 +1,8 @@  # coding: utf-8  from __future__ import unicode_literals +import re +  from .common import InfoExtractor  from ..utils import (      ExtractorError, @@ -44,6 +46,13 @@ class PladformIE(InfoExtractor):          'only_matching': True,      }] +    @staticmethod +    def _extract_url(webpage): +        mobj = re.search( +            r'<iframe[^>]+src="(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)"', webpage) +        if mobj: +            return mobj.group('url') +      def _real_extract(self, url):          video_id = self._match_id(url) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 7ff1d06c4..278b1d2bf 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -8,20 +8,24 @@ from ..compat import (      compat_urlparse,  )  from ..utils 
import ( +    ExtractorError, +    determine_ext,      parse_duration,      unified_strdate, +    int_or_none, +    xpath_text,  ) -class RaiIE(InfoExtractor): -    _VALID_URL = r'(?P<url>(?P<host>http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it))/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html)' +class RaiTVIE(InfoExtractor): +    _VALID_URL = r'http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'      _TESTS = [          {              'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html', -            'md5': 'c064c0b2d09c278fb293116ef5d0a32d', +            'md5': '96382709b61dd64a6b88e0f791e6df4c',              'info_dict': {                  'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', -                'ext': 'mp4', +                'ext': 'flv',                  'title': 'Report del 07/04/2014',                  'description': 'md5:f27c544694cacb46a078db84ec35d2d9',                  'upload_date': '20140407', @@ -30,16 +34,14 @@ class RaiIE(InfoExtractor):          },          {              'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', -            'md5': '8bb9c151924ce241b74dd52ef29ceafa', +            'md5': 'd9751b78eac9710d62c2447b224dea39',              'info_dict': {                  'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', -                'ext': 'mp4', +                'ext': 'flv',                  'title': 'TG PRIMO TEMPO', -                'description': '',                  'upload_date': '20140612',                  'duration': 1758,              }, -            'skip': 'Error 404',          },          {              'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html', @@ -55,110 +57,106 @@ class 
RaiIE(InfoExtractor):          },          {              'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-b4a49761-e0cc-4b14-8736-2729f6f73132-tg2.html', -            'md5': '35694f062977fe6619943f08ed935730',              'info_dict': {                  'id': 'b4a49761-e0cc-4b14-8736-2729f6f73132',                  'ext': 'mp4',                  'title': 'Alluvione in Sardegna e dissesto idrogeologico',                  'description': 'Edizione delle ore 20:30 ', -            } +            }, +            'skip': 'invalid urls',          },          {              'url': 'http://www.ilcandidato.rai.it/dl/ray/media/Il-Candidato---Primo-episodio-Le-Primarie-28e5525a-b495-45e8-a7c3-bc48ba45d2b6.html', -            'md5': '02b64456f7cc09f96ff14e7dd489017e', +            'md5': '496ab63e420574447f70d02578333437',              'info_dict': {                  'id': '28e5525a-b495-45e8-a7c3-bc48ba45d2b6',                  'ext': 'flv',                  'title': 'Il Candidato - Primo episodio: "Le Primarie"', -                'description': 'Primo appuntamento con "Il candidato" con Filippo Timi, alias Piero Zucca presidente!', -                'uploader': 'RaiTre', +                'description': 'md5:364b604f7db50594678f483353164fb8', +                'upload_date': '20140923', +                'duration': 386,              }          }, -        { -            'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', -            'md5': '037104d2c14132887e5e4cf114569214', -            'info_dict': { -                'id': '0c7a664b-d0f4-4b2c-8835-3f82e46f433e', -                'ext': 'flv', -                'title': 'Il pacco', -                'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', -                'uploader': 'RaiTre', -                'upload_date': '20141221', -            }, -        }      ] -    def _extract_relinker_url(self, webpage): -        return 
self._proto_relative_url(self._search_regex( -            [r'name="videourl" content="([^"]+)"', r'var\s+videoURL(?:_MP4)?\s*=\s*"([^"]+)"'], -            webpage, 'relinker url', default=None)) -      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') -        host = mobj.group('host') +        video_id = self._match_id(url) +        media = self._download_json( +            'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % video_id, +            video_id, 'Downloading video JSON') -        webpage = self._download_webpage(url, video_id) +        thumbnails = [] +        for image_type in ('image', 'image_medium', 'image_300'): +            thumbnail_url = media.get(image_type) +            if thumbnail_url: +                thumbnails.append({ +                    'url': thumbnail_url, +                }) -        relinker_url = self._extract_relinker_url(webpage) - -        if not relinker_url: -            iframe_url = self._search_regex( -                [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"', -                 r'drawMediaRaiTV\(["\'](.+?)["\']'], -                webpage, 'iframe') -            if not iframe_url.startswith('http'): -                iframe_url = compat_urlparse.urljoin(url, iframe_url) -            webpage = self._download_webpage( -                iframe_url, video_id) -            relinker_url = self._extract_relinker_url(webpage) - -        relinker = self._download_json( -            '%s&output=47' % relinker_url, video_id) - -        media_url = relinker['video'][0] -        ct = relinker.get('ct') -        if ct == 'f4m': -            formats = self._extract_f4m_formats( -                media_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id) -        else: -            formats = [{ -                'url': media_url, -                'format_id': ct, -            }] +        subtitles = [] +        formats = [] +        media_type = 
media['type'] +        if 'Audio' in media_type: +            formats.append({ +                'format_id': media.get('formatoAudio'), +                'url': media['audioUrl'], +                'ext': media.get('formatoAudio'), +            }) +        elif 'Video' in media_type: +            def fix_xml(xml): +                return xml.replace(' tag elementi', '').replace('>/', '</') + +            relinker = self._download_xml( +                media['mediaUri'] + '&output=43', video_id, transform_source=fix_xml) -        json_link = self._html_search_meta( -            'jsonlink', webpage, 'JSON link', default=None) -        if json_link: -            media = self._download_json( -                host + json_link, video_id, 'Downloading video JSON') -            title = media.get('name') -            description = media.get('desc') -            thumbnail = media.get('image_300') or media.get('image_medium') or media.get('image') -            duration = parse_duration(media.get('length')) -            uploader = media.get('author') -            upload_date = unified_strdate(media.get('date')) +            has_subtitle = False + +            for element in relinker.findall('element'): +                media_url = xpath_text(element, 'url') +                ext = determine_ext(media_url) +                content_type = xpath_text(element, 'content-type') +                if ext == 'm3u8': +                    m3u8_formats = self._extract_m3u8_formats( +                        media_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', +                        fatal=False) +                    if m3u8_formats: +                        formats.extend(m3u8_formats) +                elif ext == 'f4m': +                    f4m_formats = self._extract_f4m_formats( +                        media_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', +                        video_id, f4m_id='hds', fatal=False) +                    if f4m_formats: +                        
formats.extend(f4m_formats) +                elif ext == 'stl': +                    has_subtitle = True +                elif content_type.startswith('video/'): +                    bitrate = int_or_none(xpath_text(element, 'bitrate')) +                    formats.append({ +                        'url': media_url, +                        'tbr': bitrate if bitrate > 0 else None, +                        'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', +                    }) +                elif content_type.startswith('image/'): +                    thumbnails.append({ +                        'url': media_url, +                    }) + +            self._sort_formats(formats) + +            if has_subtitle: +                webpage = self._download_webpage(url, video_id) +                subtitles = self._get_subtitles(video_id, webpage)          else: -            title = (self._search_regex( -                r'var\s+videoTitolo\s*=\s*"(.+?)";', -                webpage, 'title', default=None) or self._og_search_title(webpage)).replace('\\"', '"') -            description = self._og_search_description(webpage) -            thumbnail = self._og_search_thumbnail(webpage) -            duration = None -            uploader = self._html_search_meta('Editore', webpage, 'uploader') -            upload_date = unified_strdate(self._html_search_meta( -                'item-date', webpage, 'upload date', default=None)) - -        subtitles = self.extract_subtitles(video_id, webpage) +            raise ExtractorError('not a media file')          return {              'id': video_id, -            'title': title, -            'description': description, -            'thumbnail': thumbnail, -            'uploader': uploader, -            'upload_date': upload_date, -            'duration': duration, +            'title': media['name'], +            'description': media.get('desc'), +            'thumbnails': thumbnails, +            'uploader': 
media.get('author'), +            'upload_date': unified_strdate(media.get('date')), +            'duration': parse_duration(media.get('length')),              'formats': formats,              'subtitles': subtitles,          } @@ -177,3 +175,36 @@ class RaiIE(InfoExtractor):                  'url': 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions),              }]          return subtitles + + +class RaiIE(InfoExtractor): +    _VALID_URL = r'http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' +    _TESTS = [ +        { +            'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', +            'md5': 'e0e7a8a131e249d1aa0ebf270d1d8db7', +            'info_dict': { +                'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', +                'ext': 'flv', +                'title': 'Il pacco', +                'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', +                'upload_date': '20141221', +            }, +        } +    ] + +    @classmethod +    def suitable(cls, url): +        return False if RaiTVIE.suitable(url) else super(RaiIE, cls).suitable(url) + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) + +        iframe_url = self._search_regex( +            [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"', +             r'drawMediaRaiTV\(["\'](.+?)["\']'], +            webpage, 'iframe') +        if not iframe_url.startswith('http'): +            iframe_url = compat_urlparse.urljoin(url, iframe_url) +        return self.url_result(iframe_url) diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 6b09550b0..9db62adb1 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -17,9 +17,9 @@ from ..utils import (  class RutubeIE(InfoExtractor):      IE_NAME = 
'rutube'      IE_DESC = 'Rutube videos' -    _VALID_URL = r'https?://rutube\.ru/video/(?P<id>[\da-z]{32})' +    _VALID_URL = r'https?://rutube\.ru/(?:video|play/embed)/(?P<id>[\da-z]{32})' -    _TEST = { +    _TESTS = [{          'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',          'info_dict': {              'id': '3eac3b4561676c17df9132a9a1e62e3e', @@ -36,7 +36,10 @@ class RutubeIE(InfoExtractor):              # It requires ffmpeg (m3u8 download)              'skip_download': True,          }, -    } +    }, { +        'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661', +        'only_matching': True, +    }]      def _real_extract(self, url):          video_id = self._match_id(url) diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py index d9df06861..f7fe1fece 100644 --- a/youtube_dl/extractor/rutv.py +++ b/youtube_dl/extractor/rutv.py @@ -131,7 +131,7 @@ class RUTVIE(InfoExtractor):          is_live = video_type == 'live'          json_data = self._download_json( -            'http://player.rutv.ru/iframe/%splay/id/%s' % ('live-' if is_live else '', video_id), +            'http://player.rutv.ru/iframe/data%s/id/%s' % ('live' if is_live else 'video', video_id),              video_id, 'Downloading JSON')          if json_data['errors']: diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 919704261..7de7b7273 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -6,12 +6,12 @@ import re  from .common import InfoExtractor  from .brightcove import BrightcoveLegacyIE -from ..compat import compat_urllib_parse  from ..utils import (      ExtractorError,      sanitized_Request,      smuggle_url,      std_headers, +    urlencode_postdata,  ) @@ -57,7 +57,7 @@ class SafariBaseIE(InfoExtractor):          }          request = sanitized_Request( -            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form), headers=headers) +            
self._LOGIN_URL, urlencode_postdata(login_form), headers=headers)          login_page = self._download_webpage(              request, None, 'Logging in as %s' % username) diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index daf6ad555..ea8fc258d 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -158,6 +158,7 @@ class SohuIE(InfoExtractor):                          'file': clips_url[i],                          'new': su[i],                          'prod': 'flash', +                        'rb': 1,                      }                      if cdnId is not None: diff --git a/youtube_dl/extractor/soompi.py b/youtube_dl/extractor/soompi.py deleted file mode 100644 index 5da66ca9e..000000000 --- a/youtube_dl/extractor/soompi.py +++ /dev/null @@ -1,146 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -import re - -from .crunchyroll import CrunchyrollIE - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( -    ExtractorError, -    int_or_none, -    remove_start, -    xpath_text, -) - - -class SoompiBaseIE(InfoExtractor): -    def _get_episodes(self, webpage, episode_filter=None): -        episodes = self._parse_json( -            self._search_regex( -                r'VIDEOS\s*=\s*(\[.+?\]);', webpage, 'episodes JSON'), -            None) -        return list(filter(episode_filter, episodes)) - - -class SoompiIE(SoompiBaseIE, CrunchyrollIE): -    IE_NAME = 'soompi' -    _VALID_URL = r'https?://tv\.soompi\.com/(?:en/)?watch/(?P<id>[0-9]+)' -    _TESTS = [{ -        'url': 'http://tv.soompi.com/en/watch/29235', -        'info_dict': { -            'id': '29235', -            'ext': 'mp4', -            'title': 'Episode 1096', -            'description': '2015-05-20' -        }, -        'params': { -            'skip_download': True, -        }, -    }] - -    def _get_episode(self, webpage, video_id): -        return self._get_episodes(webpage, 
lambda x: x['id'] == video_id)[0] - -    def _get_subtitles(self, config, video_id): -        sub_langs = {} -        for subtitle in config.findall('./{default}preload/subtitles/subtitle'): -            sub_langs[subtitle.attrib['id']] = subtitle.attrib['title'] - -        subtitles = {} -        for s in config.findall('./{default}preload/subtitle'): -            lang_code = sub_langs.get(s.attrib['id']) -            if not lang_code: -                continue -            sub_id = s.get('id') -            data = xpath_text(s, './data', 'data') -            iv = xpath_text(s, './iv', 'iv') -            if not id or not iv or not data: -                continue -            subtitle = self._decrypt_subtitles(data, iv, sub_id).decode('utf-8') -            subtitles[lang_code] = self._extract_subtitles(subtitle) -        return subtitles - -    def _real_extract(self, url): -        video_id = self._match_id(url) - -        try: -            webpage = self._download_webpage( -                url, video_id, 'Downloading episode page') -        except ExtractorError as ee: -            if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: -                webpage = ee.cause.read() -                block_message = self._html_search_regex( -                    r'(?s)<div class="block-message">(.+?)</div>', webpage, -                    'block message', default=None) -                if block_message: -                    raise ExtractorError(block_message, expected=True) -            raise - -        formats = [] -        config = None -        for format_id in re.findall(r'\?quality=([0-9a-zA-Z]+)', webpage): -            config = self._download_xml( -                'http://tv.soompi.com/en/show/_/%s-config.xml?mode=hls&quality=%s' % (video_id, format_id), -                video_id, 'Downloading %s XML' % format_id) -            m3u8_url = xpath_text( -                config, './{default}preload/stream_info/file', -                '%s m3u8 URL' % 
format_id) -            if not m3u8_url: -                continue -            formats.extend(self._extract_m3u8_formats( -                m3u8_url, video_id, 'mp4', m3u8_id=format_id)) -        self._sort_formats(formats) - -        episode = self._get_episode(webpage, video_id) - -        title = episode['name'] -        description = episode.get('description') -        duration = int_or_none(episode.get('duration')) - -        thumbnails = [{ -            'id': thumbnail_id, -            'url': thumbnail_url, -        } for thumbnail_id, thumbnail_url in episode.get('img_url', {}).items()] - -        subtitles = self.extract_subtitles(config, video_id) - -        return { -            'id': video_id, -            'title': title, -            'description': description, -            'thumbnails': thumbnails, -            'duration': duration, -            'formats': formats, -            'subtitles': subtitles -        } - - -class SoompiShowIE(SoompiBaseIE): -    IE_NAME = 'soompi:show' -    _VALID_URL = r'https?://tv\.soompi\.com/en/shows/(?P<id>[0-9a-zA-Z\-_]+)' -    _TESTS = [{ -        'url': 'http://tv.soompi.com/en/shows/liar-game', -        'info_dict': { -            'id': 'liar-game', -            'title': 'Liar Game', -            'description': 'md5:52c02bce0c1a622a95823591d0589b66', -        }, -        'playlist_count': 14, -    }] - -    def _real_extract(self, url): -        show_id = self._match_id(url) - -        webpage = self._download_webpage( -            url, show_id, 'Downloading show page') - -        title = remove_start(self._og_search_title(webpage), 'SoompiTV | ') -        description = self._og_search_description(webpage) - -        entries = [ -            self.url_result('http://tv.soompi.com/en/watch/%s' % episode['id'], 'Soompi') -            for episode in self._get_episodes(webpage)] - -        return self.playlist_result(entries, show_id, title, description) diff --git a/youtube_dl/extractor/sportdeutschland.py 
b/youtube_dl/extractor/sportdeutschland.py index ebb75f059..a9927f6e2 100644 --- a/youtube_dl/extractor/sportdeutschland.py +++ b/youtube_dl/extractor/sportdeutschland.py @@ -70,10 +70,12 @@ class SportDeutschlandIE(InfoExtractor):              smil_doc = self._download_xml(                  smil_url, video_id, note='Downloading SMIL metadata') -            base_url = smil_doc.find('./head/meta').attrib['base'] +            base_url_el = smil_doc.find('./head/meta') +            if base_url_el: +                base_url = base_url_el.attrib['base']              formats.extend([{                  'format_id': 'rmtp', -                'url': base_url, +                'url': base_url if base_url_el else n.attrib['src'],                  'play_path': n.attrib['src'],                  'ext': 'flv',                  'preference': -100, diff --git a/youtube_dl/extractor/srmediathek.py b/youtube_dl/extractor/srmediathek.py index 5d583c720..74d01183f 100644 --- a/youtube_dl/extractor/srmediathek.py +++ b/youtube_dl/extractor/srmediathek.py @@ -1,17 +1,18 @@  # encoding: utf-8  from __future__ import unicode_literals -import json +from .ard import ARDMediathekIE +from ..utils import ( +    ExtractorError, +    get_element_by_attribute, +) -from .common import InfoExtractor -from ..utils import js_to_json - -class SRMediathekIE(InfoExtractor): +class SRMediathekIE(ARDMediathekIE):      IE_DESC = 'Saarländischer Rundfunk'      _VALID_URL = r'https?://sr-mediathek\.sr-online\.de/index\.php\?.*?&id=(?P<id>[0-9]+)' -    _TEST = { +    _TESTS = [{          'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=28455',          'info_dict': {              'id': '28455', @@ -20,24 +21,36 @@ class SRMediathekIE(InfoExtractor):              'description': 'Ringen: KSV Köllerbach gegen Aachen-Walheim; Frauen-Fußball: 1. 
FC Saarbrücken gegen Sindelfingen; Motorsport: Rallye in Losheim; dazu: Interview mit Timo Bernhard; Turnen: TG Saar; Reitsport: Deutscher Voltigier-Pokal; Badminton: Interview mit Michael Fuchs ',              'thumbnail': 're:^https?://.*\.jpg$',          }, -    } +        'skip': 'no longer available', +    }, { +        'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=37682', +        'info_dict': { +            'id': '37682', +            'ext': 'mp4', +            'title': 'Love, Cakes and Rock\'n\'Roll', +            'description': 'md5:18bf9763631c7d326c22603681e1123d', +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }, +        'expected_warnings': ['Unable to download f4m manifest'] +    }]      def _real_extract(self, url):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) -        murls = json.loads(js_to_json(self._search_regex( -            r'var mediaURLs\s*=\s*(.*?);\n', webpage, 'video URLs'))) -        formats = [{'url': murl} for murl in murls] -        self._sort_formats(formats) - -        title = json.loads(js_to_json(self._search_regex( -            r'var mediaTitles\s*=\s*(.*?);\n', webpage, 'title')))[0] +        if '>Der gewünschte Beitrag ist leider nicht mehr verfügbar.<' in webpage: +            raise ExtractorError('Video %s is no longer available' % video_id, expected=True) -        return { +        media_collection_url = self._search_regex( +            r'data-mediacollection-ardplayer="([^"]+)"', webpage, 'media collection url') +        info = self._extract_media_info(media_collection_url, webpage, video_id) +        info.update({              'id': video_id, -            'title': title, -            'formats': formats, +            'title': get_element_by_attribute('class', 'ardplayer-title', webpage),              'description': self._og_search_description(webpage),              'thumbnail': 
self._og_search_thumbnail(webpage), -        } +        }) +        return info diff --git a/youtube_dl/extractor/tele13.py b/youtube_dl/extractor/tele13.py new file mode 100644 index 000000000..a363b4d40 --- /dev/null +++ b/youtube_dl/extractor/tele13.py @@ -0,0 +1,81 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..utils import ( +    js_to_json, +    qualities, +    determine_ext, +) + + +class Tele13IE(InfoExtractor): +    _VALID_URL = r'^http://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P<id>[\w-]+)' +    _TESTS = [ +        { +            'url': 'http://www.t13.cl/videos/actualidad/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', +            'md5': '4cb1fa38adcad8fea88487a078831755', +            'info_dict': { +                'id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', +                'ext': 'mp4', +                'title': 'El círculo de hierro de Michelle Bachelet en su regreso a La Moneda', +            }, +            'params': { +                # HTTP Error 404: Not Found +                'skip_download': True, +            }, +        }, +        { +            'url': 'http://www.t13.cl/videos/mundo/tendencias/video-captan-misteriosa-bola-fuego-cielos-bangkok', +            'md5': '867adf6a3b3fef932c68a71d70b70946', +            'info_dict': { +                'id': 'rOoKv2OMpOw', +                'ext': 'mp4', +                'title': 'Shooting star seen on 7-Sep-2015', +                'description': 'md5:7292ff2a34b2f673da77da222ae77e1e', +                'uploader': 'Porjai Jaturongkhakun', +                'upload_date': '20150906', +                'uploader_id': 'UCnLY_3ezwNcDSC_Wc6suZxw', +            }, +            'add_ie': ['Youtube'], +        } +    ] + +    def _real_extract(self, url): +        display_id = self._match_id(url) +        webpage = self._download_webpage(url, display_id) + +        
setup_js = self._search_regex(r"(?s)jwplayer\('player-vivo'\).setup\((\{.*?\})\)", webpage, 'setup code') +        sources = self._parse_json(self._search_regex(r'sources\s*:\s*(\[[^\]]+\])', setup_js, 'sources'), display_id, js_to_json) + +        preference = qualities(['Móvil', 'SD', 'HD']) +        formats = [] +        urls = [] +        for f in sources: +            format_url = f['file'] +            if format_url and format_url not in urls: +                ext = determine_ext(format_url) +                if ext == 'm3u8': +                    m3u8_formats = self._extract_m3u8_formats(format_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) +                    if m3u8_formats: +                        formats.extend(m3u8_formats) +                elif YoutubeIE.suitable(format_url): +                    return self.url_result(format_url, 'Youtube') +                else: +                    formats.append({ +                        'url': format_url, +                        'format_id': f.get('label'), +                        'preference': preference(f.get('label')), +                        'ext': ext, +                    }) +                urls.append(format_url) +        self._sort_formats(formats) + +        return { +            'id': display_id, +            'title': self._search_regex(r'title\s*:\s*"([^"]+)"', setup_js, 'title'), +            'description': self._html_search_meta('description', webpage, 'description'), +            'thumbnail': self._search_regex(r'image\s*:\s*"([^"]+)"', setup_js, 'thumbnail', default=None), +            'formats': formats, +        } diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 3a68eaa80..6890021cf 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -6,7 +6,7 @@ from .common import InfoExtractor  class TF1IE(InfoExtractor):      """TF1 uses the wat.tv player.""" -    _VALID_URL = 
r'http://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html' +    _VALID_URL = r'http://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/(?:[^/]+/)*(?P<id>.+?)\.html'      _TESTS = [{          'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',          'info_dict': { @@ -22,7 +22,7 @@ class TF1IE(InfoExtractor):      }, {          'url': 'http://www.tfou.fr/chuggington/videos/le-grand-mysterioso-chuggington-7085291-739.html',          'info_dict': { -            'id': '12043945', +            'id': 'le-grand-mysterioso-chuggington-7085291-739',              'ext': 'mp4',              'title': 'Le grand Mystérioso - Chuggington',              'description': 'Le grand Mystérioso - Emery rêve qu\'un article lui soit consacré dans le journal.', @@ -32,22 +32,24 @@ class TF1IE(InfoExtractor):              # Sometimes wat serves the whole file with the --test option              'skip_download': True,          }, +        'skip': 'HTTP Error 410: Gone',      }, {          'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html',          'only_matching': True,      }, {          'url': 'http://lci.tf1.fr/sept-a-huit/videos/sept-a-huit-du-24-mai-2015-8611550.html',          'only_matching': True, +    }, { +        'url': 'http://www.tf1.fr/hd1/documentaire/videos/mylene-farmer-d-une-icone.html', +        'only_matching': True,      }]      def _real_extract(self, url):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) -        embed_url = self._html_search_regex( -            r'["\'](https?://www.wat.tv/embedframe/.*?)["\']', webpage, 'embed url') -        embed_page = self._download_webpage(embed_url, video_id, -                                            'Downloading embed player page') -        wat_id = self._search_regex(r'UVID=(.*?)&', embed_page, 'wat id') +        wat_id = self._html_search_regex( +            
r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1', +            webpage, 'wat id', group='id')          wat_info = self._download_json(              'http://www.wat.tv/interface/contentv3/%s' % wat_id, video_id)          return self.url_result(wat_info['media']['url'], 'Wat') diff --git a/youtube_dl/extractor/theintercept.py b/youtube_dl/extractor/theintercept.py new file mode 100644 index 000000000..8cb3c3669 --- /dev/null +++ b/youtube_dl/extractor/theintercept.py @@ -0,0 +1,49 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( +    parse_iso8601, +    int_or_none, +    ExtractorError, +) + + +class TheInterceptIE(InfoExtractor): +    _VALID_URL = r'https://theintercept.com/fieldofvision/(?P<id>[^/?#]+)' +    _TESTS = [{ +        'url': 'https://theintercept.com/fieldofvision/thisisacoup-episode-four-surrender-or-die/', +        'md5': '145f28b41d44aab2f87c0a4ac8ec95bd', +        'info_dict': { +            'id': '46214', +            'ext': 'mp4', +            'title': '#ThisIsACoup – Episode Four: Surrender or Die', +            'description': 'md5:74dd27f0e2fbd50817829f97eaa33140', +            'timestamp': 1450429239, +            'upload_date': '20151218', +            'comment_count': int, +        } +    }] + +    def _real_extract(self, url): +        display_id = self._match_id(url) +        webpage = self._download_webpage(url, display_id) + +        json_data = self._parse_json(self._search_regex( +            r'initialStoreTree\s*=\s*(?P<json_data>{.+})', webpage, +            'initialStoreTree'), display_id) + +        for post in json_data['resources']['posts'].values(): +            if post['slug'] == display_id: +                return { +                    '_type': 'url_transparent', +                    'url': 'jwplatform:%s' % post['fov_videoid'], +                    'id': compat_str(post['ID']), +                    
'display_id': display_id, +                    'title': post['title'], +                    'description': post.get('excerpt'), +                    'timestamp': parse_iso8601(post.get('date')), +                    'comment_count': int_or_none(post.get('comments_number')), +                } +        raise ExtractorError('Unable to find the current post') diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 1555aa77c..0bf6726b5 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -16,11 +16,12 @@ from ..compat import (  from ..utils import (      determine_ext,      ExtractorError, -    xpath_with_ns, -    unsmuggle_url, +    float_or_none,      int_or_none, +    sanitized_Request, +    unsmuggle_url,      url_basename, -    float_or_none, +    xpath_with_ns,  )  default_ns = 'http://www.w3.org/2005/SMIL21/Language' @@ -204,7 +205,12 @@ class ThePlatformIE(ThePlatformBaseIE):              smil_url = url          # Explicitly specified SMIL (see https://github.com/rg3/youtube-dl/issues/7385)          elif '/guid/' in url: -            webpage = self._download_webpage(url, video_id) +            headers = {} +            source_url = smuggled_data.get('source_url') +            if source_url: +                headers['Referer'] = source_url +            request = sanitized_Request(url, headers=headers) +            webpage = self._download_webpage(request, video_id)              smil_url = self._search_regex(                  r'<link[^>]+href=(["\'])(?P<url>.+?)\1[^>]+type=["\']application/smil\+xml',                  webpage, 'smil url', group='url') diff --git a/youtube_dl/extractor/toggle.py b/youtube_dl/extractor/toggle.py new file mode 100644 index 000000000..a47239952 --- /dev/null +++ b/youtube_dl/extractor/toggle.py @@ -0,0 +1,194 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( +   
 determine_ext, +    ExtractorError, +    float_or_none, +    int_or_none, +    parse_iso8601, +    sanitized_Request, +) + + +class ToggleIE(InfoExtractor): +    IE_NAME = 'toggle' +    _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:series|clips|movies)/(?:[^/]+/)+(?P<id>[0-9]+)' +    _TESTS = [{ +        'url': 'http://video.toggle.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115', +        'info_dict': { +            'id': '343115', +            'ext': 'mp4', +            'title': 'Lion Moms Premiere', +            'description': 'md5:aea1149404bff4d7f7b6da11fafd8e6b', +            'upload_date': '20150910', +            'timestamp': 1441858274, +        }, +        'params': { +            'skip_download': 'm3u8 download', +        } +    }, { +        'note': 'DRM-protected video', +        'url': 'http://video.toggle.sg/en/movies/dug-s-special-mission/341413', +        'info_dict': { +            'id': '341413', +            'ext': 'wvm', +            'title': 'Dug\'s Special Mission', +            'description': 'md5:e86c6f4458214905c1772398fabc93e0', +            'upload_date': '20150827', +            'timestamp': 1440644006, +        }, +        'params': { +            'skip_download': 'DRM-protected wvm download', +        } +    }, { +        # this also tests correct video id extraction +        'note': 'm3u8 links are geo-restricted, but Android/mp4 is okay', +        'url': 'http://video.toggle.sg/en/series/28th-sea-games-5-show/28th-sea-games-5-show-ep11/332861', +        'info_dict': { +            'id': '332861', +            'ext': 'mp4', +            'title': '28th SEA Games (5 Show) -  Episode  11', +            'description': 'md5:3cd4f5f56c7c3b1340c50a863f896faa', +            'upload_date': '20150605', +            'timestamp': 1433480166, +        }, +        'params': { +            'skip_download': 'DRM-protected wvm download', +        }, +        'skip': 'm3u8 links are geo-restricted' +    }, { +        'url': 
'http://video.toggle.sg/en/clips/seraph-sun-aloysius-will-suddenly-sing-some-old-songs-in-high-pitch-on-set/343331', +        'only_matching': True, +    }, { +        'url': 'http://video.toggle.sg/zh/series/zero-calling-s2-hd/ep13/336367', +        'only_matching': True, +    }, { +        'url': 'http://video.toggle.sg/en/series/vetri-s2/webisodes/jeeva-is-an-orphan-vetri-s2-webisode-7/342302', +        'only_matching': True, +    }, { +        'url': 'http://video.toggle.sg/en/movies/seven-days/321936', +        'only_matching': True, +    }] + +    _FORMAT_PREFERENCES = { +        'wvm-STBMain': -10, +        'wvm-iPadMain': -20, +        'wvm-iPhoneMain': -30, +        'wvm-Android': -40, +    } +    _API_USER = 'tvpapi_147' +    _API_PASS = '11111' + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage( +            url, video_id, note='Downloading video page') + +        api_user = self._search_regex( +            r'apiUser\s*:\s*(["\'])(?P<user>.+?)\1', webpage, 'apiUser', +            default=self._API_USER, group='user') +        api_pass = self._search_regex( +            r'apiPass\s*:\s*(["\'])(?P<pass>.+?)\1', webpage, 'apiPass', +            default=self._API_PASS, group='pass') + +        params = { +            'initObj': { +                'Locale': { +                    'LocaleLanguage': '', +                    'LocaleCountry': '', +                    'LocaleDevice': '', +                    'LocaleUserState': 0 +                }, +                'Platform': 0, +                'SiteGuid': 0, +                'DomainID': '0', +                'UDID': '', +                'ApiUser': api_user, +                'ApiPass': api_pass +            }, +            'MediaID': video_id, +            'mediaType': 0, +        } + +        req = sanitized_Request( +            'http://tvpapi.as.tvinci.com/v2_9/gateways/jsonpostgw.aspx?m=GetMediaInfo', +            
json.dumps(params).encode('utf-8')) +        info = self._download_json(req, video_id, 'Downloading video info json') + +        title = info['MediaName'] + +        formats = [] +        for video_file in info.get('Files', []): +            video_url, vid_format = video_file.get('URL'), video_file.get('Format') +            if not video_url or not vid_format: +                continue +            ext = determine_ext(video_url) +            vid_format = vid_format.replace(' ', '') +            # if geo-restricted, m3u8 is inaccessible, but mp4 is okay +            if ext == 'm3u8': +                m3u8_formats = self._extract_m3u8_formats( +                    video_url, video_id, ext='mp4', m3u8_id=vid_format, +                    note='Downloading %s m3u8 information' % vid_format, +                    errnote='Failed to download %s m3u8 information' % vid_format, +                    fatal=False) +                if m3u8_formats: +                    formats.extend(m3u8_formats) +            elif ext in ('mp4', 'wvm'): +                # wvm are drm-protected files +                formats.append({ +                    'ext': ext, +                    'url': video_url, +                    'format_id': vid_format, +                    'preference': self._FORMAT_PREFERENCES.get(ext + '-' + vid_format) or -1, +                    'format_note': 'DRM-protected video' if ext == 'wvm' else None +                }) +        if not formats: +            # Most likely because geo-blocked +            raise ExtractorError('No downloadable videos found', expected=True) +        self._sort_formats(formats) + +        duration = int_or_none(info.get('Duration')) +        description = info.get('Description') +        created_at = parse_iso8601(info.get('CreationDate') or None) + +        average_rating = float_or_none(info.get('Rating')) +        view_count = int_or_none(info.get('ViewCounter') or info.get('view_counter')) +        like_count = 
int_or_none(info.get('LikeCounter') or info.get('like_counter')) + +        thumbnails = [] +        for picture in info.get('Pictures', []): +            if not isinstance(picture, dict): +                continue +            pic_url = picture.get('URL') +            if not pic_url: +                continue +            thumbnail = { +                'url': pic_url, +            } +            pic_size = picture.get('PicSize', '') +            m = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', pic_size) +            if m: +                thumbnail.update({ +                    'width': int(m.group('width')), +                    'height': int(m.group('height')), +                }) +            thumbnails.append(thumbnail) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'duration': duration, +            'timestamp': created_at, +            'average_rating': average_rating, +            'view_count': view_count, +            'like_count': like_count, +            'thumbnails': thumbnails, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/tunein.py b/youtube_dl/extractor/tunein.py index b6b1f2568..8322cc14d 100644 --- a/youtube_dl/extractor/tunein.py +++ b/youtube_dl/extractor/tunein.py @@ -2,74 +2,33 @@  from __future__ import unicode_literals  import json -import re  from .common import InfoExtractor  from ..utils import ExtractorError +from ..compat import compat_urlparse -class TuneInIE(InfoExtractor): -    _VALID_URL = r'''(?x)https?://(?:www\.)? 
-    (?: -        tunein\.com/ -        (?: -            radio/.*?-s| -            station/.*?StationId\= -        )(?P<id>[0-9]+) -        |tun\.in/(?P<redirect_id>[A-Za-z0-9]+) -    ) -    ''' -    _API_URL_TEMPLATE = 'http://tunein.com/tuner/tune/?stationId={0:}&tuneType=Station' - -    _INFO_DICT = { -        'id': '34682', -        'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2', -        'ext': 'aac', -        'thumbnail': 're:^https?://.*\.png$', -        'location': 'Tacoma, WA', -    } -    _TESTS = [ -        { -            'url': 'http://tunein.com/radio/Jazz24-885-s34682/', -            'info_dict': _INFO_DICT, -            'params': { -                'skip_download': True,  # live stream -            }, -        }, -        {  # test redirection -            'url': 'http://tun.in/ser7s', -            'info_dict': _INFO_DICT, -            'params': { -                'skip_download': True,  # live stream -            }, -        }, -    ] +class TuneInBaseIE(InfoExtractor): +    _API_BASE_URL = 'http://tunein.com/tuner/tune/'      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        redirect_id = mobj.group('redirect_id') -        if redirect_id: -            # The server doesn't support HEAD requests -            urlh = self._request_webpage( -                url, redirect_id, note='Downloading redirect page') -            url = urlh.geturl() -            self.to_screen('Following redirect: %s' % url) -            mobj = re.match(self._VALID_URL, url) -        station_id = mobj.group('id') - -        station_info = self._download_json( -            self._API_URL_TEMPLATE.format(station_id), -            station_id, note='Downloading station JSON') - -        title = station_info['Title'] -        thumbnail = station_info.get('Logo') -        location = station_info.get('Location') -        streams_url = station_info.get('StreamUrl') +        content_id = self._match_id(url) + +        content_info = self._download_json( + 
           self._API_BASE_URL + self._API_URL_QUERY % content_id, +            content_id, note='Downloading JSON metadata') + +        title = content_info['Title'] +        thumbnail = content_info.get('Logo') +        location = content_info.get('Location') +        streams_url = content_info.get('StreamUrl')          if not streams_url: -            raise ExtractorError('No downloadable streams found', -                                 expected=True) +            raise ExtractorError('No downloadable streams found', expected=True) +        if not streams_url.startswith('http://'): +            streams_url = compat_urlparse.urljoin(url, streams_url) +          stream_data = self._download_webpage( -            streams_url, station_id, note='Downloading stream data') +            streams_url, content_id, note='Downloading stream data')          streams = json.loads(self._search_regex(              r'\((.*)\);', stream_data, 'stream info'))['Streams'] @@ -97,10 +56,122 @@ class TuneInIE(InfoExtractor):          self._sort_formats(formats)          return { -            'id': station_id, +            'id': content_id,              'title': title,              'formats': formats,              'thumbnail': thumbnail,              'location': location,              'is_live': is_live,          } + + +class TuneInClipIE(TuneInBaseIE): +    IE_NAME = 'tunein:clip' +    _VALID_URL = r'https?://(?:www\.)?tunein\.com/station/.*?audioClipId\=(?P<id>\d+)' +    _API_URL_QUERY = '?tuneType=AudioClip&audioclipId=%s' + +    _TESTS = [ +        { +            'url': 'http://tunein.com/station/?stationId=246119&audioClipId=816', +            'md5': '99f00d772db70efc804385c6b47f4e77', +            'info_dict': { +                'id': '816', +                'title': '32m', +                'ext': 'mp3', +            }, +        }, +    ] + + +class TuneInStationIE(TuneInBaseIE): +    IE_NAME = 'tunein:station' +    _VALID_URL = 
r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-s|station/.*?StationId\=)(?P<id>\d+)' +    _API_URL_QUERY = '?tuneType=Station&stationId=%s' + +    @classmethod +    def suitable(cls, url): +        return False if TuneInClipIE.suitable(url) else super(TuneInStationIE, cls).suitable(url) + +    _TESTS = [ +        { +            'url': 'http://tunein.com/radio/Jazz24-885-s34682/', +            'info_dict': { +                'id': '34682', +                'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2', +                'ext': 'mp3', +                'location': 'Tacoma, WA', +            }, +            'params': { +                'skip_download': True,  # live stream +            }, +        }, +    ] + + +class TuneInProgramIE(TuneInBaseIE): +    IE_NAME = 'tunein:program' +    _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-p|program/.*?ProgramId\=)(?P<id>\d+)' +    _API_URL_QUERY = '?tuneType=Program&programId=%s' + +    _TESTS = [ +        { +            'url': 'http://tunein.com/radio/Jazz-24-p2506/', +            'info_dict': { +                'id': '2506', +                'title': 'Jazz 24 on 91.3 WUKY-HD3', +                'ext': 'mp3', +                'location': 'Lexington, KY', +            }, +            'params': { +                'skip_download': True,  # live stream +            }, +        }, +    ] + + +class TuneInTopicIE(TuneInBaseIE): +    IE_NAME = 'tunein:topic' +    _VALID_URL = r'https?://(?:www\.)?tunein\.com/topic/.*?TopicId\=(?P<id>\d+)' +    _API_URL_QUERY = '?tuneType=Topic&topicId=%s' + +    _TESTS = [ +        { +            'url': 'http://tunein.com/topic/?TopicId=101830576', +            'md5': 'c31a39e6f988d188252eae7af0ef09c9', +            'info_dict': { +                'id': '101830576', +                'title': 'Votez pour moi du 29 octobre 2015 (29/10/15)', +                'ext': 'mp3', +                'location': 'Belgium', +            }, +        }, +    ] + + +class TuneInShortenerIE(InfoExtractor): +   
 IE_NAME = 'tunein:shortener' +    IE_DESC = False  # Do not list +    _VALID_URL = r'https?://tun\.in/(?P<id>[A-Za-z0-9]+)' + +    _TEST = { +        # test redirection +        'url': 'http://tun.in/ser7s', +        'info_dict': { +            'id': '34682', +            'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2', +            'ext': 'mp3', +            'location': 'Tacoma, WA', +        }, +        'params': { +            'skip_download': True,  # live stream +        }, +    } + +    def _real_extract(self, url): +        redirect_id = self._match_id(url) +        # The server doesn't support HEAD requests +        urlh = self._request_webpage( +            url, redirect_id, note='Downloading redirect page') +        url = urlh.geturl() +        self.to_screen('Following redirect: %s' % url) +        return self.url_result(url) diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index c1ee1decc..e03e2dbaa 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -5,6 +5,8 @@ from .common import InfoExtractor  from ..utils import (      parse_iso8601,      int_or_none, +    xpath_attr, +    xpath_element,  ) @@ -15,7 +17,7 @@ class TwentyFourVideoIE(InfoExtractor):      _TESTS = [          {              'url': 'http://www.24video.net/video/view/1044982', -            'md5': 'd041af8b5b4246ea466226a0d6693345', +            'md5': 'e09fc0901d9eaeedac872f154931deeb',              'info_dict': {                  'id': '1044982',                  'ext': 'mp4', @@ -64,33 +66,24 @@ class TwentyFourVideoIE(InfoExtractor):              r'<div class="comments-title" id="comments-count">(\d+) комментари',              webpage, 'comment count', fatal=False)) -        formats = [] +        # Sets some cookies +        self._download_xml( +            r'http://www.24video.net/video/xml/%s?mode=init' % video_id, +            video_id, 'Downloading init XML') -        pc_video = 
self._download_xml( +        video_xml = self._download_xml(              'http://www.24video.net/video/xml/%s?mode=play' % video_id, -            video_id, 'Downloading PC video URL').find('.//video') +            video_id, 'Downloading video XML') -        formats.append({ -            'url': pc_video.attrib['url'], -            'format_id': 'pc', -            'quality': 1, -        }) +        video = xpath_element(video_xml, './/video', 'video', fatal=True) -        like_count = int_or_none(pc_video.get('ratingPlus')) -        dislike_count = int_or_none(pc_video.get('ratingMinus')) -        age_limit = 18 if pc_video.get('adult') == 'true' else 0 +        formats = [{ +            'url': xpath_attr(video, '', 'url', 'video URL', fatal=True), +        }] -        mobile_video = self._download_xml( -            'http://www.24video.net/video/xml/%s' % video_id, -            video_id, 'Downloading mobile video URL').find('.//video') - -        formats.append({ -            'url': mobile_video.attrib['url'], -            'format_id': 'mobile', -            'quality': 0, -        }) - -        self._sort_formats(formats) +        like_count = int_or_none(video.get('ratingPlus')) +        dislike_count = int_or_none(video.get('ratingMinus')) +        age_limit = 18 if video.get('adult') == 'true' else 0          return {              'id': video_id, diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 571289421..02dfd36f4 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -3,7 +3,10 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..compat import compat_etree_fromstring +from ..compat import ( +    compat_etree_fromstring, +    compat_urlparse, +)  from ..utils import (      ExtractorError,      int_or_none, @@ -67,6 +70,17 @@ class VevoIE(InfoExtractor):          'params': {              'skip_download': 'true',          } +    }, { +        'note': 'No video_info', 
+        'url': 'http://www.vevo.com/watch/k-camp-1/Till-I-Die/USUV71503000', +        'md5': '8b83cc492d72fc9cf74a02acee7dc1b0', +        'info_dict': { +            'id': 'USUV71503000', +            'ext': 'mp4', +            'title': 'Till I Die - K Camp ft. T.I.', +            'duration': 193, +        }, +        'expected_warnings': ['Unable to download SMIL file'],      }]      _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/' @@ -81,11 +95,17 @@ class VevoIE(InfoExtractor):          if webpage is False:              self._oauth_token = None          else: +            if 'THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION' in webpage: +                raise ExtractorError('%s said: This page is currently unavailable in your region.' % self.IE_NAME, expected=True) +              self._oauth_token = self._search_regex(                  r'access_token":\s*"([^"]+)"',                  webpage, 'access token', fatal=False)      def _formats_from_json(self, video_info): +        if not video_info: +            return [] +          last_version = {'version': -1}          for version in video_info['videoVersions']:              # These are the HTTP downloads, other types are for different manifests @@ -110,9 +130,8 @@ class VevoIE(InfoExtractor):              })          return formats -    def _formats_from_smil(self, smil_xml): +    def _formats_from_smil(self, smil_doc):          formats = [] -        smil_doc = compat_etree_fromstring(smil_xml.encode('utf-8'))          els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video')          for el in els:              src = el.attrib['src'] @@ -145,14 +164,14 @@ class VevoIE(InfoExtractor):              })          return formats -    def _download_api_formats(self, video_id): +    def _download_api_formats(self, video_id, video_url):          if not self._oauth_token:              self._downloader.report_warning(                  'No oauth token available, skipping API HLS download')              return 
[] -        api_url = 'https://apiv2.vevo.com/video/%s/streams/hls?token=%s' % ( -            video_id, self._oauth_token) +        api_url = compat_urlparse.urljoin(video_url, '//apiv2.vevo.com/video/%s/streams/hls?token=%s' % ( +            video_id, self._oauth_token))          api_data = self._download_json(              api_url, video_id,              note='Downloading HLS formats', @@ -166,18 +185,26 @@ class VevoIE(InfoExtractor):              preference=0)      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url) + +        webpage = None          json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id          response = self._download_json(json_url, video_id) -        video_info = response['video'] +        video_info = response['video'] or {} -        if not video_info: +        if not video_info and response.get('statusCode') != 909:              if 'statusMessage' in response:                  raise ExtractorError('%s said: %s' % (self.IE_NAME, response['statusMessage']), expected=True)              raise ExtractorError('Unable to extract videos') +        if not video_info: +            if url.startswith('vevo:'): +                raise ExtractorError('Please specify full Vevo URL for downloading', expected=True) +            webpage = self._download_webpage(url, video_id) + +        title = video_info.get('title') or self._og_search_title(webpage) +          formats = self._formats_from_json(video_info)          is_explicit = video_info.get('isExplicit') @@ -189,11 +216,11 @@ class VevoIE(InfoExtractor):              age_limit = None          # Download via HLS API -        formats.extend(self._download_api_formats(video_id)) +        formats.extend(self._download_api_formats(video_id, url))          # Download SMIL          smil_blocks = sorted(( -            f for f in video_info['videoVersions'] +            f for 
f in video_info.get('videoVersions', [])              if f['sourceType'] == 13),              key=lambda f: f['version'])          smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % ( @@ -205,23 +232,26 @@ class VevoIE(InfoExtractor):              if smil_url_m is not None:                  smil_url = smil_url_m          if smil_url: -            smil_xml = self._download_webpage( -                smil_url, video_id, 'Downloading SMIL info', fatal=False) -            if smil_xml: -                formats.extend(self._formats_from_smil(smil_xml)) +            smil_doc = self._download_smil(smil_url, video_id, fatal=False) +            if smil_doc: +                formats.extend(self._formats_from_smil(smil_doc))          self._sort_formats(formats) -        timestamp_ms = int_or_none(self._search_regex( +        timestamp = int_or_none(self._search_regex(              r'/Date\((\d+)\)/', -            video_info['launchDate'], 'launch date', fatal=False)) +            video_info['launchDate'], 'launch date', fatal=False), +            scale=1000) if video_info else None + +        duration = video_info.get('duration') or int_or_none( +            self._html_search_meta('video:duration', webpage))          return {              'id': video_id, -            'title': video_info['title'], +            'title': title,              'formats': formats, -            'thumbnail': video_info['imageUrl'], -            'timestamp': timestamp_ms // 1000, -            'uploader': video_info['mainArtists'][0]['artistName'], -            'duration': video_info['duration'], +            'thumbnail': video_info.get('imageUrl'), +            'timestamp': timestamp, +            'uploader': video_info['mainArtists'][0]['artistName'] if video_info else None, +            'duration': duration,              'age_limit': age_limit,          } diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index f38a72fde..129668a99 100644 --- a/youtube_dl/extractor/vgtv.py +++ 
b/youtube_dl/extractor/vgtv.py @@ -4,26 +4,48 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from .xstream import XstreamIE  from ..utils import (      ExtractorError,      float_or_none,  ) -class VGTVIE(InfoExtractor): -    IE_DESC = 'VGTV and BTTV' +class VGTVIE(XstreamIE): +    IE_DESC = 'VGTV, BTTV, FTV, Aftenposten and Aftonbladet' + +    _HOST_TO_APPNAME = { +        'vgtv.no': 'vgtv', +        'bt.no/tv': 'bttv', +        'aftenbladet.no/tv': 'satv', +        'fvn.no/fvntv': 'fvntv', +        'aftenposten.no/webtv': 'aptv', +    } + +    _APP_NAME_TO_VENDOR = { +        'vgtv': 'vgtv', +        'bttv': 'bt', +        'satv': 'sa', +        'fvntv': 'fvn', +        'aptv': 'ap', +    } +      _VALID_URL = r'''(?x) -                    (?: -                        vgtv:| -                        http://(?:www\.)? +                    (?:https?://(?:www\.)? +                    (?P<host> +                        %s                      ) -                    (?P<host>vgtv|bt) +                    /                      (?: -                        :| -                        \.no/(?:tv/)?\#!/(?:video|live)/ -                    ) -                    (?P<id>[0-9]+) -                    ''' +                        \#!/(?:video|live)/| +                        embed?.*id= +                    )| +                    (?P<appname> +                        %s +                    ):) +                    (?P<id>\d+) +                    ''' % ('|'.join(_HOST_TO_APPNAME.keys()), '|'.join(_APP_NAME_TO_VENDOR.keys())) +      _TESTS = [          {              # streamType: vod @@ -59,25 +81,37 @@ class VGTVIE(InfoExtractor):                  # m3u8 download                  'skip_download': True,              }, +            'skip': 'Video is no longer available',          },          { -            # streamType: live +            # streamType: wasLive              'url': 
'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla', +            'md5': '458f4841239dab414343b50e5af8869c',              'info_dict': {                  'id': '113063',                  'ext': 'flv', -                'title': 're:^DIREKTE: V75 fra Solvalla [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', +                'title': 'V75 fra Solvalla 30.05.15',                  'description': 'md5:b3743425765355855f88e096acc93231',                  'thumbnail': 're:^https?://.*\.jpg', -                'duration': 0, +                'duration': 25966,                  'timestamp': 1432975582,                  'upload_date': '20150530',                  'view_count': int,              }, -            'params': { -                # m3u8 download -                'skip_download': True, -            }, +        }, +        { +            'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', +            'md5': 'fd828cd29774a729bf4d4425fe192972', +            'info_dict': { +                'id': '21039', +                'ext': 'mov', +                'title': 'TRAILER: «SWEATSHOP» - I can´t take any more', +                'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238', +                'duration': 66, +                'timestamp': 1417002452, +                'upload_date': '20141126', +                'view_count': int, +            }          },          {              'url': 'http://www.bt.no/tv/#!/video/100250/norling-dette-er-forskjellen-paa-1-divisjon-og-eliteserien', @@ -89,21 +123,27 @@ class VGTVIE(InfoExtractor):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id')          host = mobj.group('host') - -        HOST_WEBSITES = { -            'vgtv': 'vgtv', -            'bt': 'bttv', -        } +        appname = self._HOST_TO_APPNAME[host] if host else mobj.group('appname') +        vendor = self._APP_NAME_TO_VENDOR[appname]          data = self._download_json(            
  'http://svp.vg.no/svp/api/v1/%s/assets/%s?appName=%s-website' -            % (host, video_id, HOST_WEBSITES[host]), +            % (vendor, video_id, appname),              video_id, 'Downloading media JSON')          if data.get('status') == 'inactive':              raise ExtractorError(                  'Video %s is no longer available' % video_id, expected=True) +        info = { +            'formats': [], +        } +        if len(video_id) == 5: +            if appname == 'bttv': +                info = self._extract_video_info('btno', video_id) +            elif appname == 'aptv': +                info = self._extract_video_info('ap', video_id) +          streams = data['streamUrls']          stream_type = data.get('streamType') @@ -111,48 +151,56 @@ class VGTVIE(InfoExtractor):          hls_url = streams.get('hls')          if hls_url: -            formats.extend(self._extract_m3u8_formats( -                hls_url, video_id, 'mp4', m3u8_id='hls')) +            m3u8_formats = self._extract_m3u8_formats( +                hls_url, video_id, 'mp4', m3u8_id='hls', fatal=False) +            if m3u8_formats: +                formats.extend(m3u8_formats)          hds_url = streams.get('hds') -        # wasLive hds are always 404 -        if hds_url and stream_type != 'wasLive': -            formats.extend(self._extract_f4m_formats( -                hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', -                video_id, f4m_id='hds')) +        if hds_url: +            hdcore_sign = 'hdcore=3.7.0' +            f4m_formats = self._extract_f4m_formats( +                hds_url + '?%s' % hdcore_sign, video_id, f4m_id='hds', fatal=False) +            if f4m_formats: +                for entry in f4m_formats: +                    # URLs without the extra param induce an 404 error +                    entry.update({'extra_param_to_segment_url': hdcore_sign}) +                    formats.append(entry) +        mp4_urls = streams.get('pseudostreaming') or []        
  mp4_url = streams.get('mp4')          if mp4_url: -            _url = hls_url or hds_url -            MP4_URL_TEMPLATE = '%s/%%s.%s' % (mp4_url.rpartition('/')[0], mp4_url.rpartition('.')[-1]) -            for mp4_format in _url.split(','): -                m = re.search('(?P<width>\d+)_(?P<height>\d+)_(?P<vbr>\d+)', mp4_format) -                if not m: -                    continue -                width = int(m.group('width')) -                height = int(m.group('height')) -                vbr = int(m.group('vbr')) -                formats.append({ -                    'url': MP4_URL_TEMPLATE % mp4_format, -                    'format_id': 'mp4-%s' % vbr, -                    'width': width, -                    'height': height, -                    'vbr': vbr, -                    'preference': 1, +            mp4_urls.append(mp4_url) +        for mp4_url in mp4_urls: +            format_info = { +                'url': mp4_url, +            } +            mobj = re.search('(\d+)_(\d+)_(\d+)', mp4_url) +            if mobj: +                tbr = int(mobj.group(3)) +                format_info.update({ +                    'width': int(mobj.group(1)), +                    'height': int(mobj.group(2)), +                    'tbr': tbr, +                    'format_id': 'mp4-%s' % tbr,                  }) -        self._sort_formats(formats) +            formats.append(format_info) + +        info['formats'].extend(formats) -        return { +        self._sort_formats(info['formats']) + +        info.update({              'id': video_id, -            'title': self._live_title(data['title']), +            'title': self._live_title(data['title']) if stream_type == 'live' else data['title'],              'description': data['description'],              'thumbnail': data['images']['main'] + '?t[]=900x506q80',              'timestamp': data['published'],              'duration': float_or_none(data['duration'], 1000),              'view_count': data['displays'], 
-            'formats': formats,              'is_live': True if stream_type == 'live' else False, -        } +        }) +        return info  class BTArticleIE(InfoExtractor): @@ -161,7 +209,7 @@ class BTArticleIE(InfoExtractor):      _VALID_URL = 'http://(?:www\.)?bt\.no/(?:[^/]+/)+(?P<id>[^/]+)-\d+\.html'      _TEST = {          'url': 'http://www.bt.no/nyheter/lokalt/Kjemper-for-internatet-1788214.html', -        'md5': 'd055e8ee918ef2844745fcfd1a4175fb', +        'md5': '2acbe8ad129b3469d5ae51b1158878df',          'info_dict': {              'id': '23199',              'ext': 'mp4', @@ -178,15 +226,15 @@ class BTArticleIE(InfoExtractor):      def _real_extract(self, url):          webpage = self._download_webpage(url, self._match_id(url))          video_id = self._search_regex( -            r'SVP\.Player\.load\(\s*(\d+)', webpage, 'video id') -        return self.url_result('vgtv:bt:%s' % video_id, 'VGTV') +            r'<video[^>]+data-id="(\d+)"', webpage, 'video id') +        return self.url_result('bttv:%s' % video_id, 'VGTV')  class BTVestlendingenIE(InfoExtractor):      IE_NAME = 'bt:vestlendingen'      IE_DESC = 'Bergens Tidende - Vestlendingen'      _VALID_URL = 'http://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P<id>\d+)' -    _TEST = { +    _TESTS = [{          'url': 'http://www.bt.no/spesial/vestlendingen/#!/86588',          'md5': 'd7d17e3337dc80de6d3a540aefbe441b',          'info_dict': { @@ -197,7 +245,19 @@ class BTVestlendingenIE(InfoExtractor):              'timestamp': 1430473209,              'upload_date': '20150501',          }, -    } +        'skip': '404 Error', +    }, { +        'url': 'http://www.bt.no/spesial/vestlendingen/#!/86255', +        'md5': 'a2893f8632e96389f4bdf36aa9463ceb', +        'info_dict': { +            'id': '86255', +            'ext': 'mov', +            'title': 'Du må tåle å fryse og være sulten', +            'description': 'md5:b8046f4d022d5830ddab04865791d063', +            'upload_date': '20150321', +   
         'timestamp': 1426942023, +        }, +    }]      def _real_extract(self, url): -        return self.url_result('xstream:btno:%s' % self._match_id(url), 'Xstream') +        return self.url_result('bttv:%s' % self._match_id(url), 'VGTV') diff --git a/youtube_dl/extractor/videoweed.py b/youtube_dl/extractor/videoweed.py deleted file mode 100644 index ca2e50935..000000000 --- a/youtube_dl/extractor/videoweed.py +++ /dev/null @@ -1,26 +0,0 @@ -from __future__ import unicode_literals - -from .novamov import NovaMovIE - - -class VideoWeedIE(NovaMovIE): -    IE_NAME = 'videoweed' -    IE_DESC = 'VideoWeed' - -    _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'videoweed\.(?:es|com)'} - -    _HOST = 'www.videoweed.es' - -    _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' -    _TITLE_REGEX = r'<h1 class="text_shadow">([^<]+)</h1>' - -    _TEST = { -        'url': 'http://www.videoweed.es/file/b42178afbea14', -        'md5': 'abd31a2132947262c50429e1d16c1bfd', -        'info_dict': { -            'id': 'b42178afbea14', -            'ext': 'flv', -            'title': 'optical illusion  dissapeared image magic illusion', -            'description': '' -        }, -    } diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index a63c23617..9a1c377a4 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -30,6 +30,12 @@ class VikiBaseIE(InfoExtractor):      _token = None +    _ERRORS = { +        'geo': 'Sorry, this content is not available in your region.', +        'upcoming': 'Sorry, this content is not yet available.', +        # 'paywall': 'paywall', +    } +      def _prepare_call(self, path, timestamp=None, post_data=None):          path += '?' if '?' 
not in path else '&'          if not timestamp: @@ -67,6 +73,12 @@ class VikiBaseIE(InfoExtractor):              '%s returned error: %s' % (self.IE_NAME, error),              expected=True) +    def _check_errors(self, data): +        for reason, status in data.get('blocking', {}).items(): +            if status and reason in self._ERRORS: +                raise ExtractorError('%s said: %s' % ( +                    self.IE_NAME, self._ERRORS[reason]), expected=True) +      def _real_initialize(self):          self._login() @@ -193,6 +205,7 @@ class VikiIE(VikiBaseIE):              'timestamp': 1321985454,              'description': 'md5:44b1e46619df3a072294645c770cef36',              'title': 'Love In Magic', +            'age_limit': 13,          },      }] @@ -202,6 +215,8 @@ class VikiIE(VikiBaseIE):          video = self._call_api(              'videos/%s.json' % video_id, video_id, 'Downloading video JSON') +        self._check_errors(video) +          title = self.dict_selection(video.get('titles', {}), 'en')          if not title:              title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id @@ -262,8 +277,11 @@ class VikiIE(VikiBaseIE):                  r'^(\d+)[pP]$', format_id, 'height', default=None))              for protocol, format_dict in stream_dict.items():                  if format_id == 'm3u8': -                    formats = self._extract_m3u8_formats( -                        format_dict['url'], video_id, 'mp4', m3u8_id='m3u8-%s' % protocol) +                    m3u8_formats = self._extract_m3u8_formats( +                        format_dict['url'], video_id, 'mp4', 'm3u8_native', +                        m3u8_id='m3u8-%s' % protocol, fatal=False) +                    if m3u8_formats: +                        formats.extend(m3u8_formats)                  else:                      formats.append({                          'url': format_dict['url'], @@ -315,6 +333,8 @@ class 
VikiChannelIE(VikiBaseIE):              'containers/%s.json' % channel_id, channel_id,              'Downloading channel JSON') +        self._check_errors(channel) +          title = self.dict_selection(channel['titles'], 'en')          description = self.dict_selection(channel['descriptions'], 'en') diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index f392ccf1c..ce08e6955 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -23,6 +23,7 @@ from ..utils import (      unsmuggle_url,      urlencode_postdata,      unescapeHTML, +    parse_filesize,  ) @@ -185,6 +186,20 @@ class VimeoIE(VimeoBaseInfoExtractor):              },          },          { +            # contains original format +            'url': 'https://vimeo.com/33951933', +            'md5': '53c688fa95a55bf4b7293d37a89c5c53', +            'info_dict': { +                'id': '33951933', +                'ext': 'mp4', +                'title': 'FOX CLASSICS - Forever Classic ID - A Full Minute', +                'uploader': 'The DMCI', +                'uploader_id': 'dmci', +                'upload_date': '20111220', +                'description': 'md5:ae23671e82d05415868f7ad1aec21147', +            }, +        }, +        {              'url': 'https://vimeo.com/109815029',              'note': 'Video not completely processed, "failed" seed status',              'only_matching': True, @@ -392,6 +407,21 @@ class VimeoIE(VimeoBaseInfoExtractor):              comment_count = None          formats = [] +        download_request = sanitized_Request('https://vimeo.com/%s?action=load_download_config' % video_id, headers={ +            'X-Requested-With': 'XMLHttpRequest'}) +        download_data = self._download_json(download_request, video_id, fatal=False) +        if download_data: +            source_file = download_data.get('source_file') +            if source_file and not source_file.get('is_cold') and not source_file.get('is_defrosting'): +    
            formats.append({ +                    'url': source_file['download_url'], +                    'ext': source_file['extension'].lower(), +                    'width': int_or_none(source_file.get('width')), +                    'height': int_or_none(source_file.get('height')), +                    'filesize': parse_filesize(source_file.get('size')), +                    'format_id': source_file.get('public_name', 'Original'), +                    'preference': 1, +                })          config_files = config['video'].get('files') or config['request'].get('files', {})          for f in config_files.get('progressive', []):              video_url = f.get('url') @@ -408,12 +438,12 @@ class VimeoIE(VimeoBaseInfoExtractor):          m3u8_url = config_files.get('hls', {}).get('url')          if m3u8_url:              m3u8_formats = self._extract_m3u8_formats( -                m3u8_url, video_id, 'mp4', 'm3u8_native', 0, 'hls', fatal=False) +                m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)              if m3u8_formats:                  formats.extend(m3u8_formats)          # Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps          # at the same time without actual units specified. This lead to wrong sorting. 
-        self._sort_formats(formats, field_preference=('height', 'width', 'fps', 'format_id')) +        self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'format_id'))          subtitles = {}          text_tracks = config['request'].get('text_tracks') diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index d99a42a9f..90557fa61 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -18,6 +18,7 @@ from ..utils import (      unified_strdate,  )  from .vimeo import VimeoIE +from .pladform import PladformIE  class VKIE(InfoExtractor): @@ -164,6 +165,11 @@ class VKIE(InfoExtractor):              # vk wrapper              'url': 'http://www.biqle.ru/watch/847655_160197695',              'only_matching': True, +        }, +        { +            # pladform embed +            'url': 'https://vk.com/video-76116461_171554880', +            'only_matching': True,          }      ] @@ -254,10 +260,13 @@ class VKIE(InfoExtractor):          if vimeo_url is not None:              return self.url_result(vimeo_url) +        pladform_url = PladformIE._extract_url(info_page) +        if pladform_url: +            return self.url_result(pladform_url) +          m_rutube = re.search(              r'\ssrc="((?:https?:)?//rutube\.ru\\?/video\\?/embed(?:.*?))\\?"', info_page)          if m_rutube is not None: -            self.to_screen('rutube video detected')              rutube_url = self._proto_relative_url(                  m_rutube.group(1).replace('\\', ''))              return self.url_result(rutube_url) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index b46802306..ef096cbd2 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -10,8 +10,8 @@ from ..compat import (      compat_urlparse,  )  from ..utils import ( -    determine_ext,      unified_strdate, +    qualities,  ) @@ -33,6 +33,7 @@ class WDRIE(InfoExtractor):              'params': {                  
'skip_download': True,              }, +            'skip': 'Page Not Found',          },          {              'url': 'http://www1.wdr.de/themen/av/videomargaspiegelisttot101-videoplayer.html', @@ -47,6 +48,7 @@ class WDRIE(InfoExtractor):              'params': {                  'skip_download': True,              }, +            'skip': 'Page Not Found',          },          {              'url': 'http://www1.wdr.de/themen/kultur/audioerlebtegeschichtenmargaspiegel100-audioplayer.html', @@ -71,6 +73,7 @@ class WDRIE(InfoExtractor):                  'upload_date': '20140717',                  'is_live': False              }, +            'skip': 'Page Not Found',          },          {              'url': 'http://www1.wdr.de/mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100.html', @@ -83,10 +86,10 @@ class WDRIE(InfoExtractor):              'url': 'http://www1.wdr.de/mediathek/video/livestream/index.html',              'info_dict': {                  'id': 'mdb-103364', -                'title': 're:^WDR Fernsehen [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', +                'title': 're:^WDR Fernsehen Live [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',                  'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9',                  'ext': 'flv', -                'upload_date': '20150212', +                'upload_date': '20150101',                  'is_live': True              },              'params': { @@ -150,25 +153,52 @@ class WDRIE(InfoExtractor):          if upload_date:              upload_date = unified_strdate(upload_date) +        formats = [] +        preference = qualities(['S', 'M', 'L', 'XL']) +          if video_url.endswith('.f4m'): -            video_url += '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18' -            ext = 'flv' +            f4m_formats = self._extract_f4m_formats(video_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', page_id, f4m_id='hds', fatal=False) +            if f4m_formats: +                
formats.extend(f4m_formats)          elif video_url.endswith('.smil'): -            fmt = self._extract_smil_formats(video_url, page_id)[0] -            video_url = fmt['url'] -            sep = '&' if '?' in video_url else '?' -            video_url += sep -            video_url += 'hdcore=3.3.0&plugin=aasp-3.3.0.99.43' -            ext = fmt['ext'] +            smil_formats = self._extract_smil_formats(video_url, page_id, False, { +                'hdcore': '3.3.0', +                'plugin': 'aasp-3.3.0.99.43', +            }) +            if smil_formats: +                formats.extend(smil_formats)          else: -            ext = determine_ext(video_url) +            formats.append({ +                'url': video_url, +                'http_headers': { +                    'User-Agent': 'mobile', +                }, +            }) + +        m3u8_url = self._search_regex(r'rel="adaptiv"[^>]+href="([^"]+)"', webpage, 'm3u8 url', default=None) +        if m3u8_url: +            m3u8_formats = self._extract_m3u8_formats(m3u8_url, page_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) +            if m3u8_formats: +                formats.extend(m3u8_formats) + +        direct_urls = re.findall(r'rel="web(S|M|L|XL)"[^>]+href="([^"]+)"', webpage) +        if direct_urls: +            for quality, video_url in direct_urls: +                formats.append({ +                    'url': video_url, +                    'preference': preference(quality), +                    'http_headers': { +                        'User-Agent': 'mobile', +                    }, +                }) + +        self._sort_formats(formats)          description = self._html_search_meta('Description', webpage, 'description')          return {              'id': page_id, -            'url': video_url, -            'ext': ext, +            'formats': formats,              'title': title,              'description': description,              'thumbnail': thumbnail, diff --git 
a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index e4f50e64c..041ff6c55 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -5,7 +5,7 @@ from .youtube import YoutubeIE  class WimpIE(InfoExtractor): -    _VALID_URL = r'http://(?:www\.)?wimp\.com/(?P<id>[^/]+)/' +    _VALID_URL = r'http://(?:www\.)?wimp\.com/(?P<id>[^/]+)'      _TESTS = [{          'url': 'http://www.wimp.com/maruexhausted/',          'md5': 'ee21217ffd66d058e8b16be340b74883', @@ -28,18 +28,23 @@ class WimpIE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url) +          webpage = self._download_webpage(url, video_id) -        video_url = self._search_regex( -            [r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", r"videoId\s*:\s*[\"']([^\"']+)[\"']"], -            webpage, 'video URL') -        if YoutubeIE.suitable(video_url): -            self.to_screen('Found YouTube video') + +        youtube_id = self._search_regex( +            r"videoId\s*:\s*[\"']([0-9A-Za-z_-]{11})[\"']", +            webpage, 'video URL', default=None) +        if youtube_id:              return {                  '_type': 'url', -                'url': video_url, +                'url': youtube_id,                  'ie_key': YoutubeIE.ie_key(),              } +        video_url = self._search_regex( +            r'<video[^>]+>\s*<source[^>]+src=(["\'])(?P<url>.+?)\1', +            webpage, 'video URL', group='url') +          return {              'id': video_id,              'url': video_url, diff --git a/youtube_dl/extractor/xstream.py b/youtube_dl/extractor/xstream.py index 71584c291..76c91bd92 100644 --- a/youtube_dl/extractor/xstream.py +++ b/youtube_dl/extractor/xstream.py @@ -42,11 +42,7 @@ class XstreamIE(InfoExtractor):          'only_matching': True,      }] -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        partner_id = mobj.group('partner_id') -        video_id = mobj.group('id') - +    
def _extract_video_info(self, partner_id, video_id):          data = self._download_xml(              'http://frontend.xstream.dk/%s/feed/video/?platform=web&id=%s'              % (partner_id, video_id), @@ -97,6 +93,7 @@ class XstreamIE(InfoExtractor):              formats.append({                  'url': link.get('href'),                  'format_id': link.get('rel'), +                'preference': 1,              })          thumbnails = [{ @@ -113,3 +110,10 @@ class XstreamIE(InfoExtractor):              'formats': formats,              'thumbnails': thumbnails,          } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        partner_id = mobj.group('partner_id') +        video_id = mobj.group('id') + +        return self._extract_video_info(partner_id, video_id) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 69ecc837a..3a3432be8 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -25,8 +25,8 @@ class YoukuIE(InfoExtractor):      '''      _TESTS = [{ +        # MD5 is unstable          'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html', -        'md5': '5f3af4192eabacc4501508d54a8cabd7',          'info_dict': {              'id': 'XMTc1ODE5Njcy_part1',              'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.', @@ -42,6 +42,7 @@ class YoukuIE(InfoExtractor):              'title': '武媚娘传奇 85',          },          'playlist_count': 11, +        'skip': 'Available in China only',      }, {          'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html',          'info_dict': { @@ -49,7 +50,6 @@ class YoukuIE(InfoExtractor):              'title': '花千骨 04',          },          'playlist_count': 13, -        'skip': 'Available in China only',      }, {          'url': 'http://v.youku.com/v_show/id_XNjA1NzA2Njgw.html',          'note': 'Video protected with password', @@ -63,7 +63,7 @@ class YoukuIE(InfoExtractor):          },      }] -    def 
construct_video_urls(self, data1, data2): +    def construct_video_urls(self, data):          # get sid, token          def yk_t(s1, s2):              ls = list(range(256)) @@ -81,34 +81,24 @@ class YoukuIE(InfoExtractor):              return bytes(s)          sid, token = yk_t( -            b'becaf9be', base64.b64decode(data2['ep'].encode('ascii')) +            b'becaf9be', base64.b64decode(data['security']['encrypt_string'].encode('ascii'))          ).decode('ascii').split('_')          # get oip -        oip = data2['ip'] - -        # get fileid -        string_ls = list( -            'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890') -        shuffled_string_ls = [] -        seed = data1['seed'] -        N = len(string_ls) -        for ii in range(N): -            seed = (seed * 0xd3 + 0x754f) % 0x10000 -            idx = seed * len(string_ls) // 0x10000 -            shuffled_string_ls.append(string_ls[idx]) -            del string_ls[idx] +        oip = data['security']['ip']          fileid_dict = {} -        for format in data1['streamtypes']: -            streamfileid = [ -                int(i) for i in data1['streamfileids'][format].strip('*').split('*')] -            fileid = ''.join( -                [shuffled_string_ls[i] for i in streamfileid]) -            fileid_dict[format] = fileid[:8] + '%s' + fileid[10:] +        for stream in data['stream']: +            format = stream.get('stream_type') +            fileid = stream['stream_fileid'] +            fileid_dict[format] = fileid          def get_fileid(format, n): -            fileid = fileid_dict[format] % hex(int(n))[2:].upper().zfill(2) +            number = hex(int(str(n), 10))[2:].upper() +            if len(number) == 1: +                number = '0' + number +            streamfileids = fileid_dict[format] +            fileid = streamfileids[0:8] + number + streamfileids[10:]              return fileid          # get ep @@ -123,15 +113,15 @@ class YoukuIE(InfoExtractor): 
         # generate video_urls          video_urls_dict = {} -        for format in data1['streamtypes']: +        for stream in data['stream']: +            format = stream.get('stream_type')              video_urls = [] -            for dt in data1['segs'][format]: -                n = str(int(dt['no'])) +            for dt in stream['segs']: +                n = str(stream['segs'].index(dt))                  param = { -                    'K': dt['k'], +                    'K': dt['key'],                      'hd': self.get_hd(format),                      'myp': 0, -                    'ts': dt['seconds'],                      'ypp': 0,                      'ctype': 12,                      'ev': 1, @@ -142,7 +132,7 @@ class YoukuIE(InfoExtractor):                  video_url = \                      'http://k.youku.com/player/getFlvPath/' + \                      'sid/' + sid + \ -                    '_' + str(int(n) + 1).zfill(2) + \ +                    '_00' + \                      '/st/' + self.parse_ext_l(format) + \                      '/fileid/' + get_fileid(format, n) + '?' 
+ \                      compat_urllib_parse.urlencode(param) @@ -153,23 +143,31 @@ class YoukuIE(InfoExtractor):      def get_hd(self, fm):          hd_id_dict = { +            '3gp': '0', +            '3gphd': '1',              'flv': '0', +            'flvhd': '0',              'mp4': '1', +            'mp4hd': '1', +            'mp4hd2': '1', +            'mp4hd3': '1',              'hd2': '2',              'hd3': '3', -            '3gp': '0', -            '3gphd': '1'          }          return hd_id_dict[fm]      def parse_ext_l(self, fm):          ext_dict = { +            '3gp': 'flv', +            '3gphd': 'mp4',              'flv': 'flv', +            'flvhd': 'flv',              'mp4': 'mp4', +            'mp4hd': 'mp4', +            'mp4hd2': 'flv', +            'mp4hd3': 'flv',              'hd2': 'flv',              'hd3': 'flv', -            '3gp': 'flv', -            '3gphd': 'mp4'          }          return ext_dict[fm] @@ -178,9 +176,13 @@ class YoukuIE(InfoExtractor):              '3gp': 'h6',              '3gphd': 'h5',              'flv': 'h4', +            'flvhd': 'h4',              'mp4': 'h3', +            'mp4hd': 'h3', +            'mp4hd2': 'h4', +            'mp4hd3': 'h4',              'hd2': 'h2', -            'hd3': 'h1' +            'hd3': 'h1',          }          return _dict[fm] @@ -188,45 +190,46 @@ class YoukuIE(InfoExtractor):          video_id = self._match_id(url)          def retrieve_data(req_url, note): -            req = sanitized_Request(req_url) +            headers = { +                'Referer': req_url, +            } +            self._set_cookie('youku.com', 'xreferrer', 'http://www.youku.com') +            req = sanitized_Request(req_url, headers=headers)              cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')              if cn_verification_proxy:                  req.add_header('Ytdl-request-proxy', cn_verification_proxy)              raw_data = self._download_json(req, 
video_id, note=note) -            return raw_data['data'][0] + +            return raw_data['data']          video_password = self._downloader.params.get('videopassword', None)          # request basic data -        basic_data_url = 'http://v.youku.com/player/getPlayList/VideoIDS/%s' % video_id +        basic_data_url = "http://play.youku.com/play/get.json?vid=%s&ct=12" % video_id          if video_password: -            basic_data_url += '?password=%s' % video_password - -        data1 = retrieve_data( -            basic_data_url, -            'Downloading JSON metadata 1') -        data2 = retrieve_data( -            'http://v.youku.com/player/getPlayList/VideoIDS/%s/Pf/4/ctype/12/ev/1' % video_id, -            'Downloading JSON metadata 2') - -        error_code = data1.get('error_code') -        if error_code: -            error = data1.get('error') -            if error is not None and '因版权原因无法观看此视频' in error: +            basic_data_url += '&pwd=%s' % video_password + +        data = retrieve_data(basic_data_url, 'Downloading JSON metadata') + +        error = data.get('error') +        if error: +            error_note = error.get('note') +            if error_note is not None and '因版权原因无法观看此视频' in error_note:                  raise ExtractorError(                      'Youku said: Sorry, this video is available in China only', expected=True)              else: -                msg = 'Youku server reported error %i' % error_code -                if error is not None: -                    msg += ': ' + error +                msg = 'Youku server reported error %i' % error.get('code') +                if error_note is not None: +                    msg += ': ' + error_note                  raise ExtractorError(msg) -        title = data1['title'] +        # get video title +        title = data['video']['title']          # generate video_urls_dict -        video_urls_dict = self.construct_video_urls(data1, data2) +        video_urls_dict = 
self.construct_video_urls(data)          # construct info          entries = [{ @@ -235,10 +238,11 @@ class YoukuIE(InfoExtractor):              'formats': [],              # some formats are not available for all parts, we have to detect              # which one has all -        } for i in range(max(len(v) for v in data1['segs'].values()))] -        for fm in data1['streamtypes']: +        } for i in range(max(len(v.get('segs')) for v in data['stream']))] +        for stream in data['stream']: +            fm = stream.get('stream_type')              video_urls = video_urls_dict[fm] -            for video_url, seg, entry in zip(video_urls, data1['segs'][fm], entries): +            for video_url, seg, entry in zip(video_urls, stream['segs'], entries):                  entry['formats'].append({                      'url': video_url,                      'format_id': self.get_format_name(fm), diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9b39505ba..4aac2cc03 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -26,6 +26,7 @@ from ..compat import (  from ..utils import (      clean_html,      encode_dict, +    error_to_compat_str,      ExtractorError,      float_or_none,      get_element_by_attribute, @@ -33,6 +34,7 @@ from ..utils import (      int_or_none,      orderedSet,      parse_duration, +    remove_quotes,      remove_start,      sanitized_Request,      smuggle_url, @@ -395,12 +397,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'ext': 'mp4',                  'upload_date': '20120506',                  'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', +                'alt_title': 'I Love It (feat. 
Charli XCX)',                  'description': 'md5:782e8651347686cba06e58f71ab51773',                  'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',                           'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',                           'iconic ep', 'iconic', 'love', 'it'],                  'uploader': 'Icona Pop',                  'uploader_id': 'IconaPop', +                'creator': 'Icona Pop',              }          },          { @@ -411,9 +415,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'ext': 'mp4',                  'upload_date': '20130703',                  'title': 'Justin Timberlake - Tunnel Vision (Explicit)', +                'alt_title': 'Tunnel Vision',                  'description': 'md5:64249768eec3bc4276236606ea996373',                  'uploader': 'justintimberlakeVEVO',                  'uploader_id': 'justintimberlakeVEVO', +                'creator': 'Justin Timberlake',                  'age_limit': 18,              }          }, @@ -492,10 +498,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'id': 'nfWlot6h_JM',                  'ext': 'm4a',                  'title': 'Taylor Swift - Shake It Off', +                'alt_title': 'Shake It Off',                  'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',                  'uploader': 'TaylorSwiftVEVO',                  'uploader_id': 'TaylorSwiftVEVO',                  'upload_date': '20140818', +                'creator': 'Taylor Swift',              },              'params': {                  'youtube_include_dash_manifest': True, @@ -551,9 +559,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'ext': 'mp4',                  'upload_date': '20100430',                  'uploader_id': 'deadmau5', +                'creator': 'deadmau5',                  'description': 'md5:12c56784b8032162bb936a5f76d55360',                  
'uploader': 'deadmau5',                  'title': 'Deadmau5 - Some Chords (HD)', +                'alt_title': 'Some Chords',              },              'expected_warnings': [                  'DASH manifest missing', @@ -701,10 +711,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'id': 'lsguqyKfVQg',                  'ext': 'mp4',                  'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', +                'alt_title': 'Dark Walk',                  'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',                  'upload_date': '20151119',                  'uploader_id': 'IronSoulElf',                  'uploader': 'IronSoulElf', +                'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan',              },              'params': {                  'skip_download': True, @@ -892,7 +904,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,                  video_id, note=False)          except ExtractorError as err: -            self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err)) +            self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))              return {}          sub_lang_list = {} @@ -1308,6 +1320,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())          upload_date = unified_strdate(upload_date) +        m_music = re.search( +            r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) 
by (?P<creator>.+?)(?:\(.+?\))?</li', +            video_webpage) +        if m_music: +            video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) +            video_creator = clean_html(m_music.group('creator')) +        else: +            video_alt_title = video_creator = None +          m_cat_container = self._search_regex(              r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',              video_webpage, 'categories', default=None) @@ -1537,7 +1558,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              'uploader': video_uploader,              'uploader_id': video_uploader_id,              'upload_date': upload_date, +            'creator': video_creator,              'title': video_title, +            'alt_title': video_alt_title,              'thumbnail': video_thumbnail,              'description': video_description,              'categories': video_categories, @@ -1752,6 +1775,10 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):          },      }] +    @classmethod +    def suitable(cls, url): +        return False if YoutubePlaylistsIE.suitable(url) else super(YoutubeChannelIE, cls).suitable(url) +      def _real_extract(self, url):          channel_id = self._match_id(url) @@ -1825,10 +1852,10 @@ class YoutubeUserIE(YoutubeChannelIE):              return super(YoutubeUserIE, cls).suitable(url) -class YoutubeUserPlaylistsIE(YoutubePlaylistsBaseInfoExtractor): -    IE_DESC = 'YouTube.com user playlists' -    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/user/(?P<id>[^/]+)/playlists' -    IE_NAME = 'youtube:user:playlists' +class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): +    IE_DESC = 'YouTube.com user/channel playlists' +    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists' +    IE_NAME = 'youtube:playlists'      _TESTS = [{          'url': 'http://www.youtube.com/user/ThirstForScience/playlists', @@ -1845,6 +1872,13 @@ class 
YoutubeUserPlaylistsIE(YoutubePlaylistsBaseInfoExtractor):              'id': 'igorkle1',              'title': 'Игорь Клейнер',          }, +    }, { +        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists', +        'playlist_mincount': 17, +        'info_dict': { +            'id': 'UCiU1dHvZObB2iP6xkJ__Icw', +            'title': 'Chem Player', +        },      }] diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index a795f56b3..92c12bac6 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -10,106 +10,16 @@ from ..utils import (      unified_strdate,      OnDemandPagedList,      xpath_text, +    determine_ext, +    qualities, +    float_or_none,  ) -def extract_from_xml_url(ie, video_id, xml_url): -    doc = ie._download_xml( -        xml_url, video_id, -        note='Downloading video info', -        errnote='Failed to download video info') - -    title = doc.find('.//information/title').text -    description = xpath_text(doc, './/information/detail', 'description') -    duration = int_or_none(xpath_text(doc, './/details/lengthSec', 'duration')) -    uploader = xpath_text(doc, './/details/originChannelTitle', 'uploader') -    uploader_id = xpath_text(doc, './/details/originChannelId', 'uploader id') -    upload_date = unified_strdate(xpath_text(doc, './/details/airtime', 'upload date')) - -    def xml_to_format(fnode): -        video_url = fnode.find('url').text -        is_available = 'http://www.metafilegenerator' not in video_url - -        format_id = fnode.attrib['basetype'] -        format_m = re.match(r'''(?x) -            (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_ -            (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+) -        ''', format_id) - -        ext = format_m.group('container') -        proto = format_m.group('proto').lower() - -        quality = xpath_text(fnode, './quality', 'quality') -        abr = int_or_none(xpath_text(fnode, 
'./audioBitrate', 'abr'), 1000) -        vbr = int_or_none(xpath_text(fnode, './videoBitrate', 'vbr'), 1000) - -        width = int_or_none(xpath_text(fnode, './width', 'width')) -        height = int_or_none(xpath_text(fnode, './height', 'height')) - -        filesize = int_or_none(xpath_text(fnode, './filesize', 'filesize')) - -        format_note = '' -        if not format_note: -            format_note = None - -        return { -            'format_id': format_id + '-' + quality, -            'url': video_url, -            'ext': ext, -            'acodec': format_m.group('acodec'), -            'vcodec': format_m.group('vcodec'), -            'abr': abr, -            'vbr': vbr, -            'width': width, -            'height': height, -            'filesize': filesize, -            'format_note': format_note, -            'protocol': proto, -            '_available': is_available, -        } - -    def xml_to_thumbnails(fnode): -        thumbnails = [] -        for node in fnode: -            thumbnail_url = node.text -            if not thumbnail_url: -                continue -            thumbnail = { -                'url': thumbnail_url, -            } -            if 'key' in node.attrib: -                m = re.match('^([0-9]+)x([0-9]+)$', node.attrib['key']) -                if m: -                    thumbnail['width'] = int(m.group(1)) -                    thumbnail['height'] = int(m.group(2)) -            thumbnails.append(thumbnail) -        return thumbnails - -    thumbnails = xml_to_thumbnails(doc.findall('.//teaserimages/teaserimage')) - -    format_nodes = doc.findall('.//formitaeten/formitaet') -    formats = list(filter( -        lambda f: f['_available'], -        map(xml_to_format, format_nodes))) -    ie._sort_formats(formats) - -    return { -        'id': video_id, -        'title': title, -        'description': description, -        'duration': duration, -        'thumbnails': thumbnails, -        'uploader': uploader, -        
'uploader_id': uploader_id, -        'upload_date': upload_date, -        'formats': formats, -    } - -  class ZDFIE(InfoExtractor):      _VALID_URL = r'(?:zdf:|zdf:video:|https?://www\.zdf\.de/ZDFmediathek(?:#)?/(.*beitrag/(?:video/)?))(?P<id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?' -    _TEST = { +    _TESTS = [{          'url': 'http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt',          'info_dict': {              'id': '2037704', @@ -122,23 +32,183 @@ class ZDFIE(InfoExtractor):              'upload_date': '20131127',          },          'skip': 'Videos on ZDF.de are depublicised in short order', -    } +    }] + +    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): +        param_groups = {} +        for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)): +            group_id = param_group.attrib.get(self._xpath_ns('id', 'http://www.w3.org/XML/1998/namespace')) +            params = {} +            for param in param_group: +                params[param.get('name')] = param.get('value') +            param_groups[group_id] = params + +        formats = [] +        for video in smil.findall(self._xpath_ns('.//video', namespace)): +            src = video.get('src') +            if not src: +                continue +            bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) +            group_id = video.get('paramGroup') +            param_group = param_groups[group_id] +            for proto in param_group['protocols'].split(','): +                formats.append({ +                    'url': '%s://%s' % (proto, param_group['host']), +                    'app': param_group['app'], +                    'play_path': src, +                    'ext': 'flv', +                    'format_id': '%s-%d' % (proto, bitrate), +                    'tbr': bitrate, +                    'protocol': 
proto, +                }) +        self._sort_formats(formats) +        return formats + +    def extract_from_xml_url(self, video_id, xml_url): +        doc = self._download_xml( +            xml_url, video_id, +            note='Downloading video info', +            errnote='Failed to download video info') + +        title = doc.find('.//information/title').text +        description = xpath_text(doc, './/information/detail', 'description') +        duration = int_or_none(xpath_text(doc, './/details/lengthSec', 'duration')) +        uploader = xpath_text(doc, './/details/originChannelTitle', 'uploader') +        uploader_id = xpath_text(doc, './/details/originChannelId', 'uploader id') +        upload_date = unified_strdate(xpath_text(doc, './/details/airtime', 'upload date')) + +        def xml_to_thumbnails(fnode): +            thumbnails = [] +            for node in fnode: +                thumbnail_url = node.text +                if not thumbnail_url: +                    continue +                thumbnail = { +                    'url': thumbnail_url, +                } +                if 'key' in node.attrib: +                    m = re.match('^([0-9]+)x([0-9]+)$', node.attrib['key']) +                    if m: +                        thumbnail['width'] = int(m.group(1)) +                        thumbnail['height'] = int(m.group(2)) +                thumbnails.append(thumbnail) +            return thumbnails + +        thumbnails = xml_to_thumbnails(doc.findall('.//teaserimages/teaserimage')) + +        format_nodes = doc.findall('.//formitaeten/formitaet') +        quality = qualities(['veryhigh', 'high', 'med', 'low']) + +        def get_quality(elem): +            return quality(xpath_text(elem, 'quality')) +        format_nodes.sort(key=get_quality) +        format_ids = [] +        formats = [] +        for fnode in format_nodes: +            video_url = fnode.find('url').text +            is_available = 'http://www.metafilegenerator' not in 
video_url +            if not is_available: +                continue +            format_id = fnode.attrib['basetype'] +            quality = xpath_text(fnode, './quality', 'quality') +            format_m = re.match(r'''(?x) +                (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_ +                (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+) +            ''', format_id) + +            ext = determine_ext(video_url, None) or format_m.group('container') +            if ext not in ('smil', 'f4m', 'm3u8'): +                format_id = format_id + '-' + quality +            if format_id in format_ids: +                continue + +            if ext == 'meta': +                continue +            elif ext == 'smil': +                smil_formats = self._extract_smil_formats( +                    video_url, video_id, fatal=False) +                if smil_formats: +                    formats.extend(smil_formats) +            elif ext == 'm3u8': +                m3u8_formats = self._extract_m3u8_formats( +                    video_url, video_id, 'mp4', m3u8_id='hls', fatal=False) +                if m3u8_formats: +                    formats.extend(m3u8_formats) +            elif ext == 'f4m': +                f4m_formats = self._extract_f4m_formats( +                    video_url, video_id, f4m_id='hds', fatal=False) +                if f4m_formats: +                    formats.extend(f4m_formats) +            else: +                proto = format_m.group('proto').lower() + +                abr = int_or_none(xpath_text(fnode, './audioBitrate', 'abr'), 1000) +                vbr = int_or_none(xpath_text(fnode, './videoBitrate', 'vbr'), 1000) + +                width = int_or_none(xpath_text(fnode, './width', 'width')) +                height = int_or_none(xpath_text(fnode, './height', 'height')) + +                filesize = int_or_none(xpath_text(fnode, './filesize', 'filesize')) + +                format_note = '' +                if not 
format_note: +                    format_note = None + +                formats.append({ +                    'format_id': format_id, +                    'url': video_url, +                    'ext': ext, +                    'acodec': format_m.group('acodec'), +                    'vcodec': format_m.group('vcodec'), +                    'abr': abr, +                    'vbr': vbr, +                    'width': width, +                    'height': height, +                    'filesize': filesize, +                    'format_note': format_note, +                    'protocol': proto, +                    '_available': is_available, +                }) +            format_ids.append(format_id) + +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'duration': duration, +            'thumbnails': thumbnails, +            'uploader': uploader, +            'uploader_id': uploader_id, +            'upload_date': upload_date, +            'formats': formats, +        }      def _real_extract(self, url):          video_id = self._match_id(url)          xml_url = 'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id -        return extract_from_xml_url(self, video_id, xml_url) +        return self.extract_from_xml_url(video_id, xml_url)  class ZDFChannelIE(InfoExtractor): -    _VALID_URL = r'(?:zdf:topic:|https?://www\.zdf\.de/ZDFmediathek(?:#)?/.*kanaluebersicht/)(?P<id>[0-9]+)' -    _TEST = { +    _VALID_URL = r'(?:zdf:topic:|https?://www\.zdf\.de/ZDFmediathek(?:#)?/.*kanaluebersicht/(?:[^/]+/)?)(?P<id>[0-9]+)' +    _TESTS = [{          'url': 'http://www.zdf.de/ZDFmediathek#/kanaluebersicht/1586442/sendung/Titanic',          'info_dict': {              'id': '1586442',          },          'playlist_count': 3, -    } +    }, { +        'url': 'http://www.zdf.de/ZDFmediathek/kanaluebersicht/aktuellste/332', +        
'only_matching': True, +    }, { +        'url': 'http://www.zdf.de/ZDFmediathek/kanaluebersicht/meist-gesehen/332', +        'only_matching': True, +    }, { +        'url': 'http://www.zdf.de/ZDFmediathek/kanaluebersicht/_/1798716?bc=nrt;nrm?flash=off', +        'only_matching': True, +    }]      _PAGE_SIZE = 50      def _fetch_page(self, channel_id, page): diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 2191e8b89..a7440c582 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -232,7 +232,7 @@ class JSInterpreter(object):      def extract_function(self, funcname):          func_m = re.search(              r'''(?x) -                (?:function\s+%s|[{;]%s\s*=\s*function|var\s+%s\s*=\s*function)\s* +                (?:function\s+%s|[{;,]%s\s*=\s*function|var\s+%s\s*=\s*function)\s*                  \((?P<args>[^)]*)\)\s*                  \{(?P<code>[^}]+)\}''' % (                  re.escape(funcname), re.escape(funcname), re.escape(funcname)), diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 5ed723bc6..daca5d814 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -52,7 +52,7 @@ class FFmpegPostProcessor(PostProcessor):      def _determine_executables(self):          programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe'] -        prefer_ffmpeg = self._downloader.params.get('prefer_ffmpeg', False) +        prefer_ffmpeg = False          self.basename = None          self.probe_basename = None @@ -60,6 +60,7 @@ class FFmpegPostProcessor(PostProcessor):          self._paths = None          self._versions = None          if self._downloader: +            prefer_ffmpeg = self._downloader.params.get('prefer_ffmpeg', False)              location = self._downloader.params.get('ffmpeg_location')              if location is not None:                  if not os.path.exists(location): diff --git a/youtube_dl/update.py b/youtube_dl/update.py index 
074eb64a7..995b8ed96 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -9,7 +9,7 @@ import subprocess  import sys  from zipimport import zipimporter -from .compat import compat_str +from .utils import encode_compat_str  from .version import __version__ @@ -61,7 +61,7 @@ def update_self(to_screen, verbose, opener):          newversion = opener.open(VERSION_URL).read().decode('utf-8').strip()      except Exception:          if verbose: -            to_screen(compat_str(traceback.format_exc())) +            to_screen(encode_compat_str(traceback.format_exc()))          to_screen('ERROR: can\'t find the current version. Please try again later.')          return      if newversion == __version__: @@ -74,7 +74,7 @@ def update_self(to_screen, verbose, opener):          versions_info = json.loads(versions_info)      except Exception:          if verbose: -            to_screen(compat_str(traceback.format_exc())) +            to_screen(encode_compat_str(traceback.format_exc()))          to_screen('ERROR: can\'t obtain versions info. 
Please try again later.')          return      if 'signature' not in versions_info: @@ -123,7 +123,7 @@ def update_self(to_screen, verbose, opener):              urlh.close()          except (IOError, OSError):              if verbose: -                to_screen(compat_str(traceback.format_exc())) +                to_screen(encode_compat_str(traceback.format_exc()))              to_screen('ERROR: unable to download latest version')              return @@ -137,7 +137,7 @@ def update_self(to_screen, verbose, opener):                  outf.write(newcontent)          except (IOError, OSError):              if verbose: -                to_screen(compat_str(traceback.format_exc())) +                to_screen(encode_compat_str(traceback.format_exc()))              to_screen('ERROR: unable to write the new version')              return @@ -157,7 +157,7 @@ start /b "" cmd /c del "%%~f0"&exit /b"              return  # Do not show premature success messages          except (IOError, OSError):              if verbose: -                to_screen(compat_str(traceback.format_exc())) +                to_screen(encode_compat_str(traceback.format_exc()))              to_screen('ERROR: unable to overwrite current version')              return @@ -169,7 +169,7 @@ start /b "" cmd /c del "%%~f0"&exit /b"              urlh.close()          except (IOError, OSError):              if verbose: -                to_screen(compat_str(traceback.format_exc())) +                to_screen(encode_compat_str(traceback.format_exc()))              to_screen('ERROR: unable to download latest version')              return @@ -183,7 +183,7 @@ start /b "" cmd /c del "%%~f0"&exit /b"                  outf.write(newcontent)          except (IOError, OSError):              if verbose: -                to_screen(compat_str(traceback.format_exc())) +                to_screen(encode_compat_str(traceback.format_exc()))              to_screen('ERROR: unable to overwrite current version')              return diff 
--git a/youtube_dl/utils.py b/youtube_dl/utils.py index d0606b4bc..0ed6c45c8 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -773,11 +773,13 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):                      raise original_ioerror              resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)              resp.msg = old_resp.msg +            del resp.headers['Content-encoding']          # deflate          if resp.headers.get('Content-encoding', '') == 'deflate':              gz = io.BytesIO(self.deflate(resp.read()))              resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)              resp.msg = old_resp.msg +            del resp.headers['Content-encoding']          # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see          # https://github.com/rg3/youtube-dl/issues/6457).          if 300 <= resp.code < 400: @@ -1406,6 +1408,15 @@ def remove_end(s, end):      return s +def remove_quotes(s): +    if s is None or len(s) < 2: +        return s +    for quote in ('"', "'", ): +        if s[0] == quote and s[-1] == quote: +            return s[1:-1] +    return s + +  def url_basename(url):      path = compat_urlparse.urlparse(url).path      return path.strip('/').split('/')[-1] @@ -1703,6 +1714,10 @@ def encode_dict(d, encoding='utf-8'):      return dict((encode(k), encode(v)) for k, v in d.items()) +def encode_compat_str(string, encoding=preferredencoding(), errors='strict'): +    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors) + +  US_RATINGS = {      'G': 0,      'PG': 10, @@ -1797,6 +1812,15 @@ def args_to_str(args):      return ' '.join(shlex_quote(a) for a in args) +def error_to_compat_str(err): +    err_str = str(err) +    # On python 2 error byte string must be decoded with proper +    # encoding rather than ascii +    if sys.version_info[0] < 3: +        err_str = 
err_str.decode(preferredencoding()) +    return err_str + +  def mimetype2ext(mt):      _, _, res = mt.rpartition('/') @@ -1967,15 +1991,15 @@ def match_filter_func(filter_str):  def parse_dfxp_time_expr(time_expr):      if not time_expr: -        return 0.0 +        return      mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)      if mobj:          return float(mobj.group('time_offset')) -    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr) +    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)      if mobj: -        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3)) +        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))  def srt_subtitles_timecode(seconds): @@ -2011,10 +2035,15 @@ def dfxp2srt(dfxp_data):          raise ValueError('Invalid dfxp/TTML subtitle')      for para, index in zip(paras, itertools.count(1)): -        begin_time = parse_dfxp_time_expr(para.attrib['begin']) +        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))          end_time = parse_dfxp_time_expr(para.attrib.get('end')) +        dur = parse_dfxp_time_expr(para.attrib.get('dur')) +        if begin_time is None: +            continue          if not end_time: -            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur']) +            if not dur: +                continue +            end_time = begin_time + dur          out.append('%d\n%s --> %s\n%s\n\n' % (              index,              srt_subtitles_timecode(begin_time), diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 12ca0241d..255d64269 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@  from __future__ import unicode_literals -__version__ = '2015.12.05' +__version__ = '2015.12.23'  | 
