diff options
47 files changed, 1197 insertions, 299 deletions
| @@ -46,15 +46,15 @@ which means you can modify it, redistribute it or use it however you like.                                       an empty string (--proxy "") for direct                                       connection      --socket-timeout None            Time to wait before giving up, in seconds -    --bidi-workaround                Work around terminals that lack -                                     bidirectional text support. Requires bidiv -                                     or fribidi executable in PATH      --default-search PREFIX          Use this prefix for unqualified URLs. For                                       example "gvsearch2:" downloads two videos                                       from google videos for  youtube-dl "large                                       apple". Use the value "auto" to let -                                     youtube-dl guess. The default value "error" -                                     just throws an error. +                                     youtube-dl guess ("auto_warning" to emit a +                                     warning when guessing). "error" just throws +                                     an error. The default value "fixup_error" +                                     repairs broken URLs, but emits an error if +                                     this is not possible instead of searching.      --ignore-config                  Do not read configuration files. When given                                       in the global configuration file /etc                                       /youtube-dl.conf: do not read the user @@ -213,6 +213,9 @@ which means you can modify it, redistribute it or use it however you like.      --add-header FIELD:VALUE         specify a custom HTTP header and its value,                                       separated by a colon ':'. You can use this                                       option multiple times +    --bidi-workaround                Work around terminals that lack +                                     bidirectional text support. Requires bidiv +                                     or fribidi executable in PATH  ## Video Format Options:      -f, --format FORMAT              video format code, specify the order of diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 0ff47cf1e..b1ad30bf1 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -99,6 +99,7 @@ class TestAllURLsMatching(unittest.TestCase):      def test_facebook_matching(self):          self.assertTrue(FacebookIE.suitable('https://www.facebook.com/Shiniknoh#!/photo.php?v=10153317450565268')) +        self.assertTrue(FacebookIE.suitable('https://www.facebook.com/cindyweather?fref=ts#!/photo.php?v=10152183998945793'))      def test_no_duplicates(self):          ies = gen_extractors() diff --git a/test/test_playlists.py b/test/test_playlists.py index c221c47b9..4f188345b 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -193,10 +193,10 @@ class TestPlaylists(unittest.TestCase):      def test_bandcamp_album(self):          dl = FakeYDL()          ie = BandcampAlbumIE(dl) -        result = ie.extract('http://mpallante.bandcamp.com/album/nightmare-night-ep') +        result = ie.extract('http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave')          self.assertIsPlaylist(result) -        self.assertEqual(result['title'], 'Nightmare Night EP') -        assertGreaterEqual(self, len(result['entries']), 4) +        self.assertEqual(result['title'], 'Hierophany of the Open Grave') +        assertGreaterEqual(self, len(result['entries']), 9)      def test_smotri_community(self):          dl = FakeYDL() diff --git a/test/test_utils.py b/test/test_utils.py index 51eb0b6b9..e26cc5b0c 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -280,7 +280,7 @@ class TestUtil(unittest.TestCase):          d = json.loads(stripped)          self.assertEqual(d, [{"id": "532cb", "x": 3}]) -    def test_uppercase_escpae(self): +    def test_uppercase_escape(self):          self.assertEqual(uppercase_escape(u'aä'), u'aä')          self.assertEqual(uppercase_escape(u'\\U0001d550'), u'𝕐') diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 6ff0be00f..962aedbff 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -66,6 +66,9 @@ __authors__  = (      'Naglis Jonaitis',      'Charles Chen',      'Hassaan Ali', +    'Dobrosław Żybort', +    'David Fabijan', +    'Sebastian Haas',  )  __license__ = 'Public Domain' @@ -253,12 +256,9 @@ def parseOpts(overrideArguments=None):          '--socket-timeout', dest='socket_timeout',          type=float, default=None, help=u'Time to wait before giving up, in seconds')      general.add_option( -        '--bidi-workaround', dest='bidi_workaround', action='store_true', -        help=u'Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH') -    general.add_option(          '--default-search',          dest='default_search', metavar='PREFIX', -        help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for  youtube-dl "large apple". Use the value "auto" to let youtube-dl guess. The default value "error" just throws an error.') +        help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for  youtube-dl "large apple". Use the value "auto" to let youtube-dl guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The default value "fixup_error" repairs broken URLs, but emits an error if this is not possible instead of searching.')      general.add_option(          '--ignore-config',          action='store_true', @@ -386,6 +386,9 @@ def parseOpts(overrideArguments=None):          dest='headers', action='append',          help='specify a custom HTTP header and its value, separated by a colon \':\'. You can use this option multiple times',      ) +    workarounds.add_option( +        '--bidi-workaround', dest='bidi_workaround', action='store_true', +        help=u'Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH')      verbosity.add_option('-q', '--quiet',              action='store_true', dest='quiet', help='activates quiet mode', default=False) @@ -709,7 +712,7 @@ def _real_main(argv=None):          date = DateRange.day(opts.date)      else:          date = DateRange(opts.dateafter, opts.datebefore) -    if opts.default_search not in ('auto', 'auto_warning', None) and ':' not in opts.default_search: +    if opts.default_search not in ('auto', 'auto_warning', 'error', 'fixup_error', None) and ':' not in opts.default_search:          parser.error(u'--default-search invalid; did you forget a colon (:) at the end?')      # Do not download videos when there are audio-only formats diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 917f3450e..9ce97f5fe 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -292,7 +292,7 @@ class FileDownloader(object):      def real_download(self, filename, info_dict):          """Real download process. Redefine in subclasses.""" -        raise NotImplementedError(u'This method must be implemented by sublcasses') +        raise NotImplementedError(u'This method must be implemented by subclasses')      def _hook_progress(self, status):          for ph in self._progress_hooks: diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index e6be6ae6c..71353f607 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -220,6 +220,7 @@ class F4mFD(FileDownloader):      def real_download(self, filename, info_dict):          man_url = info_dict['url'] +        requested_bitrate = info_dict.get('tbr')          self.to_screen('[download] Downloading f4m manifest')          manifest = self.ydl.urlopen(man_url).read()          self.report_destination(filename) @@ -233,8 +234,14 @@ class F4mFD(FileDownloader):          doc = etree.fromstring(manifest)          formats = [(int(f.attrib.get('bitrate', -1)), f) for f in doc.findall(_add_ns('media'))] -        formats = sorted(formats, key=lambda f: f[0]) -        rate, media = formats[-1] +        if requested_bitrate is None: +            # get the best format +            formats = sorted(formats, key=lambda f: f[0]) +            rate, media = formats[-1] +        else: +            rate, media = list(filter( +                lambda f: int(f[0]) == requested_bitrate, formats))[0] +          base_url = compat_urlparse.urljoin(man_url, media.attrib['url'])          bootstrap = base64.b64decode(doc.find(_add_ns('bootstrapInfo')).text)          metadata = base64.b64decode(media.find(_add_ns('metadata')).text) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index c80a1bd48..632e832c7 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -112,9 +112,11 @@ from .funnyordie import FunnyOrDieIE  from .gamekings import GamekingsIE  from .gameone import GameOneIE  from .gamespot import GameSpotIE +from .gamestar import GameStarIE  from .gametrailers import GametrailersIE  from .gdcvault import GDCVaultIE  from .generic import GenericIE +from .godtube import GodTubeIE  from .googleplus import GooglePlusIE  from .googlesearch import GoogleSearchIE  from .gorillavid import GorillaVidIE @@ -124,6 +126,7 @@ from .helsinki import HelsinkiIE  from .hentaistigma import HentaiStigmaIE  from .hotnewhiphop import HotNewHipHopIE  from .howcast import HowcastIE +from .howstuffworks import HowStuffWorksIE  from .huffpost import HuffPostIE  from .hypem import HypemIE  from .iconosquare import IconosquareIE @@ -176,10 +179,12 @@ from .mdr import MDRIE  from .metacafe import MetacafeIE  from .metacritic import MetacriticIE  from .mit import TechTVMITIE, MITIE, OCWMITIE +from .mitele import MiTeleIE  from .mixcloud import MixcloudIE  from .mlb import MLBIE  from .mpora import MporaIE  from .mofosex import MofosexIE +from .mojvideo import MojvideoIE  from .mooshare import MooshareIE  from .morningstar import MorningstarIE  from .motherless import MotherlessIE @@ -223,9 +228,12 @@ from .nrk import (  from .ntv import NTVIE  from .nytimes import NYTimesIE  from .nuvid import NuvidIE -from .oe1 import OE1IE  from .ooyala import OoyalaIE -from .orf import ORFIE +from .orf import ( +    ORFTVthekIE, +    ORFOE1IE, +    ORFFM4IE, +)  from .parliamentliveuk import ParliamentLiveUKIE  from .pbs import PBSIE  from .photobucket import PhotobucketIE @@ -246,6 +254,7 @@ from .ro220 import Ro220IE  from .rottentomatoes import RottenTomatoesIE  from .roxwel import RoxwelIE  from .rtbf import RTBFIE +from .rtlnl import RtlXlIE  from .rtlnow import RTLnowIE  from .rts import RTSIE  from .rtve import RTVEALaCartaIE @@ -326,6 +335,7 @@ from .tutv import TutvIE  from .tvigle import TvigleIE  from .tvp import TvpIE  from .tvplay import TVPlayIE +from .ubu import UbuIE  from .udemy import (      UdemyIE,      UdemyCourseIE @@ -347,6 +357,7 @@ from .videofyme import VideofyMeIE  from .videopremium import VideoPremiumIE  from .videott import VideoTtIE  from .videoweed import VideoWeedIE +from .vidme import VidmeIE  from .vimeo import (      VimeoIE,      VimeoChannelIE, @@ -380,6 +391,7 @@ from .wistia import WistiaIE  from .worldstarhiphop import WorldStarHipHopIE  from .wrzuta import WrzutaIE  from .xbef import XBefIE +from .xboxclips import XboxClipsIE  from .xhamster import XHamsterIE  from .xnxx import XNXXIE  from .xvideos import XVideosIE diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index dc8657b67..4359b88d1 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -6,6 +6,7 @@ import json  from .common import InfoExtractor  from ..utils import (      compat_urlparse, +    int_or_none,  ) @@ -110,8 +111,8 @@ class AppleTrailersIE(InfoExtractor):                  formats.append({                      'url': format_url,                      'format': format['type'], -                    'width': format['width'], -                    'height': int(format['height']), +                    'width': int_or_none(format['width']), +                    'height': int_or_none(format['height']),                  })              self._sort_formats(formats) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 30a85c8c1..7f0da8ab6 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -8,6 +8,8 @@ from ..utils import (      determine_ext,      ExtractorError,      qualities, +    compat_urllib_parse_urlparse, +    compat_urllib_parse,  ) @@ -44,8 +46,14 @@ class ARDIE(InfoExtractor):          else:              video_id = m.group('video_id') +        urlp = compat_urllib_parse_urlparse(url) +        url = urlp._replace(path=compat_urllib_parse.quote(urlp.path.encode('utf-8'))).geturl() +          webpage = self._download_webpage(url, video_id) +        if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage: +            raise ExtractorError('Video %s is no longer available' % video_id, expected=True) +          title = self._html_search_regex(              [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',               r'<meta name="dcterms.title" content="(.*?)"/>', diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 9591bad8a..d86dbba8e 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -109,15 +109,19 @@ class ArteTVPlus7IE(InfoExtractor):              regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l]              return any(re.match(r, f['versionCode']) for r in regexes)          # Some formats may not be in the same language as the url +        # TODO: Might want not to drop videos that does not match requested language +        # but to process those formats with lower precedence          formats = filter(_match_lang, all_formats) -        formats = list(formats) # in python3 filter returns an iterator +        formats = list(formats)  # in python3 filter returns an iterator          if not formats:              # Some videos are only available in the 'Originalversion'              # they aren't tagged as being in French or German -            if all(f['versionCode'] == 'VO' or f['versionCode'] == 'VA' for f in all_formats): -                formats = all_formats -            else: -                raise ExtractorError(u'The formats list is empty') +            # Sometimes there are neither videos of requested lang code +            # nor original version videos available +            # For such cases we just take all_formats as is +            formats = all_formats +            if not formats: +                raise ExtractorError('The formats list is empty')          if re.match(r'[A-Z]Q', formats[0]['quality']) is not None:              def sort_key(f): diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py index 7d558e262..3e461e715 100644 --- a/youtube_dl/extractor/blinkx.py +++ b/youtube_dl/extractor/blinkx.py @@ -52,7 +52,7 @@ class BlinkxIE(InfoExtractor):                      'height': int(m['h']),                  })              elif m['type'] == 'original': -                duration = m['d'] +                duration = float(m['d'])              elif m['type'] == 'youtube':                  yt_id = m['link']                  self.to_screen('Youtube video detected: %s' % yt_id) diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index 25fb79e14..c51a97ce4 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -10,7 +10,7 @@ class BloombergIE(InfoExtractor):      _TEST = {          'url': 'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html', -        'md5': '7bf08858ff7c203c870e8a6190e221e5', +        # The md5 checksum changes          'info_dict': {              'id': 'qurhIVlJSB6hzkVi229d8g',              'ext': 'flv', @@ -31,8 +31,7 @@ class BloombergIE(InfoExtractor):          return {              'id': name.split('-')[-1],              'title': title, -            'url': f4m_url, -            'ext': 'flv', +            'formats': self._extract_f4m_formats(f4m_url, name),              'description': self._og_search_description(webpage),              'thumbnail': self._og_search_thumbnail(webpage),          } diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index f7f2f713a..86f0c2861 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -7,6 +7,7 @@ from .common import InfoExtractor  from ..utils import (      ExtractorError,      int_or_none, +    parse_duration,  ) @@ -22,8 +23,9 @@ class BRIE(InfoExtractor):              'info_dict': {                  'id': '25e279aa-1ffd-40fd-9955-5325bd48a53a',                  'ext': 'mp4', -                'title': 'Am 1. und 2. August in Oberammergau', -                'description': 'md5:dfd224e5aa6819bc1fcbb7826a932021', +                'title': 'Wenn das Traditions-Theater wackelt', +                'description': 'Heimatsound-Festival 2014: Wenn das Traditions-Theater wackelt', +                'duration': 34,              }          },          { @@ -34,6 +36,7 @@ class BRIE(InfoExtractor):                  'ext': 'mp4',                  'title': 'Über den Pass',                  'description': 'Die Eroberung der Alpen: Über den Pass', +                'duration': 2588,              }          },          { @@ -44,6 +47,7 @@ class BRIE(InfoExtractor):                  'ext': 'aac',                  'title': '"Keine neuen Schulden im nächsten Jahr"',                  'description': 'Haushaltsentwurf: "Keine neuen Schulden im nächsten Jahr"', +                'duration': 64,              }          },          { @@ -54,6 +58,7 @@ class BRIE(InfoExtractor):                  'ext': 'mp4',                  'title': 'Umweltbewusster Häuslebauer',                  'description': 'Uwe Erdelt: Umweltbewusster Häuslebauer', +                'duration': 116,              }          },          { @@ -64,6 +69,7 @@ class BRIE(InfoExtractor):                  'ext': 'mp4',                  'title': 'Folge 1 - Metaphysik',                  'description': 'Kant für Anfänger: Folge 1 - Metaphysik', +                'duration': 893,                  'uploader': 'Eva Maria Steimle',                  'upload_date': '20140117',              } @@ -84,6 +90,7 @@ class BRIE(InfoExtractor):              media = {                  'id': xml_media.get('externalId'),                  'title': xml_media.find('title').text, +                'duration': parse_duration(xml_media.find('duration').text),                  'formats': self._extract_formats(xml_media.find('assets')),                  'thumbnails': self._extract_thumbnails(xml_media.find('teaserImage/variants')),                  'description': ' '.join(xml_media.find('shareTitle').text.splitlines()), diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 52c00186e..342bfb8b3 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -18,6 +18,7 @@ from ..utils import (      clean_html,      compiled_regex_type,      ExtractorError, +    int_or_none,      RegexNotFoundError,      sanitize_filename,      unescapeHTML, @@ -590,6 +591,24 @@ class InfoExtractor(object):          self.to_screen(msg)          time.sleep(timeout) +    def _extract_f4m_formats(self, manifest_url, video_id): +        manifest = self._download_xml( +            manifest_url, video_id, 'Downloading f4m manifest', +            'Unable to download f4m manifest') + +        formats = [] +        for media_el in manifest.findall('{http://ns.adobe.com/f4m/1.0}media'): +            formats.append({ +                'url': manifest_url, +                'ext': 'flv', +                'tbr': int_or_none(media_el.attrib.get('bitrate')), +                'width': int_or_none(media_el.attrib.get('width')), +                'height': int_or_none(media_el.attrib.get('height')), +            }) +        self._sort_formats(formats) + +        return formats +  class SearchInfoExtractor(InfoExtractor):      """ diff --git a/youtube_dl/extractor/dfb.py b/youtube_dl/extractor/dfb.py index cb8e06822..8049779b0 100644 --- a/youtube_dl/extractor/dfb.py +++ b/youtube_dl/extractor/dfb.py @@ -30,7 +30,7 @@ class DFBIE(InfoExtractor):              video_id)          video_info = player_info.find('video') -        f4m_info = self._download_xml(video_info.find('url').text, video_id) +        f4m_info = self._download_xml(self._proto_relative_url(video_info.find('url').text.strip()), video_id)          token_el = f4m_info.find('token')          manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0' diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index f0cd8f156..f7cf700b5 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -20,7 +20,7 @@ from ..utils import (  class FacebookIE(InfoExtractor):      _VALID_URL = r'''(?x)          https?://(?:\w+\.)?facebook\.com/ -        (?:[^#?]*\#!/)? +        (?:[^#]*?\#!/)?          (?:video/video\.php|photo\.php|video/embed)\?(?:.*?)          (?:v|video_id)=(?P<id>[0-9]+)          (?:.*)''' diff --git a/youtube_dl/extractor/firedrive.py b/youtube_dl/extractor/firedrive.py index 6d73c8a4a..af439ccfe 100644 --- a/youtube_dl/extractor/firedrive.py +++ b/youtube_dl/extractor/firedrive.py @@ -42,7 +42,6 @@ class FiredriveIE(InfoExtractor):          fields = dict(re.findall(r'''(?x)<input\s+              type="hidden"\s+              name="([^"]+)"\s+ -            (?:id="[^"]+"\s+)?              value="([^"]*)"              ''', webpage)) @@ -66,7 +65,7 @@ class FiredriveIE(InfoExtractor):          ext = self._search_regex(r'type:\s?\'([^\']+)\',',                                   webpage, 'extension', fatal=False)          video_url = self._search_regex( -            r'file:\s?\'(http[^\']+)\',', webpage, 'file url') +            r'file:\s?loadURL\(\'(http[^\']+)\'\),', webpage, 'file url')          formats = [{              'format_id': 'sd', diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 1fbe6d175..1b0e8e5d5 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -19,17 +19,35 @@ class FranceTVBaseInfoExtractor(InfoExtractor):              + video_id, video_id, 'Downloading XML config')          manifest_url = info.find('videos/video/url').text -        video_url = manifest_url.replace('manifest.f4m', 'index_2_av.m3u8') -        video_url = video_url.replace('/z/', '/i/') +        manifest_url = manifest_url.replace('/z/', '/i/') +         +        if manifest_url.startswith('rtmp'): +            formats = [{'url': manifest_url, 'ext': 'flv'}] +        else: +            formats = [] +            available_formats = self._search_regex(r'/[^,]*,(.*?),k\.mp4', manifest_url, 'available formats') +            for index, format_descr in enumerate(available_formats.split(',')): +                format_info = { +                    'url': manifest_url.replace('manifest.f4m', 'index_%d_av.m3u8' % index), +                    'ext': 'mp4', +                } +                m_resolution = re.search(r'(?P<width>\d+)x(?P<height>\d+)', format_descr) +                if m_resolution is not None: +                    format_info.update({ +                        'width': int(m_resolution.group('width')), +                        'height': int(m_resolution.group('height')), +                    }) +                formats.append(format_info) +          thumbnail_path = info.find('image').text -        return {'id': video_id, -                'ext': 'flv' if video_url.startswith('rtmp') else 'mp4', -                'url': video_url, -                'title': info.find('titre').text, -                'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', thumbnail_path), -                'description': info.find('synopsis').text, -                } +        return { +            'id': video_id, +            'title': info.find('titre').text, +            'formats': formats, +            'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', thumbnail_path), +            'description': info.find('synopsis').text, +        }  class PluzzIE(FranceTVBaseInfoExtractor): diff --git a/youtube_dl/extractor/gamestar.py b/youtube_dl/extractor/gamestar.py new file mode 100644 index 000000000..50f8fc7e7 --- /dev/null +++ b/youtube_dl/extractor/gamestar.py @@ -0,0 +1,74 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    int_or_none, +    parse_duration, +    str_to_int, +    unified_strdate, +) + + +class GameStarIE(InfoExtractor): +    _VALID_URL = r'http://www\.gamestar\.de/videos/.*,(?P<id>[0-9]+)\.html' +    _TEST = { +        'url': 'http://www.gamestar.de/videos/trailer,3/hobbit-3-die-schlacht-der-fuenf-heere,76110.html', +        'md5': '96974ecbb7fd8d0d20fca5a00810cea7', +        'info_dict': { +            'id': '76110', +            'ext': 'mp4', +            'title': 'Hobbit 3: Die Schlacht der Fünf Heere - Teaser-Trailer zum dritten Teil', +            'description': 'Der Teaser-Trailer zu Hobbit 3: Die Schlacht der Fünf Heere zeigt einige Szenen aus dem dritten Teil der Saga und kündigt den vollständigen Trailer an.', +            'thumbnail': 'http://images.gamestar.de/images/idgwpgsgp/bdb/2494525/600x.jpg', +            'upload_date': '20140728', +            'duration': 17 +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) + +        og_title = self._og_search_title(webpage) +        title = og_title.replace(' - Video bei GameStar.de', '').strip() + +        url = 'http://gamestar.de/_misc/videos/portal/getVideoUrl.cfm?premium=0&videoId=' + video_id + +        description = self._og_search_description(webpage).strip() + +        thumbnail = self._proto_relative_url( +            self._og_search_thumbnail(webpage), scheme='http:') + +        upload_date = unified_strdate(self._html_search_regex( +            r'<span style="float:left;font-size:11px;">Datum: ([0-9]+\.[0-9]+\.[0-9]+)  ', +            webpage, 'upload_date', fatal=False)) + +        duration = parse_duration(self._html_search_regex( +            r'  Länge: ([0-9]+:[0-9]+)</span>', webpage, 'duration', +            fatal=False)) + +        view_count = str_to_int(self._html_search_regex( +            r'  Zuschauer: ([0-9\.]+)  ', webpage, +            'view_count', fatal=False)) + +        comment_count = int_or_none(self._html_search_regex( +            r'>Kommentieren \(([0-9]+)\)</a>', webpage, 'comment_count', +            fatal=False)) + +        return { +            'id': video_id, +            'title': title, +            'url': url, +            'ext': 'mp4', +            'thumbnail': thumbnail, +            'description': description, +            'upload_date': upload_date, +            'duration': duration, +            'view_count': view_count, +            'comment_count': comment_count +        } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 9db27f9aa..8e915735e 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -383,13 +383,13 @@ class GenericIE(InfoExtractor):          if not parsed_url.scheme:              default_search = self._downloader.params.get('default_search')              if default_search is None: -                default_search = 'error' +                default_search = 'fixup_error' -            if default_search in ('auto', 'auto_warning'): +            if default_search in ('auto', 'auto_warning', 'fixup_error'):                  if '/' in url:                      self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')                      return self.url_result('http://' + url) -                else: +                elif default_search != 'fixup_error':                      if default_search == 'auto_warning':                          if re.match(r'^(?:url|URL)$', url):                              raise ExtractorError( @@ -399,7 +399,8 @@ class GenericIE(InfoExtractor):                              self._downloader.report_warning(                                  'Falling back to youtube search for  %s . Set --default-search "auto" to suppress this warning.' % url)                      return self.url_result('ytsearch:' + url) -            elif default_search == 'error': + +            if default_search in ('error', 'fixup_error'):                  raise ExtractorError(                      ('%r is not a valid URL. '                       'Set --default-search "ytsearch" (or run  youtube-dl "ytsearch:%s" ) to search YouTube' @@ -705,6 +706,13 @@ class GenericIE(InfoExtractor):              url = unescapeHTML(mobj.group('url'))              return self.url_result(url, ie='MTVServicesEmbedded') +        # Look for embedded yahoo player +        mobj = re.search( +            r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1', +            webpage) +        if mobj is not None: +            return self.url_result(mobj.group('url'), 'Yahoo') +          # Start with something easy: JW Player in SWFObject          found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)          if not found: diff --git a/youtube_dl/extractor/godtube.py b/youtube_dl/extractor/godtube.py new file mode 100644 index 000000000..73bd6d890 --- /dev/null +++ b/youtube_dl/extractor/godtube.py @@ -0,0 +1,58 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    parse_duration, +    parse_iso8601, +) + + +class GodTubeIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?godtube\.com/watch/\?v=(?P<id>[\da-zA-Z]+)' +    _TESTS = [ +        { +            'url': 'https://www.godtube.com/watch/?v=0C0CNNNU', +            'md5': '77108c1e4ab58f48031101a1a2119789', +            'info_dict': { +                'id': '0C0CNNNU', +                'ext': 'mp4', +                'title': 'Woman at the well.', +                'duration': 159, +                'timestamp': 1205712000, +                'uploader': 'beverlybmusic', +                'upload_date': '20080317', +                'thumbnail': 're:^https?://.*\.jpg$', +            }, +        }, +    ] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        config = self._download_xml( +            'http://www.godtube.com/resource/mediaplayer/%s.xml' % video_id.lower(), +            video_id, 'Downloading player config XML') + +        video_url = config.find('.//file').text +        uploader = config.find('.//author').text +        timestamp = parse_iso8601(config.find('.//date').text) +        duration = parse_duration(config.find('.//duration').text) +        thumbnail = config.find('.//image').text + +        media = self._download_xml( +            'http://www.godtube.com/media/xml/?v=%s' % video_id, video_id, 'Downloading media XML') + +        title = media.find('.//title').text + +        return { +            'id': video_id, +            'url': video_url, +            'title': title, +            'thumbnail': thumbnail, +            'timestamp': timestamp, +            'uploader': uploader, +            'duration': duration, +        } diff --git a/youtube_dl/extractor/howstuffworks.py b/youtube_dl/extractor/howstuffworks.py new file mode 100644 index 000000000..68684b997 --- /dev/null +++ b/youtube_dl/extractor/howstuffworks.py @@ -0,0 +1,134 @@ +from __future__ import unicode_literals + +import re +import json +import random +import string + +from .common import InfoExtractor +from ..utils import find_xpath_attr + + +class HowStuffWorksIE(InfoExtractor): +    _VALID_URL = r'https?://[\da-z-]+\.howstuffworks\.com/(?:[^/]+/)*\d+-(?P<id>.+?)-video\.htm' +    _TESTS = [ +        { +            'url': 'http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm', +            'info_dict': { +                'id': '450221', +                'display_id': 'cool-jobs-iditarod-musher', +                'ext': 'flv', +                'title': 'Cool Jobs - Iditarod Musher', +                'description': 'md5:82bb58438a88027b8186a1fccb365f90', +                'thumbnail': 're:^https?://.*\.jpg$', +            }, +            'params': { +                # md5 is not consistent +                'skip_download': True +            } +        }, +        { +            'url': 'http://adventure.howstuffworks.com/39516-deadliest-catch-jakes-farewell-pots-video.htm', +            'info_dict': { +                'id': '553470', +                'display_id': 'deadliest-catch-jakes-farewell-pots', +                'ext': 'mp4', +                'title': 'Deadliest Catch: Jake\'s Farewell Pots', +                'description': 'md5:9632c346d5e43ee238028c9cefd8dbbc', +                'thumbnail': 're:^https?://.*\.jpg$', +            }, +            'params': { +                # md5 is not consistent +                'skip_download': True +            } +        }, +        { +            'url': 'http://entertainment.howstuffworks.com/arts/2706-sword-swallowing-1-by-dan-meyer-video.htm', +            'info_dict': { +                'id': '440011', +                'display_id': 'sword-swallowing-1-by-dan-meyer', +                'ext': 'flv', +                'title': 'Sword Swallowing #1 by Dan Meyer', +                'description': 'md5:b2409e88172913e2e7d3d1159b0ef735', +                'thumbnail': 're:^https?://.*\.jpg$', +            }, +            'params': { +                # md5 is not consistent +                'skip_download': True +            } +        }, +    ] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        display_id = mobj.group('id') +        webpage = self._download_webpage(url, display_id) + +        content_id = self._search_regex(r'var siteSectionId="(\d+)";', webpage, 'content id') + +        mp4 = self._search_regex( +            r'''(?xs)var\s+clip\s*=\s*{\s* +                .+?\s* +                content_id\s*:\s*%s\s*,\s* +                .+?\s* +                mp4\s*:\s*\[(.*?),?\]\s* +                };\s* +                videoData\.push\(clip\);''' % content_id, +            webpage, 'mp4', fatal=False, default=None) + +        smil = self._download_xml( +            'http://services.media.howstuffworks.com/videos/%s/smil-service.smil' % content_id, +            content_id, 'Downloading video SMIL') + +        http_base = find_xpath_attr( +            smil, +            './{0}head/{0}meta'.format('{http://www.w3.org/2001/SMIL20/Language}'), +            'name', +            'httpBase').get('content') + +        def random_string(str_len=0): +            return ''.join([random.choice(string.ascii_uppercase) for _ in range(str_len)]) + +        URL_SUFFIX = '?v=2.11.3&fp=LNX 11,2,202,356&r=%s&g=%s' % (random_string(5), random_string(12)) + +        formats = [] + +        if mp4: +            for video in json.loads('[%s]' % mp4): +                bitrate = video['bitrate'] +                fmt = { +                    'url': video['src'].replace('http://pmd.video.howstuffworks.com', http_base) + URL_SUFFIX, +                    'format_id': bitrate, +                } +                m = re.search(r'(?P<vbr>\d+)[Kk]', bitrate) +                if m: +                    fmt['vbr'] = int(m.group('vbr')) +                formats.append(fmt) +        else: +            for video in smil.findall( +                    './/{0}body/{0}switch/{0}video'.format('{http://www.w3.org/2001/SMIL20/Language}')): +                vbr = int(video.attrib['system-bitrate']) / 1000 +                formats.append({ +                    'url': '%s/%s%s' % (http_base, video.attrib['src'], URL_SUFFIX), +                    'format_id': '%dk' % vbr, +                    'vbr': vbr, +                }) + +        self._sort_formats(formats) + +        title = self._og_search_title(webpage) +        TITLE_SUFFIX = ' : HowStuffWorks' +        if title.endswith(TITLE_SUFFIX): +            title = title[:-len(TITLE_SUFFIX)] + +        description = self._og_search_description(webpage) +        thumbnail = self._og_search_thumbnail(webpage) + +        return { +            'id': content_id, +            'display_id': display_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py index e51358595..79e8430b5 100644 --- a/youtube_dl/extractor/izlesene.py +++ b/youtube_dl/extractor/izlesene.py @@ -4,25 +4,31 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..utils import get_element_by_id, parse_iso8601, determine_ext, int_or_none +from ..utils import ( +    get_element_by_id, +    parse_iso8601, +    determine_ext, +    int_or_none, +    str_to_int, +)  class IzleseneIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.|m\.)?izlesene\.com/(?:video|embedplayer)/(?:[^/]+/)?(?P<id>[0-9]+)' +    _VALID_URL = r'https?://(?:(?:www|m)\.)?izlesene\.com/(?:video|embedplayer)/(?:[^/]+/)?(?P<id>[0-9]+)'      _STREAM_URL = 'http://panel.izlesene.com/api/streamurl/{id:}/{format:}'      _TEST = {          'url': 'http://www.izlesene.com/video/sevincten-cildirtan-dogum-gunu-hediyesi/7599694',          'md5': '4384f9f0ea65086734b881085ee05ac2',          'info_dict': {              'id': '7599694', -            'title': u'Sevinçten Çıldırtan Doğum Günü Hediyesi', -            'upload_date': '20140702', -            'uploader_id': 'pelikzzle', -            'description': u'Annesi oğluna doğum günü hediyesi olarak minecraft cd si alıyor, ve çocuk hunharca seviniyor', -            'timestamp': 1404298698, -            'duration': 95,              'ext': 'mp4', +            'title': 'Sevinçten Çıldırtan Doğum Günü Hediyesi', +            'description': 'Annesi oğluna doğum günü hediyesi olarak minecraft cd si alıyor, ve çocuk hunharca seviniyor',              'thumbnail': 're:^http://.*\.jpg', +            'uploader_id': 'pelikzzle', +            'timestamp': 1404298698, +            'upload_date': '20140702', +            'duration': 95.395,              'age_limit': 0,          }      } @@ -37,58 +43,55 @@ class IzleseneIE(InfoExtractor):          title = self._og_search_title(webpage)          description = self._og_search_description(webpage)          thumbnail = self._og_search_thumbnail(webpage) -        duration = int( -            self._html_search_regex( -                r'"videoduration"\s?:\s?"([^"]+)"', webpage, 'duration', -                fatal=False, default='0') -            ) / 1000 -        view_count = get_element_by_id('videoViewCount', -                                       webpage).replace('.', '') -        timestamp = parse_iso8601(self._html_search_meta('uploadDate', webpage, -                                  'upload date', fatal=False)) -        family_friendly = self._html_search_meta('isFamilyFriendly', webpage, -                                                 'age limit', fatal=False) -        uploader = self._html_search_regex(r"adduserUsername\s?=\s?'([^']+)';", -                                           webpage, 'uploader', fatal=False, -                                           default='') + +        uploader = self._html_search_regex( +            r"adduserUsername\s*=\s*'([^']+)';", webpage, 'uploader', fatal=False, default='') +        timestamp = parse_iso8601(self._html_search_meta( +            'uploadDate', webpage, 'upload date', fatal=False)) + +        duration = int_or_none(self._html_search_regex( +            r'"videoduration"\s*:\s*"([^"]+)"', webpage, 'duration', fatal=False)) +        if duration: +            duration /= 1000.0 + +        view_count = str_to_int(get_element_by_id('videoViewCount', webpage))          comment_count = self._html_search_regex( -            r'comment_count\s?=\s?\'([^\']+)\';', -            webpage, 'uploader', fatal=False) +            r'comment_count\s*=\s*\'([^\']+)\';', webpage, 'uploader', fatal=False) -        content_url = self._html_search_meta('contentURL', webpage, -                                             'content URL', fatal=False) -        ext = determine_ext(content_url) +        family_friendly = self._html_search_meta( +            'isFamilyFriendly', webpage, 'age limit', fatal=False) + +        content_url = self._html_search_meta( +            'contentURL', webpage, 'content URL', fatal=False) +        ext = determine_ext(content_url, 'mp4')          # Might be empty for some videos. -        qualities = self._html_search_regex(r'"quality"\s?:\s?"([^"]+)"', -                                            webpage, 'qualities', fatal=False, -                                            default='') +        qualities = self._html_search_regex( +            r'"quality"\s*:\s*"([^"]+)"', webpage, 'qualities', fatal=False, default='')          formats = []          for quality in qualities.split('|'):              json = self._download_json(                  self._STREAM_URL.format(id=video_id, format=quality), video_id, -                note=u'Getting video URL for "%s" quality' % quality, -                errnote=u'Failed to get video URL for "%s" quality' % quality +                note='Getting video URL for "%s" quality' % quality, +                errnote='Failed to get video URL for "%s" quality' % quality              ) -            video_format = '%sp' % quality if quality else 'sd'              formats.append({                  'url': json.get('streamurl'),                  'ext': ext, -                'format': video_format, -                'format_id': video_format, +                'format_id': '%sp' % quality if quality else 'sd',              })          return {              'id': video_id,              'title': title, -            'formats': formats,              'description': description,              'thumbnail': thumbnail, +            'uploader_id': uploader, +            'timestamp': timestamp,              'duration': duration,              'view_count': int_or_none(view_count), -            'timestamp': timestamp, -            'age_limit': 18 if family_friendly == 'False' else 0, -            'uploader_id': uploader,              'comment_count': int_or_none(comment_count), +            'age_limit': 18 if family_friendly == 'False' else 0, +            'formats': formats,          } diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index c0c2d9b09..281a0ce40 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -111,17 +111,28 @@ class LivestreamIE(InfoExtractor):          event_name = mobj.group('event_name')          webpage = self._download_webpage(url, video_id or event_name) -        if video_id is None: -            # This is an event page: +        og_video = self._og_search_video_url(webpage, 'player url', fatal=False, default=None) +        if og_video is None:              config_json = self._search_regex(                  r'window.config = ({.*?});', webpage, 'window config')              info = json.loads(config_json)['event'] + +            def is_relevant(vdata, vid): +                result = vdata['type'] == 'video' +                if video_id is not None: +                    result = result and compat_str(vdata['data']['id']) == vid +                return result +              videos = [self._extract_video_info(video_data['data']) -                for video_data in info['feed']['data'] -                if video_data['type'] == 'video'] -            return self.playlist_result(videos, info['id'], info['full_name']) +                      for video_data in info['feed']['data'] +                      if is_relevant(video_data, video_id)] +            if video_id is None: +                # This is an event page: +                return self.playlist_result(videos, info['id'], info['full_name']) +            else: +                if videos: +                    return videos[0]          else: -            og_video = self._og_search_video_url(webpage, 'player url')              query_str = compat_urllib_parse_urlparse(og_video).query              query = compat_urlparse.parse_qs(query_str)              api_url = query['play_url'][0].replace('.smil', '') diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py new file mode 100644 index 000000000..979f3d692 --- /dev/null +++ b/youtube_dl/extractor/mitele.py @@ -0,0 +1,60 @@ +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..utils import ( +    compat_urllib_parse, +    get_element_by_attribute, +    parse_duration, +    strip_jsonp, +) + + +class MiTeleIE(InfoExtractor): +    IE_NAME = 'mitele.es' +    _VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<episode>[^/]+)/' + +    _TEST = { +        'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', +        'md5': '6a75fe9d0d3275bead0cb683c616fddb', +        'info_dict': { +            'id': '0fce117d', +            'ext': 'mp4', +            'title': 'Programa 144 - Tor, la web invisible', +            'description': 'md5:3b6fce7eaa41b2d97358726378d9369f', +            'display_id': 'programa-144', +            'duration': 2913, +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        episode = mobj.group('episode') +        webpage = self._download_webpage(url, episode) +        embed_data_json = self._search_regex( +            r'MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data', +            flags=re.DOTALL +        ).replace('\'', '"') +        embed_data = json.loads(embed_data_json) + +        info_url = embed_data['flashvars']['host'] +        info_el = self._download_xml(info_url, episode).find('./video/info') + +        video_link = info_el.find('videoUrl/link').text +        token_query = compat_urllib_parse.urlencode({'id': video_link}) +        token_info = self._download_json( +            'http://token.mitele.es/?' + token_query, episode, +            transform_source=strip_jsonp +        ) + +        return { +            'id': embed_data['videoId'], +            'display_id': episode, +            'title': info_el.find('title').text, +            'url': token_info['tokenizedUrl'], +            'description': get_element_by_attribute('class', 'text', webpage), +            'thumbnail': info_el.find('thumb').text, +            'duration': parse_duration(info_el.find('duration').text), +        } diff --git a/youtube_dl/extractor/mojvideo.py b/youtube_dl/extractor/mojvideo.py new file mode 100644 index 000000000..90b460d65 --- /dev/null +++ b/youtube_dl/extractor/mojvideo.py @@ -0,0 +1,58 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    parse_duration, +) + + +class MojvideoIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?mojvideo\.com/video-(?P<display_id>[^/]+)/(?P<id>[a-f0-9]+)' +    _TEST = { +        'url': 'http://www.mojvideo.com/video-v-avtu-pred-mano-rdecelaska-alfi-nipic/3d1ed4497707730b2906', +        'md5': 'f7fd662cc8ce2be107b0d4f2c0483ae7', +        'info_dict': { +            'id': '3d1ed4497707730b2906', +            'display_id': 'v-avtu-pred-mano-rdecelaska-alfi-nipic', +            'ext': 'mp4', +            'title': 'V avtu pred mano rdečelaska - Alfi Nipič', +            'thumbnail': 're:^http://.*\.jpg$', +            'duration': 242, +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        display_id = mobj.group('display_id') + +        # XML is malformed +        playerapi = self._download_webpage( +            'http://www.mojvideo.com/playerapi.php?v=%s&t=1' % video_id, display_id) + +        if '<error>true</error>' in playerapi: +            error_desc = self._html_search_regex( +                r'<errordesc>([^<]*)</errordesc>', playerapi, 'error description', fatal=False) +            raise ExtractorError('%s said: %s' % (self.IE_NAME, error_desc), expected=True) + +        title = self._html_search_regex( +            r'<title>([^<]+)</title>', playerapi, 'title') +        video_url = self._html_search_regex( +            r'<file>([^<]+)</file>', playerapi, 'video URL') +        thumbnail = self._html_search_regex( +            r'<preview>([^<]+)</preview>', playerapi, 'thumbnail', fatal=False) +        duration = parse_duration(self._html_search_regex( +            r'<duration>([^<]+)</duration>', playerapi, 'duration', fatal=False)) + +        return { +            'id': video_id, +            'display_id': display_id, +            'url': video_url, +            'title': title, +            'thumbnail': thumbnail, +            'duration': duration, +        }
\ No newline at end of file diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py index 1c5e9401f..6b2f3f55a 100644 --- a/youtube_dl/extractor/nowness.py +++ b/youtube_dl/extractor/nowness.py @@ -1,3 +1,4 @@ +# encoding: utf-8  from __future__ import unicode_literals  import re @@ -8,19 +9,34 @@ from ..utils import ExtractorError  class NownessIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?nowness\.com/[^?#]*?/(?P<id>[0-9]+)/(?P<slug>[^/]+?)(?:$|[?#])' - -    _TEST = { -        'url': 'http://www.nowness.com/day/2013/6/27/3131/candor--the-art-of-gesticulation', -        'md5': '068bc0202558c2e391924cb8cc470676', -        'info_dict': { -            'id': '2520295746001', -            'ext': 'mp4', -            'description': 'Candor: The Art of Gesticulation', -            'uploader': 'Nowness', -            'title': 'Candor: The Art of Gesticulation', -        } -    } +    _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/[^?#]*?/(?P<id>[0-9]+)/(?P<slug>[^/]+?)(?:$|[?#])' + +    _TESTS = [ +        { +            'url': 'http://www.nowness.com/day/2013/6/27/3131/candor--the-art-of-gesticulation', +            'md5': '068bc0202558c2e391924cb8cc470676', +            'info_dict': { +                'id': '2520295746001', +                'ext': 'mp4', +                'title': 'Candor: The Art of Gesticulation', +                'description': 'Candor: The Art of Gesticulation', +                'thumbnail': 're:^https?://.*\.jpg', +                'uploader': 'Nowness', +            } +        }, +        { +            'url': 'http://cn.nowness.com/day/2014/8/7/4069/kasper-bj-rke-ft-jaakko-eino-kalevi--tnr', +            'md5': 'e79cf125e387216f86b2e0a5b5c63aa3', +            'info_dict': { +                'id': '3716354522001', +                'ext': 'mp4', +                'title': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR', +                'description': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR', +                'thumbnail': 're:^https?://.*\.jpg', +                'uploader': 'Nowness', +            } +        }, +    ]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/oe1.py b/youtube_dl/extractor/oe1.py deleted file mode 100644 index 38971ab4d..000000000 --- a/youtube_dl/extractor/oe1.py +++ /dev/null @@ -1,40 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import calendar -import datetime -import re - -from .common import InfoExtractor - -# audios on oe1.orf.at are only available for 7 days, so we can't -# add tests. - - -class OE1IE(InfoExtractor): -    IE_DESC = 'oe1.orf.at' -    _VALID_URL = r'http://oe1\.orf\.at/programm/(?P<id>[0-9]+)' - -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        show_id = mobj.group('id') - -        data = self._download_json( -            'http://oe1.orf.at/programm/%s/konsole' % show_id, -            show_id -        ) - -        timestamp = datetime.datetime.strptime('%s %s' % ( -            data['item']['day_label'], -            data['item']['time'] -        ), '%d.%m.%Y %H:%M') -        unix_timestamp = calendar.timegm(timestamp.utctimetuple()) - -        return { -            'id': show_id, -            'title': data['item']['title'], -            'url': data['item']['url_stream'], -            'ext': 'mp3', -            'description': data['item'].get('info'), -            'timestamp': unix_timestamp -        } diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 13f12824c..2044e107e 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -3,23 +3,38 @@ import re  import json  from .common import InfoExtractor -from ..utils import unescapeHTML +from ..utils import ( +    unescapeHTML, +    ExtractorError, +)  class OoyalaIE(InfoExtractor):      _VALID_URL = r'(?:ooyala:|https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=)(?P<id>.+?)(&|$)' -    _TEST = { -        # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video -        'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', -        'md5': '3f5cceb3a7bf461d6c29dc466cf8033c', -        'info_dict': { -            'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', -            'ext': 'mp4', -            'title': 'Explaining Data Recovery from Hard Drives and SSDs', -            'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.', +    _TESTS = [ +        { +            # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video +            'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', +            'md5': '3f5cceb3a7bf461d6c29dc466cf8033c', +            'info_dict': { +                'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', +                'ext': 'mp4', +                'title': 'Explaining Data Recovery from Hard Drives and SSDs', +                'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.', +            }, +        }, { +            # Only available for ipad +            'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0', +            'md5': '4b9754921fddb68106e48c142e2a01e6', +            'info_dict': { +                'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0', +                'ext': 'mp4', +                'title': 'Simulation Overview - Levels of Simulation', +                'description': '', +            },          }, -    } +    ]      @staticmethod      def _url_for_embed_code(embed_code): @@ -47,13 +62,30 @@ class OoyalaIE(InfoExtractor):          player = self._download_webpage(player_url, embedCode)          mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="',                                          player, 'mobile player url') -        mobile_player = self._download_webpage(mobile_url, embedCode) -        videos_info = self._search_regex( -            r'var streams=window.oo_testEnv\?\[\]:eval\("\((\[{.*?}\])\)"\);', -            mobile_player, 'info').replace('\\"','"') -        videos_more_info = self._search_regex(r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, 'more info').replace('\\"','"') +        # Looks like some videos are only available for particular devices +        # (e.g. http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0 +        # is only available for ipad) +        # Working around with fetching URLs for all the devices found starting with 'unknown' +        # until we succeed or eventually fail for each device. +        devices = re.findall(r'device\s*=\s*"([^"]+)";', player) +        devices.remove('unknown') +        devices.insert(0, 'unknown') +        for device in devices: +            mobile_player = self._download_webpage( +                '%s&device=%s' % (mobile_url, device), embedCode, +                'Downloading mobile player JS for %s device' % device) +            videos_info = self._search_regex( +                r'var streams=window.oo_testEnv\?\[\]:eval\("\((\[{.*?}\])\)"\);', +                mobile_player, 'info', fatal=False, default=None) +            if videos_info: +                break +        if not videos_info: +            raise ExtractorError('Unable to extract info') +        videos_info = videos_info.replace('\\"', '"') +        videos_more_info = self._search_regex( +            r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, 'more info').replace('\\"', '"')          videos_info = json.loads(videos_info) -        videos_more_info =json.loads(videos_more_info) +        videos_more_info = json.loads(videos_more_info)          if videos_more_info.get('lineup'):              videos = [self._extract_result(info, more_info) for (info, more_info) in zip(videos_info, videos_more_info['lineup'])] diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 03421d1d5..011e6be13 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals  import json  import re +import calendar +import datetime  from .common import InfoExtractor  from ..utils import ( @@ -12,7 +14,9 @@ from ..utils import (  ) -class ORFIE(InfoExtractor): +class ORFTVthekIE(InfoExtractor): +    IE_NAME = 'orf:tvthek' +    IE_DESC = 'ORF TVthek'      _VALID_URL = r'https?://tvthek\.orf\.at/(?:programs/.+?/episodes|topics/.+?|program/[^/]+)/(?P<id>\d+)'      _TEST = { @@ -105,3 +109,73 @@ class ORFIE(InfoExtractor):              'entries': entries,              'id': playlist_id,          } + + +# Audios on ORF radio are only available for 7 days, so we can't add tests. + + +class ORFOE1IE(InfoExtractor): +    IE_NAME = 'orf:oe1' +    IE_DESC = 'Radio Österreich 1' +    _VALID_URL = r'http://oe1\.orf\.at/programm/(?P<id>[0-9]+)' + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        show_id = mobj.group('id') + +        data = self._download_json( +            'http://oe1.orf.at/programm/%s/konsole' % show_id, +            show_id +        ) + +        timestamp = datetime.datetime.strptime('%s %s' % ( +            data['item']['day_label'], +            data['item']['time'] +        ), '%d.%m.%Y %H:%M') +        unix_timestamp = calendar.timegm(timestamp.utctimetuple()) + +        return { +            'id': show_id, +            'title': data['item']['title'], +            'url': data['item']['url_stream'], +            'ext': 'mp3', +            'description': data['item'].get('info'), +            'timestamp': unix_timestamp +        } + + +class ORFFM4IE(InfoExtractor): +    IE_DESC = 'orf:fm4' +    IE_DESC = 'radio FM4' +    _VALID_URL = r'http://fm4\.orf\.at/7tage/?#(?P<date>[0-9]+)/(?P<show>\w+)' + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        show_date = mobj.group('date') +        show_id = mobj.group('show') + +        data = self._download_json( +            'http://audioapi.orf.at/fm4/json/2.0/broadcasts/%s/4%s' % (show_date, show_id), +            show_id +        ) + +        def extract_entry_dict(info, title, subtitle): +            return { +                'id': info['loopStreamId'].replace('.mp3', ''), +                'url': 'http://loopstream01.apa.at/?channel=fm4&id=%s' % info['loopStreamId'], +                'title': title, +                'description': subtitle, +                'duration': (info['end'] - info['start']) / 1000, +                'timestamp': info['start'] / 1000, +                'ext': 'mp3' +            } + +        entries = [extract_entry_dict(t, data['title'], data['subtitle']) for t in data['streams']] + +        return { +            '_type': 'playlist', +            'id': show_id, +            'title': data['title'], +            'description': data['subtitle'], +            'entries': entries +        }
\ No newline at end of file diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 64cded707..dee4af6f1 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -20,25 +20,60 @@ class PBSIE(InfoExtractor):          )      ''' -    _TEST = { -        'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/', -        'md5': 'ce1888486f0908d555a8093cac9a7362', -        'info_dict': { -            'id': '2365006249', -            'ext': 'mp4', -            'title': 'A More Perfect Union', -            'description': 'md5:ba0c207295339c8d6eced00b7c363c6a', -            'duration': 3190, +    _TESTS = [ +        { +            'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/', +            'md5': 'ce1888486f0908d555a8093cac9a7362', +            'info_dict': { +                'id': '2365006249', +                'ext': 'mp4', +                'title': 'A More Perfect Union', +                'description': 'md5:ba0c207295339c8d6eced00b7c363c6a', +                'duration': 3190, +            }, +        }, +        { +            'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/', +            'md5': '143c98aa54a346738a3d78f54c925321', +            'info_dict': { +                'id': '2365297690', +                'ext': 'mp4', +                'title': 'Losing Iraq', +                'description': 'md5:f5bfbefadf421e8bb8647602011caf8e', +                'duration': 5050, +            }, +        }, +        { +            'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/', +            'md5': 'b19856d7f5351b17a5ab1dc6a64be633', +            'info_dict': { +                'id': '2201174722', +                'ext': 'mp4', +                'title': 'Cyber Schools Gain Popularity, but Quality Questions Persist', +                'description': 'md5:5871c15cba347c1b3d28ac47a73c7c28', +                'duration': 801, +            },          }, -    } +    ] -    def _real_extract(self, url): +    def _extract_ids(self, url):          mobj = re.match(self._VALID_URL, url)          presumptive_id = mobj.group('presumptive_id')          display_id = presumptive_id          if presumptive_id:              webpage = self._download_webpage(url, display_id) + +            MEDIA_ID_REGEXES = [ +                r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'",  # frontline video embed +                r'class="coveplayerid">([^<]+)<',                       # coveplayer +            ] + +            media_id = self._search_regex( +                MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None) +            if media_id: +                return media_id, presumptive_id +              url = self._search_regex(                  r'<iframe\s+id=["\']partnerPlayer["\'].*?\s+src=["\'](.*?)["\']>',                  webpage, 'player URL') @@ -57,6 +92,11 @@ class PBSIE(InfoExtractor):              video_id = mobj.group('id')              display_id = video_id +        return video_id, display_id + +    def _real_extract(self, url): +        video_id, display_id = self._extract_ids(url) +          info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id          info = self._download_json(info_url, display_id) diff --git a/youtube_dl/extractor/reverbnation.py b/youtube_dl/extractor/reverbnation.py index 49cf427a1..ec7e7df7b 100644 --- a/youtube_dl/extractor/reverbnation.py +++ b/youtube_dl/extractor/reverbnation.py @@ -1,23 +1,23 @@  from __future__ import unicode_literals  import re -import time  from .common import InfoExtractor -from ..utils import strip_jsonp +from ..utils import str_or_none  class ReverbNationIE(InfoExtractor):      _VALID_URL = r'^https?://(?:www\.)?reverbnation\.com/.*?/song/(?P<id>\d+).*?$'      _TESTS = [{          'url': 'http://www.reverbnation.com/alkilados/song/16965047-mona-lisa', -        'file': '16965047.mp3',          'md5': '3da12ebca28c67c111a7f8b262d3f7a7',          'info_dict': { +            "id": "16965047", +            "ext": "mp3",              "title": "MONA LISA",              "uploader": "ALKILADOS", -            "uploader_id": 216429, -            "thumbnail": "//gp1.wac.edgecastcdn.net/802892/production_public/Photo/13761700/image/1366002176_AVATAR_MONA_LISA.jpg" +            "uploader_id": "216429", +            "thumbnail": "re:^https://gp1\.wac\.edgecastcdn\.net/.*?\.jpg$"          },      }] @@ -26,10 +26,8 @@ class ReverbNationIE(InfoExtractor):          song_id = mobj.group('id')          api_res = self._download_json( -            'https://api.reverbnation.com/song/%s?callback=api_response_5&_=%d' -                % (song_id, int(time.time() * 1000)), +            'https://api.reverbnation.com/song/%s' % song_id,              song_id, -            transform_source=strip_jsonp,              note='Downloading information of song %s' % song_id          ) @@ -38,8 +36,9 @@ class ReverbNationIE(InfoExtractor):              'title': api_res.get('name'),              'url': api_res.get('url'),              'uploader': api_res.get('artist', {}).get('name'), -            'uploader_id': api_res.get('artist', {}).get('id'), -            'thumbnail': api_res.get('image', api_res.get('thumbnail')), +            'uploader_id': str_or_none(api_res.get('artist', {}).get('id')), +            'thumbnail': self._proto_relative_url( +                api_res.get('image', api_res.get('thumbnail'))),              'ext': 'mp3',              'vcodec': 'none',          } diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py new file mode 100644 index 000000000..14928cd62 --- /dev/null +++ b/youtube_dl/extractor/rtlnl.py @@ -0,0 +1,52 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class RtlXlIE(InfoExtractor): +    IE_NAME = 'rtlxl.nl' +    _VALID_URL = r'https?://www\.rtlxl\.nl/#!/[^/]+/(?P<uuid>[^/?]+)' + +    _TEST = { +        'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677', +        'info_dict': { +            'id': '6e4203a6-0a5e-3596-8424-c599a59e0677', +            'ext': 'flv', +            'title': 'RTL Nieuws - Laat', +            'description': 'Dagelijks het laatste nieuws uit binnen- en ' +                'buitenland. Voor nog meer nieuws kunt u ook gebruikmaken van ' +                'onze mobiele apps.', +            'timestamp': 1408051800, +            'upload_date': '20140814', +        }, +        'params': { +            # We download the first bytes of the first fragment, it can't be +            # processed by the f4m downloader beacuse it isn't complete +            'skip_download': True, +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        uuid = mobj.group('uuid') + +        info = self._download_json( +            'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=flash/' % uuid, +            uuid) +        meta = info['meta'] +        material = info['material'][0] +        episode_info = info['episodes'][0] + +        f4m_url = 'http://manifest.us.rtl.nl' + material['videopath'] +        progname = info['abstracts'][0]['name'] +        subtitle = material['title'] or info['episodes'][0]['name'] + +        return { +            'id': uuid, +            'title': '%s - %s' % (progname, subtitle),  +            'formats': self._extract_f4m_formats(f4m_url, uuid), +            'timestamp': material['original_date'], +            'description': episode_info['synopsis'], +        } diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 8607482be..badba2ac6 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -17,11 +17,11 @@ class SharedIE(InfoExtractor):      _TEST = {          'url': 'http://shared.sx/0060718775', -        'md5': '53e1c58fc3e777ae1dfe9e57ba2f9c72', +        'md5': '106fefed92a8a2adb8c98e6a0652f49b',          'info_dict': {              'id': '0060718775',              'ext': 'mp4', -            'title': 'Big Buck Bunny Trailer', +            'title': 'Bmp4',          },      } diff --git a/youtube_dl/extractor/streamcloud.py b/youtube_dl/extractor/streamcloud.py index 9faf3a5e3..172def221 100644 --- a/youtube_dl/extractor/streamcloud.py +++ b/youtube_dl/extractor/streamcloud.py @@ -1,4 +1,6 @@  # coding: utf-8 +from __future__ import unicode_literals +  import re  import time @@ -10,18 +12,18 @@ from ..utils import (  class StreamcloudIE(InfoExtractor): -    IE_NAME = u'streamcloud.eu' +    IE_NAME = 'streamcloud.eu'      _VALID_URL = r'https?://streamcloud\.eu/(?P<id>[a-zA-Z0-9_-]+)/(?P<fname>[^#?]*)\.html'      _TEST = { -        u'url': u'http://streamcloud.eu/skp9j99s4bpz/youtube-dl_test_video_____________-BaW_jenozKc.mp4.html', -        u'file': u'skp9j99s4bpz.mp4', -        u'md5': u'6bea4c7fa5daaacc2a946b7146286686', -        u'info_dict': { -            u'title': u'youtube-dl test video  \'/\\ ä ↭', -            u'duration': 9, +        'url': 'http://streamcloud.eu/skp9j99s4bpz/youtube-dl_test_video_____________-BaW_jenozKc.mp4.html', +        'md5': '6bea4c7fa5daaacc2a946b7146286686', +        'info_dict': { +            'id': 'skp9j99s4bpz', +            'ext': 'mp4', +            'title': 'youtube-dl test video  \'/\\ ä ↭',          }, -        u'skip': u'Only available from the EU' +        'skip': 'Only available from the EU'      }      def _real_extract(self, url): @@ -46,21 +48,17 @@ class StreamcloudIE(InfoExtractor):          req = compat_urllib_request.Request(url, post, headers)          webpage = self._download_webpage( -            req, video_id, note=u'Downloading video page ...') +            req, video_id, note='Downloading video page ...')          title = self._html_search_regex( -            r'<h1[^>]*>([^<]+)<', webpage, u'title') +            r'<h1[^>]*>([^<]+)<', webpage, 'title')          video_url = self._search_regex( -            r'file:\s*"([^"]+)"', webpage, u'video URL') -        duration_str = self._search_regex( -            r'duration:\s*"?([0-9]+)"?', webpage, u'duration', fatal=False) -        duration = None if duration_str is None else int(duration_str) +            r'file:\s*"([^"]+)"', webpage, 'video URL')          thumbnail = self._search_regex( -            r'image:\s*"([^"]+)"', webpage, u'thumbnail URL', fatal=False) +            r'image:\s*"([^"]+)"', webpage, 'thumbnail URL', fatal=False)          return {              'id': video_id,              'title': title,              'url': video_url, -            'duration': duration,              'thumbnail': thumbnail,          } diff --git a/youtube_dl/extractor/swrmediathek.py b/youtube_dl/extractor/swrmediathek.py index 6c688c520..5d9d70367 100644 --- a/youtube_dl/extractor/swrmediathek.py +++ b/youtube_dl/extractor/swrmediathek.py @@ -8,7 +8,7 @@ from ..utils import parse_duration  class SWRMediathekIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?swrmediathek\.de/player\.htm\?show=(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' +    _VALID_URL = r'https?://(?:www\.)?swrmediathek\.de/(?:content/)?player\.htm\?show=(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'      _TESTS = [{          'url': 'http://swrmediathek.de/player.htm?show=849790d0-dab8-11e3-a953-0026b975f2e6', @@ -52,6 +52,20 @@ class SWRMediathekIE(InfoExtractor):              'uploader': 'SWR 2',              'uploader_id': '284670',          } +    }, { +        'url': 'http://swrmediathek.de/content/player.htm?show=52dc7e00-15c5-11e4-84bc-0026b975f2e6', +        'md5': '881531487d0633080a8cc88d31ef896f', +        'info_dict': { +            'id': '52dc7e00-15c5-11e4-84bc-0026b975f2e6', +            'ext': 'mp4', +            'title': 'Familienspaß am Bodensee', +            'description': 'md5:0b591225a32cfde7be1629ed49fe4315', +            'thumbnail': 're:http://.*\.jpg', +            'duration': 1784, +            'upload_date': '20140727', +            'uploader': 'SWR Fernsehen BW', +            'uploader_id': '281130', +        }      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/ubu.py b/youtube_dl/extractor/ubu.py new file mode 100644 index 000000000..0182d67ec --- /dev/null +++ b/youtube_dl/extractor/ubu.py @@ -0,0 +1,56 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class UbuIE(InfoExtractor): +    _VALID_URL = r'http://(?:www\.)?ubu\.com/film/(?P<id>[\da-z_-]+)\.html' +    _TEST = { +        'url': 'http://ubu.com/film/her_noise.html', +        'md5': '8edd46ee8aa6b265fb5ed6cf05c36bc9', +        'info_dict': { +            'id': 'her_noise', +            'ext': 'mp4', +            'title': 'Her Noise - The Making Of (2007)', +            'duration': 3600, +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) + +        title = self._html_search_regex( +            r'<title>.+?Film & Video: ([^<]+)</title>', webpage, 'title') + +        duration = int_or_none(self._html_search_regex( +            r'Duration: (\d+) minutes', webpage, 'duration', fatal=False, default=None)) +        if duration: +            duration *= 60 + +        formats = [] + +        FORMAT_REGEXES = [ +            ['sq', r"'flashvars'\s*,\s*'file=([^']+)'"], +            ['hq', r'href="(http://ubumexico\.centro\.org\.mx/video/[^"]+)"'] +        ] + +        for format_id, format_regex in FORMAT_REGEXES: +            m = re.search(format_regex, webpage) +            if m: +                formats.append({ +                    'url': m.group(1), +                    'format_id': format_id, +                }) + +        return { +            'id': video_id, +            'title': title, +            'duration': duration, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index eada13ce9..d2ffd1b6b 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -177,6 +177,7 @@ class VevoIE(InfoExtractor):              self._downloader.report_warning(                  'Cannot download SMIL information, falling back to JSON ..') +        self._sort_formats(formats)          timestamp_ms = int(self._search_regex(              r'/Date\((\d+)\)/', video_info['launchDate'], 'launch date')) diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py new file mode 100644 index 000000000..5c89824c1 --- /dev/null +++ b/youtube_dl/extractor/vidme.py @@ -0,0 +1,68 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    int_or_none, +    float_or_none, +    str_to_int, +) + + +class VidmeIE(InfoExtractor): +    _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]+)' +    _TEST = { +        'url': 'https://vid.me/QNB', +        'md5': 'f42d05e7149aeaec5c037b17e5d3dc82', +        'info_dict': { +            'id': 'QNB', +            'ext': 'mp4', +            'title': 'Fishing for piranha - the easy way', +            'description': 'source: https://www.facebook.com/photo.php?v=312276045600871', +            'duration': 119.92, +            'timestamp': 1406313244, +            'upload_date': '20140725', +            'thumbnail': 're:^https?://.*\.jpg', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) + +        video_url = self._html_search_regex(r'<source src="([^"]+)"', webpage, 'video URL') + +        title = self._og_search_title(webpage) +        description = self._og_search_description(webpage, default='') +        thumbnail = self._og_search_thumbnail(webpage) +        timestamp = int_or_none(self._og_search_property('updated_time', webpage, fatal=False)) +        width = int_or_none(self._og_search_property('video:width', webpage, fatal=False)) +        height = int_or_none(self._og_search_property('video:height', webpage, fatal=False)) +        duration = float_or_none(self._html_search_regex( +            r'data-duration="([^"]+)"', webpage, 'duration', fatal=False)) +        view_count = str_to_int(self._html_search_regex( +            r'<span class="video_views">\s*([\d,\.]+)\s*plays?', webpage, 'view count', fatal=False)) +        like_count = str_to_int(self._html_search_regex( +            r'class="score js-video-vote-score"[^>]+data-score="([\d,\.\s]+)">', +            webpage, 'like count', fatal=False)) +        comment_count = str_to_int(self._html_search_regex( +            r'class="js-comment-count"[^>]+data-count="([\d,\.\s]+)">', +            webpage, 'comment count', fatal=False)) + +        return { +            'id': video_id, +            'url': video_url, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'timestamp': timestamp, +            'width': width, +            'height': height, +            'duration': duration, +            'view_count': view_count, +            'like_count': like_count, +            'comment_count': comment_count, +        } diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index a3c6e83b0..11c7d7e81 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -122,6 +122,21 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):              },          },          { +            'url': 'http://vimeo.com/channels/keypeele/75629013', +            'md5': '2f86a05afe9d7abc0b9126d229bbe15d', +            'note': 'Video is freely available via original URL ' +                    'and protected with password when accessed via http://vimeo.com/75629013', +            'info_dict': { +                'id': '75629013', +                'ext': 'mp4', +                'title': 'Key & Peele: Terrorist Interrogation', +                'description': 'md5:8678b246399b070816b12313e8b4eb5c', +                'uploader_id': 'atencio', +                'uploader': 'Peter Atencio', +                'duration': 187, +            }, +        }, +        {              'url': 'http://vimeo.com/76979871',              'md5': '3363dd6ffebe3784d56f4132317fd446',              'note': 'Video with subtitles', @@ -196,8 +211,6 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):          video_id = mobj.group('id')          if mobj.group('pro') or mobj.group('player'):              url = 'http://player.vimeo.com/video/' + video_id -        else: -            url = 'https://vimeo.com/' + video_id          # Retrieve video webpage to extract further information          request = compat_urllib_request.Request(url, None, headers) @@ -263,7 +276,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):          if video_thumbnail is None:              video_thumbs = config["video"].get("thumbs")              if video_thumbs and isinstance(video_thumbs, dict): -                _, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in video_thumbs.items())[-1] +                _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1]          # Extract video description          video_description = None diff --git a/youtube_dl/extractor/vube.py b/youtube_dl/extractor/vube.py index f1b9e9a19..2544c24bd 100644 --- a/youtube_dl/extractor/vube.py +++ b/youtube_dl/extractor/vube.py @@ -1,10 +1,12 @@  from __future__ import unicode_literals -import json  import re  from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( +    int_or_none, +    compat_str, +)  class VubeIE(InfoExtractor): @@ -29,6 +31,7 @@ class VubeIE(InfoExtractor):                  'like_count': int,                  'dislike_count': int,                  'comment_count': int, +                'categories': ['pop', 'music', 'cover', 'singing', 'jessie j', 'price tag', 'chiara grispo'],              }          },          { @@ -47,6 +50,7 @@ class VubeIE(InfoExtractor):                  'like_count': int,                  'dislike_count': int,                  'comment_count': int, +                'categories': ['seraina', 'jessica', 'krewella', 'alive'],              }          }, {              'url': 'http://vube.com/vote/Siren+Gene/0nmsMY5vEq?n=2&t=s', @@ -56,13 +60,15 @@ class VubeIE(InfoExtractor):                  'ext': 'mp4',                  'title': 'Frozen - Let It Go Cover by Siren Gene',                  'description': 'My rendition of "Let It Go" originally sung by Idina Menzel.', -                'uploader': 'Siren Gene', -                'uploader_id': 'Siren',                  'thumbnail': 're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/10283ab622a-86c9-4681-51f2-30d1f65774af\.jpg$', +                'uploader': 'Siren', +                'timestamp': 1395448018, +                'upload_date': '20140322',                  'duration': 221.788,                  'like_count': int,                  'dislike_count': int,                  'comment_count': int, +                'categories': ['let it go', 'cover', 'idina menzel', 'frozen', 'singing', 'disney', 'siren gene'],              }          }      ] @@ -71,47 +77,40 @@ class VubeIE(InfoExtractor):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id') -        webpage = self._download_webpage(url, video_id) -        data_json = self._search_regex( -            r'(?s)window\["(?:tapiVideoData|vubeOriginalVideoData)"\]\s*=\s*(\{.*?\n});\n', -            webpage, 'video data' -        ) -        data = json.loads(data_json) -        video = ( -            data.get('video') or -            data) -        assert isinstance(video, dict) +        video = self._download_json( +            'http://vube.com/t-api/v1/video/%s' % video_id, video_id, 'Downloading video JSON')          public_id = video['public_id'] -        formats = [ -            { -                'url': 'http://video.thestaticvube.com/video/%s/%s.mp4' % (fmt['media_resolution_id'], public_id), -                'height': int(fmt['height']), -                'abr': int(fmt['audio_bitrate']), -                'vbr': int(fmt['video_bitrate']), -                'format_id': fmt['media_resolution_id'] -            } for fmt in video['mtm'] if fmt['transcoding_status'] == 'processed' -        ] +        formats = [] + +        for media in video['media'].get('video', []) + video['media'].get('audio', []): +            if media['transcoding_status'] != 'processed': +                continue +            fmt = { +                'url': 'http://video.thestaticvube.com/video/%s/%s.mp4' % (media['media_resolution_id'], public_id), +                'abr': int(media['audio_bitrate']), +                'format_id': compat_str(media['media_resolution_id']), +            } +            vbr = int(media['video_bitrate']) +            if vbr: +                fmt.update({ +                    'vbr': vbr, +                    'height': int(media['height']), +                }) +            formats.append(fmt)          self._sort_formats(formats)          title = video['title']          description = video.get('description') -        thumbnail = self._proto_relative_url( -            video.get('thumbnail') or video.get('thumbnail_src'), -            scheme='http:') -        uploader = data.get('user', {}).get('channel', {}).get('name') or video.get('user_alias') -        uploader_id = data.get('user', {}).get('name') +        thumbnail = self._proto_relative_url(video.get('thumbnail_src'), scheme='http:') +        uploader = video.get('user_alias') or video.get('channel')          timestamp = int_or_none(video.get('upload_time'))          duration = video['duration']          view_count = video.get('raw_view_count') -        like_count = video.get('rlikes') -        if like_count is None: -            like_count = video.get('total_likes') -        dislike_count = video.get('rhates') -        if dislike_count is None: -            dislike_count = video.get('total_hates') +        like_count = video.get('total_likes') +        dislike_count = video.get('total_hates')          comments = video.get('comments')          comment_count = None @@ -124,6 +123,8 @@ class VubeIE(InfoExtractor):          else:              comment_count = len(comments) +        categories = [tag['text'] for tag in video['tags']] +          return {              'id': video_id,              'formats': formats, @@ -131,11 +132,11 @@ class VubeIE(InfoExtractor):              'description': description,              'thumbnail': thumbnail,              'uploader': uploader, -            'uploader_id': uploader_id,              'timestamp': timestamp,              'duration': duration,              'view_count': view_count,              'like_count': like_count,              'dislike_count': dislike_count,              'comment_count': comment_count, +            'categories': categories,          } diff --git a/youtube_dl/extractor/xboxclips.py b/youtube_dl/extractor/xboxclips.py new file mode 100644 index 000000000..a9aa72e73 --- /dev/null +++ b/youtube_dl/extractor/xboxclips.py @@ -0,0 +1,57 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    parse_iso8601, +    float_or_none, +    int_or_none, +) + + +class XboxClipsIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?xboxclips\.com/video\.php\?.*vid=(?P<id>[\w-]{36})' +    _TEST = { +        'url': 'https://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325', +        'md5': 'fbe1ec805e920aeb8eced3c3e657df5d', +        'info_dict': { +            'id': '074a69a9-5faf-46aa-b93b-9909c1720325', +            'ext': 'mp4', +            'title': 'Iabdulelah playing Upload Studio', +            'filesize_approx': 28101836.8, +            'timestamp': 1407388500, +            'upload_date': '20140807', +            'duration': 56, +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) + +        video_url = self._html_search_regex( +            r'>Link: <a href="([^"]+)">', webpage, 'video URL') +        title = self._html_search_regex( +            r'<title>XboxClips \| ([^<]+)</title>', webpage, 'title') +        timestamp = parse_iso8601(self._html_search_regex( +            r'>Recorded: ([^<]+)<', webpage, 'upload date', fatal=False)) +        filesize = float_or_none(self._html_search_regex( +            r'>Size: ([\d\.]+)MB<', webpage, 'file size', fatal=False), invscale=1024 * 1024) +        duration = int_or_none(self._html_search_regex( +            r'>Duration: (\d+) Seconds<', webpage, 'duration', fatal=False)) +        view_count = int_or_none(self._html_search_regex( +            r'>Views: (\d+)<', webpage, 'view count', fatal=False)) + +        return { +            'id': video_id, +            'url': video_url, +            'title': title, +            'timestamp': timestamp, +            'filesize_approx': filesize, +            'duration': duration, +            'view_count': view_count, +        } diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index d84be2562..0e3b33b16 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -15,7 +15,7 @@ from ..utils import (  class YahooIE(InfoExtractor):      IE_DESC = 'Yahoo screen and movies' -    _VALID_URL = r'https?://(?:screen|movies)\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html' +    _VALID_URL = r'(?P<url>https?://(?:screen|movies)\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)'      _TESTS = [          {              'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', @@ -46,12 +46,23 @@ class YahooIE(InfoExtractor):                  'title': 'The World Loves Spider-Man',                  'description': '''People all over the world are celebrating the release of \"The Amazing Spider-Man 2.\" We're taking a look at the enthusiastic response Spider-Man has received from viewers all over the world.''',              } -        } +        }, +        { +            'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed', +            'md5': '60e8ac193d8fb71997caa8fce54c6460', +            'info_dict': { +                'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb', +                'ext': 'mp4', +                'title': "Yahoo Saves 'Community'", +                'description': 'md5:4d4145af2fd3de00cbb6c1d664105053', +            } +        },      ]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id') +        url = mobj.group('url')          webpage = self._download_webpage(url, video_id)          items_json = self._search_regex( diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c48d1b8ef..3c9b0b584 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -344,7 +344,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          """Indicate the download will use the RTMP protocol."""          self.to_screen(u'RTMP download detected') -    def _extract_signature_function(self, video_id, player_url, slen): +    def _signature_cache_id(self, example_sig): +        """ Return a string representation of a signature """ +        return u'.'.join(compat_str(len(part)) for part in example_sig.split('.')) + +    def _extract_signature_function(self, video_id, player_url, example_sig):          id_m = re.match(              r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',              player_url) @@ -354,7 +358,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          player_id = id_m.group('id')          # Read from filesystem cache -        func_id = '%s_%s_%d' % (player_type, player_id, slen) +        func_id = '%s_%s_%s' % ( +            player_type, player_id, self._signature_cache_id(example_sig))          assert os.path.basename(func_id) == func_id          cache_dir = get_cachedir(self._downloader.params) @@ -388,7 +393,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          if cache_enabled:              try: -                test_string = u''.join(map(compat_chr, range(slen))) +                test_string = u''.join(map(compat_chr, range(len(example_sig))))                  cache_res = res(test_string)                  cache_spec = [ord(c) for c in cache_res]                  try: @@ -404,7 +409,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          return res -    def _print_sig_code(self, func, slen): +    def _print_sig_code(self, func, example_sig):          def gen_sig_code(idxs):              def _genslice(start, end, step):                  starts = u'' if start == 0 else str(start) @@ -433,11 +438,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):              else:                  yield _genslice(start, i, step) -        test_string = u''.join(map(compat_chr, range(slen))) +        test_string = u''.join(map(compat_chr, range(len(example_sig))))          cache_res = func(test_string)          cache_spec = [ord(c) for c in cache_res]          expr_code = u' + '.join(gen_sig_code(cache_spec)) -        code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code) +        signature_id_tuple = '(%s)' % ( +            ', '.join(compat_str(len(p)) for p in example_sig.split('.'))) +        code = (u'if tuple(len(p) for p in s.split(\'.\')) == %s:\n' +                u'    return %s\n') % (signature_id_tuple, expr_code)          self.to_screen(u'Extracted signature function:\n' + code)      def _parse_sig_js(self, jscode): @@ -465,20 +473,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          if player_url.startswith(u'//'):              player_url = u'https:' + player_url          try: -            player_id = (player_url, len(s)) +            player_id = (player_url, self._signature_cache_id(s))              if player_id not in self._player_cache:                  func = self._extract_signature_function( -                    video_id, player_url, len(s) +                    video_id, player_url, s                  )                  self._player_cache[player_id] = func              func = self._player_cache[player_id]              if self._downloader.params.get('youtube_print_sig_code'): -                self._print_sig_code(func, len(s)) +                self._print_sig_code(func, s)              return func(s)          except Exception as e:              tb = traceback.format_exc()              raise ExtractorError( -                u'Automatic signature extraction failed: ' + tb, cause=e) +                u'Signature extraction failed: ' + tb, cause=e)      def _get_available_subtitles(self, video_id, webpage):          try: @@ -806,51 +814,54 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):              url_map = {}              for url_data_str in encoded_url_map.split(','):                  url_data = compat_parse_qs(url_data_str) -                if 'itag' in url_data and 'url' in url_data: -                    url = url_data['url'][0] -                    if 'sig' in url_data: -                        url += '&signature=' + url_data['sig'][0] -                    elif 's' in url_data: -                        encrypted_sig = url_data['s'][0] - -                        if not age_gate: -                            jsplayer_url_json = self._search_regex( -                                r'"assets":.+?"js":\s*("[^"]+")', -                                video_webpage, u'JS player URL') -                            player_url = json.loads(jsplayer_url_json) +                if 'itag' not in url_data or 'url' not in url_data: +                    continue +                format_id = url_data['itag'][0] +                url = url_data['url'][0] + +                if 'sig' in url_data: +                    url += '&signature=' + url_data['sig'][0] +                elif 's' in url_data: +                    encrypted_sig = url_data['s'][0] + +                    if not age_gate: +                        jsplayer_url_json = self._search_regex( +                            r'"assets":.+?"js":\s*("[^"]+")', +                            video_webpage, u'JS player URL') +                        player_url = json.loads(jsplayer_url_json) +                    if player_url is None: +                        player_url_json = self._search_regex( +                            r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', +                            video_webpage, u'age gate player URL') +                        player_url = json.loads(player_url_json) + +                    if self._downloader.params.get('verbose'):                          if player_url is None: -                            player_url_json = self._search_regex( -                                r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', -                                video_webpage, u'age gate player URL') -                            player_url = json.loads(player_url_json) - -                        if self._downloader.params.get('verbose'): -                            if player_url is None: -                                player_version = 'unknown' -                                player_desc = 'unknown' +                            player_version = 'unknown' +                            player_desc = 'unknown' +                        else: +                            if player_url.endswith('swf'): +                                player_version = self._search_regex( +                                    r'-(.+?)(?:/watch_as3)?\.swf$', player_url, +                                    u'flash player', fatal=False) +                                player_desc = 'flash player %s' % player_version                              else: -                                if player_url.endswith('swf'): -                                    player_version = self._search_regex( -                                        r'-(.+?)(?:/watch_as3)?\.swf$', player_url, -                                        u'flash player', fatal=False) -                                    player_desc = 'flash player %s' % player_version -                                else: -                                    player_version = self._search_regex( -                                        r'html5player-([^/]+?)(?:/html5player)?\.js', -                                        player_url, -                                        'html5 player', fatal=False) -                                    player_desc = u'html5 player %s' % player_version - -                            parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.')) -                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' % -                                (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc)) - -                        signature = self._decrypt_signature( -                            encrypted_sig, video_id, player_url, age_gate) -                        url += '&signature=' + signature -                    if 'ratebypass' not in url: -                        url += '&ratebypass=yes' -                    url_map[url_data['itag'][0]] = url +                                player_version = self._search_regex( +                                    r'html5player-([^/]+?)(?:/html5player)?\.js', +                                    player_url, +                                    'html5 player', fatal=False) +                                player_desc = u'html5 player %s' % player_version + +                        parts_sizes = self._signature_cache_id(encrypted_sig) +                        self.to_screen(u'{%s} signature length %s, %s' % +                            (format_id, parts_sizes, player_desc)) + +                    signature = self._decrypt_signature( +                        encrypted_sig, video_id, player_url, age_gate) +                    url += '&signature=' + signature +                if 'ratebypass' not in url: +                    url += '&ratebypass=yes' +                url_map[format_id] = url              formats = _map_to_format_list(url_map)          elif video_info.get('hlsvp'):              manifest_url = video_info['hlsvp'][0] diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 0d0bbe8f6..65b492fb3 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -852,6 +852,8 @@ def unified_strdate(date_str):      return upload_date  def determine_ext(url, default_ext=u'unknown_video'): +    if url is None: +        return default_ext      guess = url.partition(u'?')[0].rpartition(u'.')[2]      if re.match(r'^[A-Za-z0-9]+$', guess):          return guess @@ -1271,9 +1273,15 @@ def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):      if get_attr:          if v is not None:              v = getattr(v, get_attr, None) +    if v == '': +        v = None      return default if v is None else (int(v) * invscale // scale) +def str_or_none(v, default=None): +    return default if v is None else compat_str(v) + +  def str_to_int(int_str):      if int_str is None:          return None diff --git a/youtube_dl/version.py b/youtube_dl/version.py index e77494595..2ef0d59e3 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.07.25.1' +__version__ = '2014.08.10' | 
