53 files changed, 1337 insertions, 177 deletions
diff --git a/AUTHORS b/AUTHORS
@@ -117,3 +117,8 @@ Alexander Mamay
 Devin J. Pohly
 Eduardo Ferro Aldama
 Jeff Buchbinder
+Amish Bhadeshia
+Joram Schrijver
+Will W.
+Mohammad Teimori Pabandi
+Roman Le Négrate
diff --git a/Makefile b/Makefile
@@ -2,7 +2,7 @@ all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bas
 
 clean:
 	rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe
-	find -name "*.pyc" -delete
+	find . -name "*.pyc" -delete
 
 PREFIX ?= /usr/local
 BINDIR ?= $(PREFIX)/bin
diff --git a/devscripts/check-porn.py b/devscripts/check-porn.py
index 6a5bd9eda..7a219ebe9 100644
--- a/devscripts/check-porn.py
+++ b/devscripts/check-porn.py
@@ -28,7 +28,7 @@ for test in get_testcases():
     if METHOD == 'EURISTIC':
         try:
             webpage = compat_urllib_request.urlopen(test['url'], timeout=10).read()
-        except:
+        except Exception:
             print('\nFail: {0}'.format(test['name']))
             continue
 
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index baf7b3880..2785b9587 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -2,6 +2,8 @@
  - **1tv**: Первый канал
  - **1up.com**
  - **220.ro**
+ - **22tracks:genre**
+ - **22tracks:track**
  - **24video**
  - **3sat**
  - **4tube**
@@ -109,6 +111,7 @@
  - **DctpTv**
  - **DeezerPlaylist**
  - **defense.gouv.fr**
+ - **DHM**: Filmarchiv - Deutsches Historisches Museum
  - **Discovery**
  - **divxstage**: DivxStage
  - **Dotsub**
@@ -118,6 +121,7 @@
  - **DrTuber**
  - **DRTV**
  - **Dump**
+ - **Dumpert**
  - **dvtv**: http://video.aktualne.cz/
  - **EaglePlatform**
  - **EbaumsWorld**
@@ -251,6 +255,7 @@
  - **Mgoon**
  - **Minhateca**
  - **MinistryGrid**
+ - **miomio.tv**
  - **mitele.es**
  - **mixcloud**
  - **MLB**
@@ -284,6 +289,8 @@
  - **NBA**
  - **NBC**
  - **NBCNews**
+ - **NBCSports**
+ - **NBCSportsVPlayer**
  - **ndr**: NDR.de - Mediathek
  - **NDTV**
  - **NerdCubedFeed**
@@ -380,6 +387,8 @@
  - **rutube:movie**: Rutube movies
  - **rutube:person**: Rutube person videos
  - **RUTV**: RUTV.RU
+ - **safari**: safaribooksonline.com online video
+ - **safari:course**: safaribooksonline.com online courses
  - **Sandia**: Sandia National Laboratories
  - **Sapo**: SAPO Vídeos
  - **savefrom.net**
@@ -497,9 +506,11 @@
  - **Urort**: NRK P3 Urørt
  - **ustream**
  - **ustream:channel**
+ - **Varzesh3**
  - **Vbox7**
  - **VeeHD**
  - **Veoh**
+ - **Vessel**
  - **Vesti**: Вести.Ru
  - **Vevo**
  - **VGTV**
@@ -588,7 +599,7 @@
  - **youtube:show**: YouTube.com (multi-season) shows
  - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)
  - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword)
- - **youtube:watch_later**: Youtube watch later list, ":ytwatchlater" for short (requires authentication)
+ - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication)
  - **Zapiks**
  - **ZDF**
  - **ZDFChannel**
diff --git a/test/test_all_urls.py b/test/test_all_urls.py
index 6ae168b7f..a9db42b30 100644
--- a/test/test_all_urls.py
+++ b/test/test_all_urls.py
@@ -59,7 +59,7 @@ class TestAllURLsMatching(unittest.TestCase):
         self.assertMatch('www.youtube.com/NASAgovVideo/videos', ['youtube:user'])
 
     def test_youtube_feeds(self):
-        self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watch_later'])
+        self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater'])
         self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions'])
         self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended'])
         self.assertMatch('https://www.youtube.com/my_favorites', ['youtube:favorites'])
diff --git a/test/test_execution.py b/test/test_execution.py
index f31e51558..620db080e 100644
--- a/test/test_execution.py
+++ b/test/test_execution.py
@@ -8,6 +8,9 @@ import unittest
 import sys
 import os
 import subprocess
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from youtube_dl.utils import encodeArgument
 
 rootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -31,7 +34,7 @@ class TestExecution(unittest.TestCase):
 
     def test_cmdline_umlauts(self):
         p = subprocess.Popen(
-            [sys.executable, 'youtube_dl/__main__.py', 'ä', '--version'],
+            [sys.executable, 'youtube_dl/__main__.py', encodeArgument('ä'), '--version'],
             cwd=rootDir, stdout=_DEV_NULL, stderr=subprocess.PIPE)
         _, stderr = p.communicate()
         self.assertFalse(stderr)
diff --git a/test/test_utils.py b/test/test_utils.py
index a8ab87685..2e3a6480c 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -200,6 +200,8 @@ class TestUtil(unittest.TestCase):
 
     def test_unescape_html(self):
         self.assertEqual(unescapeHTML('%20;'), '%20;')
+        self.assertEqual(unescapeHTML('&#x2F;'), '/')
+        self.assertEqual(unescapeHTML('&#47;'), '/')
         self.assertEqual(
             unescapeHTML('&eacute;'), 'é')
 
@@ -225,6 +227,7 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(
             unified_strdate('2/2/2015 6:47:40 PM', day_first=False),
             '20150202')
+        self.assertEqual(unified_strdate('25-09-2014'), '20140925')
 
     def test_find_xpath_attr(self):
         testxml = '''<root>
@@ -468,6 +471,12 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(d['x'], 1)
         self.assertEqual(d['y'], 'a')
 
+        on = js_to_json('["abc", "def",]')
+        self.assertEqual(json.loads(on), ['abc', 'def'])
+
+        on = js_to_json('{"abc": "def",}')
+        self.assertEqual(json.loads(on), {'abc': 'def'})
+
     def test_clean_html(self):
         self.assertEqual(clean_html('a:\nb'), 'a: b')
         self.assertEqual(clean_html('a:\n   "b"'), 'a:    "b"')
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index b5ef5e009..ce4b72fd3 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -1701,10 +1701,10 @@ class YoutubeDL(object):
             out = out.decode().strip()
             if re.match('[0-9a-f]+', out):
                 self._write_string('[debug] Git HEAD: ' + out + '\n')
-        except:
+        except Exception:
             try:
                 sys.exc_clear()
-            except:
+            except Exception:
                 pass
         self._write_string('[debug] Python version %s - %s\n' % (
             platform.python_version(), platform_name()))
@@ -1768,6 +1768,14 @@ class YoutubeDL(object):
         debuglevel = 1 if self.params.get('debug_printtraffic') else 0
         https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
+        # The ssl context is only available in python 2.7.9 and 3.x
+        if hasattr(https_handler, '_context'):
+            ctx = https_handler._context
+            # get_ca_certs is unavailable prior to python 3.4
+            if hasattr(ctx, 'get_ca_certs') and len(ctx.get_ca_certs()) == 0:
+                self.report_warning(
+                    'No ssl certificates were loaded, urls that use https '
+                    'won\'t work')
         ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
         opener = compat_urllib_request.build_opener(
             proxy_handler, https_handler, cookie_processor, ydlh)
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index 852b2fc3d..1c8b411b7 100644
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -189,10 +189,6 @@ def _real_main(argv=None):
     if opts.allsubtitles and not opts.writeautomaticsub:
         opts.writesubtitles = True
 
-    if sys.version_info < (3,):
-        # In Python 2, sys.argv is a bytestring (also note http://bugs.python.org/issue2128 for Windows systems)
-        if opts.outtmpl is not None:
-            opts.outtmpl = opts.outtmpl.decode(preferredencoding())
     outtmpl = ((opts.outtmpl is not None and opts.outtmpl) or
                (opts.format == '-1' and opts.usetitle and '%(title)s-%(id)s-%(format)s.%(ext)s') or
                (opts.format == '-1' and '%(id)s-%(format)s.%(ext)s') or
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
index b2bf149ef..973bcd320 100644
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -389,7 +389,7 @@ else:
                 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
             out, err = sp.communicate()
             lines, columns = map(int, out.split())
-        except:
+        except Exception:
             pass
         return _terminal_size(columns, lines)
diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py
index 8ed5c19a6..a0fc5ead0 100644
--- a/youtube_dl/downloader/common.py
+++ b/youtube_dl/downloader/common.py
@@ -204,7 +204,7 @@ class FileDownloader(object):
             return
         try:
             os.utime(filename, (time.time(), filetime))
-        except:
+        except Exception:
             pass
         return filetime
 
@@ -318,7 +318,7 @@ class FileDownloader(object):
         )
 
         continuedl_and_exists = (
-            self.params.get('continuedl', False) and
+            self.params.get('continuedl', True) and
             os.path.isfile(encodeFilename(filename)) and
             not self.params.get('nopart', False)
         )
diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py
index 4047d7167..d136bebd1 100644
--- a/youtube_dl/downloader/http.py
+++ b/youtube_dl/downloader/http.py
@@ -49,7 +49,7 @@ class HttpFD(FileDownloader):
         open_mode = 'wb'
 
         if resume_len != 0:
-            if self.params.get('continuedl', False):
+            if self.params.get('continuedl', True):
                 self.report_resuming_byte(resume_len)
                 request.add_header('Range', 'bytes=%d-' % resume_len)
                 open_mode = 'ab'
diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py
index 89e98ae61..ddf5724ae 100644
--- a/youtube_dl/downloader/rtmp.py
+++ b/youtube_dl/downloader/rtmp.py
@@ -105,7 +105,7 @@ class RtmpFD(FileDownloader):
         protocol = info_dict.get('rtmp_protocol', None)
         real_time = info_dict.get('rtmp_real_time', False)
         no_resume = info_dict.get('no_resume', False)
-        continue_dl = info_dict.get('continuedl', False)
+        continue_dl = info_dict.get('continuedl', True)
 
         self.report_destination(filename)
         tmpfilename = self.temp_name(filename)
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index d73826d44..0f7d44616 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -106,6 +106,7 @@ from .dbtv import DBTVIE
 from .dctp import DctpTvIE
 from .deezer import DeezerPlaylistIE
 from .dfb import DFBIE
+from .dhm import DHMIE
 from .dotsub import DotsubIE
 from .douyutv import DouyuTVIE
 from .dreisat import DreiSatIE
@@ -114,6 +115,7 @@ from .drtuber import DrTuberIE
 from .drtv import DRTVIE
 from .dvtv import DVTVIE
 from .dump import DumpIE
+from .dumpert import DumpertIE
 from .defense import DefenseGouvFrIE
 from .discovery import DiscoveryIE
 from .divxstage import DivxStageIE
@@ -175,6 +177,7 @@ from .gameone import (
     GameOneIE,
     GameOnePlaylistIE,
 )
+from .gamersyde import GamersydeIE
 from .gamespot import GameSpotIE
 from .gamestar import GameStarIE
 from .gametrailers import GametrailersIE
@@ -274,6 +277,7 @@ from .metacritic import MetacriticIE
 from .mgoon import MgoonIE
 from .minhateca import MinhatecaIE
 from .ministrygrid import MinistryGridIE
+from .miomio import MioMioIE
 from .mit import TechTVMITIE, MITIE, OCWMITIE
 from .mitele import MiTeleIE
 from .mixcloud import MixcloudIE
@@ -309,6 +313,8 @@ from .nba import NBAIE
 from .nbc import (
     NBCIE,
     NBCNewsIE,
+    NBCSportsIE,
+    NBCSportsVPlayerIE,
 )
 from .ndr import NDRIE
 from .ndtv import NDTVIE
@@ -382,6 +388,7 @@ from .pornhub import (
     PornHubPlaylistIE,
 )
 from .pornotube import PornotubeIE
+from .pornovoisines import PornoVoisinesIE
 from .pornoxo import PornoXOIE
 from .primesharetv import PrimeShareTVIE
 from .promptfile import PromptFileIE
@@ -391,6 +398,7 @@ from .pyvideo import PyvideoIE
 from .quickvid import QuickVidIE
 from .r7 import R7IE
 from .radiode import RadioDeIE
+from .radiojavan import RadioJavanIE
 from .radiobremen import RadioBremenIE
 from .radiofrance import RadioFranceIE
 from .rai import RaiIE
@@ -420,6 +428,10 @@ from .rutube import (
 )
 from .rutv import RUTVIE
 from .sandia import SandiaIE
+from .safari import (
+    SafariIE,
+    SafariCourseIE,
+)
 from .sapo import SapoIE
 from .savefrom import SaveFromIE
 from .sbs import SBSIE
@@ -526,6 +538,10 @@ from .tvp import TvpIE, TvpSeriesIE
 from .tvplay import TVPlayIE
 from .tweakers import TweakersIE
 from .twentyfourvideo import TwentyFourVideoIE
+from .twentytwotracks import (
+    TwentyTwoTracksIE,
+    TwentyTwoTracksGenreIE
+)
 from .twitch import (
     TwitchVideoIE,
     TwitchChapterIE,
@@ -544,9 +560,11 @@ from .ultimedia import UltimediaIE
 from .unistra import UnistraIE
 from .urort import UrortIE
 from .ustream import UstreamIE, UstreamChannelIE
+from .varzesh3 import Varzesh3IE
 from .vbox7 import Vbox7IE
 from .veehd import VeeHDIE
 from .veoh import VeohIE
+from .vessel import VesselIE
 from .vesti import VestiIE
 from .vevo import VevoIE
 from .vgtv import VGTVIE
diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py
index 4a88ccd13..0dca29b71 100644
--- a/youtube_dl/extractor/bloomberg.py
+++ b/youtube_dl/extractor/bloomberg.py
@@ -6,32 +6,39 @@ from .common import InfoExtractor
 
 
 class BloombergIE(InfoExtractor):
-    _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<id>.+?)\.html'
+    _VALID_URL = r'https?://www\.bloomberg\.com/news/videos/[^/]+/(?P<id>[^/?#]+)'
 
     _TEST = {
-        'url': 'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html',
+        'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2',
         # The md5 checksum changes
         'info_dict': {
             'id': 'qurhIVlJSB6hzkVi229d8g',
             'ext': 'flv',
             'title': 'Shah\'s Presentation on Foreign-Exchange Strategies',
-            'description': 'md5:0681e0d30dcdfc6abf34594961d8ea88',
+            'description': 'md5:a8ba0302912d03d246979735c17d2761',
         },
     }
 
     def _real_extract(self, url):
         name = self._match_id(url)
         webpage = self._download_webpage(url, name)
-
-        f4m_url = self._search_regex(
-            r'<source src="(https?://[^"]+\.f4m.*?)"', webpage,
-            'f4m url')
+        video_id = self._search_regex(r'"bmmrId":"(.+?)"', webpage, 'id')
         title = re.sub(': Video$', '', self._og_search_title(webpage))
 
+        embed_info = self._download_json(
+            'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id)
+        formats = []
+        for stream in embed_info['streams']:
+            if stream["muxing_format"] == "TS":
+                formats.extend(self._extract_m3u8_formats(stream['url'], video_id))
+            else:
+                formats.extend(self._extract_f4m_formats(stream['url'], video_id))
+        self._sort_formats(formats)
+
         return {
-            'id': name.split('-')[-1],
+            'id': video_id,
             'title': title,
-            'formats': self._extract_f4m_formats(f4m_url, name),
+            'formats': formats,
             'description': self._og_search_description(webpage),
             'thumbnail': self._og_search_thumbnail(webpage),
         }
diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py
index 90ea07438..0a77e951c 100644
--- a/youtube_dl/extractor/cnn.py
+++ b/youtube_dl/extractor/cnn.py
@@ -12,7 +12,7 @@ from ..utils import (
 
 class CNNIE(InfoExtractor):
     _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/
-        (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:cnn|hln)(?:-ap)?|(?=&)))'''
+        (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:cnn|hln|ktvk)(?:-ap)?|(?=&)))'''
 
     _TESTS = [{
         'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
@@ -45,6 +45,9 @@ class CNNIE(InfoExtractor):
             'description': 'md5:e7223a503315c9f150acac52e76de086',
             'upload_date': '20141222',
         }
+    }, {
+        'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 4f67c3aac..47d58330b 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -25,8 +25,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor):
     def _build_request(url):
         """Build a request with the family filter disabled"""
         request = compat_urllib_request.Request(url)
-        request.add_header('Cookie', 'family_filter=off')
-        request.add_header('Cookie', 'ff=off')
+        request.add_header('Cookie', 'family_filter=off; ff=off')
         return request
 
 
@@ -112,8 +111,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 
         embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id
-        embed_page = self._download_webpage(embed_url, video_id,
-                                            'Downloading embed page')
+        embed_request = self._build_request(embed_url)
+        embed_page = self._download_webpage(
+            embed_request, video_id, 'Downloading embed page')
         info = self._search_regex(r'var info = ({.*?}),$', embed_page,
                                   'video info', flags=re.MULTILINE)
         info = json.loads(info)
diff --git a/youtube_dl/extractor/dhm.py b/youtube_dl/extractor/dhm.py
new file mode 100644
index 000000000..3ed1f1663
--- /dev/null
+++ b/youtube_dl/extractor/dhm.py
@@ -0,0 +1,73 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    xpath_text,
+    parse_duration,
+)
+
+
+class DHMIE(InfoExtractor):
+    IE_DESC = 'Filmarchiv - Deutsches Historisches Museum'
+    _VALID_URL = r'https?://(?:www\.)?dhm\.de/filmarchiv/(?:[^/]+/)+(?P<id>[^/]+)'
+
+    _TESTS = [{
+        'url': 'http://www.dhm.de/filmarchiv/die-filme/the-marshallplan-at-work-in-west-germany/',
+        'md5': '11c475f670209bf6acca0b2b7ef51827',
+        'info_dict': {
+            'id': 'the-marshallplan-at-work-in-west-germany',
+            'ext': 'flv',
+            'title': 'MARSHALL PLAN AT WORK IN WESTERN GERMANY, THE',
+            'description': 'md5:1fabd480c153f97b07add61c44407c82',
+            'duration': 660,
+            'thumbnail': 're:^https?://.*\.jpg$',
+        },
+    }, {
+        'url': 'http://www.dhm.de/filmarchiv/02-mapping-the-wall/peter-g/rolle-1/',
+        'md5': '09890226332476a3e3f6f2cb74734aa5',
+        'info_dict': {
+            'id': 'rolle-1',
+            'ext': 'flv',
+            'title': 'ROLLE 1',
+            'thumbnail': 're:^https?://.*\.jpg$',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        playlist_url = self._search_regex(
+            r"file\s*:\s*'([^']+)'", webpage, 'playlist url')
+
+        playlist = self._download_xml(playlist_url, video_id)
+
+        track = playlist.find(
+            './{http://xspf.org/ns/0/}trackList/{http://xspf.org/ns/0/}track')
+
+        video_url = xpath_text(
+            track, './{http://xspf.org/ns/0/}location',
+            'video url', fatal=True)
+        thumbnail = xpath_text(
+            track, './{http://xspf.org/ns/0/}image',
+            'thumbnail')
+
+        title = self._search_regex(
+            [r'dc:title="([^"]+)"', r'<title> &raquo;([^<]+)</title>'],
+            webpage, 'title').strip()
+        description = self._html_search_regex(
+            r'<p><strong>Description:</strong>(.+?)</p>',
+            webpage, 'description', default=None)
+        duration = parse_duration(self._search_regex(
+            r'<em>Length\s*</em>\s*:\s*</strong>([^<]+)',
+            webpage, 'duration', default=None))
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py
index d7956e6e4..479430c51 100644
--- a/youtube_dl/extractor/douyutv.py
+++ b/youtube_dl/extractor/douyutv.py
@@ -1,19 +1,23 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import hashlib
+import time
+
 from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (ExtractorError, unescapeHTML)
+from ..compat import (compat_str, compat_basestring)
 
 
 class DouyuTVIE(InfoExtractor):
     _VALID_URL = r'http://(?:www\.)?douyutv\.com/(?P<id>[A-Za-z0-9]+)'
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.douyutv.com/iseven',
         'info_dict': {
-            'id': 'iseven',
+            'id': '17732',
+            'display_id': 'iseven',
             'ext': 'flv',
             'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
-            'description': 'md5:9e525642c25a0a24302869937cf69d17',
+            'description': 'md5:c93d6692dde6fe33809a46edcbecca44',
             'thumbnail': 're:^https?://.*\.jpg$',
             'uploader': '7师傅',
             'uploader_id': '431925',
@@ -22,22 +26,52 @@ class DouyuTVIE(InfoExtractor):
         'params': {
             'skip_download': True,
         }
-    }
+    }, {
+        'url': 'http://www.douyutv.com/85982',
+        'info_dict': {
+            'id': '85982',
+            'display_id': '85982',
+            'ext': 'flv',
+            'title': 're:^小漠从零单排记!——CSOL2躲猫猫 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'description': 'md5:746a2f7a253966a06755a912f0acc0d2',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': 'douyu小漠',
+            'uploader_id': '3769985',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
+        if video_id.isdigit():
+            room_id = video_id
+        else:
+            page = self._download_webpage(url, video_id)
+            room_id = self._html_search_regex(
+                r'"room_id"\s*:\s*(\d+),', page, 'room id')
+
+        prefix = 'room/%s?aid=android&client_sys=android&time=%d' % (
+            room_id, int(time.time()))
+
+        auth = hashlib.md5((prefix + '1231').encode('ascii')).hexdigest()
         config = self._download_json(
-            'http://www.douyutv.com/api/client/room/%s' % video_id, video_id)
+            'http://www.douyutv.com/api/v1/%s&auth=%s' % (prefix, auth),
+            video_id)
 
         data = config['data']
 
         error_code = config.get('error', 0)
-        show_status = data.get('show_status')
         if error_code is not 0:
-            raise ExtractorError(
-                'Server reported error %i' % error_code, expected=True)
+            error_desc = 'Server reported error %i' % error_code
+            if isinstance(data, (compat_str, compat_basestring)):
+                error_desc += ': ' + data
+            raise ExtractorError(error_desc, expected=True)
 
+        show_status = data.get('show_status')
         # 1 = live, 2 = offline
         if show_status == '2':
             raise ExtractorError(
@@ -46,7 +80,7 @@
         base_url = data['rtmp_url']
         live_path = data['rtmp_live']
 
-        title = self._live_title(data['room_name'])
+        title = self._live_title(unescapeHTML(data['room_name']))
         description = data.get('show_details')
         thumbnail = data.get('room_src')
 
@@ -66,7 +100,8 @@
         self._sort_formats(formats)
 
         return {
-            'id': video_id,
+            'id': room_id,
+            'display_id': video_id,
             'title': title,
             'description': description,
             'thumbnail': thumbnail,
diff --git a/youtube_dl/extractor/dumpert.py b/youtube_dl/extractor/dumpert.py
new file mode 100644
index 000000000..e43bc81b2
--- /dev/null
+++ b/youtube_dl/extractor/dumpert.py
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+
+from .common import InfoExtractor
+from ..utils import qualities
+
+
+class DumpertIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?dumpert\.nl/mediabase/(?P<id>[0-9]+/[0-9a-zA-Z]+)'
+    _TEST = {
+        'url': 'http://www.dumpert.nl/mediabase/6646981/951bc60f/',
+        'md5': '1b9318d7d5054e7dcb9dc7654f21d643',
+        'info_dict': {
+            'id': '6646981/951bc60f',
+            'ext': 'mp4',
+            'title': 'Ik heb nieuws voor je',
+            'description': 'Niet schrikken hoor',
+            'thumbnail': 're:^https?://.*\.jpg$',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        files_base64 = self._search_regex(
+            r'data-files="([^"]+)"', webpage, 'data files')
+
+        files = self._parse_json(
+            base64.b64decode(files_base64.encode('utf-8')).decode('utf-8'),
+            video_id)
+
+        quality = qualities(['flv', 'mobile', 'tablet', '720p'])
+
+        formats = [{
+            'url': video_url,
+            'format_id': format_id,
+            'quality': quality(format_id),
+        } for format_id, video_url in files.items() if format_id != 'still']
+        self._sort_formats(formats)
+
+        title = self._html_search_meta(
+            'title', webpage) or self._og_search_title(webpage)
+        description = self._html_search_meta(
+            'description', webpage) or self._og_search_description(webpage)
+        thumbnail = files.get('still') or self._og_search_thumbnail(webpage)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'formats': formats
+        }
diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py
index 79e2fbd39..0cbca90b0 100644
--- a/youtube_dl/extractor/eroprofile.py
+++ b/youtube_dl/extractor/eroprofile.py
@@ -1,11 +1,17 @@
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
+from ..compat import compat_urllib_parse
+from ..utils import ExtractorError
 
 
 class EroProfileIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?eroprofile\.com/m/videos/view/(?P<id>[^/]+)'
-    _TEST = {
+    _LOGIN_URL = 'http://www.eroprofile.com/auth/auth.php?'
+    _NETRC_MACHINE = 'eroprofile'
+    _TESTS = [{
         'url': 'http://www.eroprofile.com/m/videos/view/sexy-babe-softcore',
         'md5': 'c26f351332edf23e1ea28ce9ec9de32f',
         'info_dict': {
@@ -16,13 +22,55 @@ class EroProfileIE(InfoExtractor):
             'thumbnail': 're:https?://.*\.jpg',
             'age_limit': 18,
         }
-    }
+    }, {
+        'url': 'http://www.eroprofile.com/m/videos/view/Try-It-On-Pee_cut_2-wmv-4shared-com-file-sharing-download-movie-file',
+        'md5': '1baa9602ede46ce904c431f5418d8916',
+        'info_dict': {
+            'id': '1133519',
+            'ext': 'm4v',
+            'title': 'Try It On Pee_cut_2.wmv - 4shared.com - file sharing - download movie file',
+            'thumbnail': 're:https?://.*\.jpg',
+            'age_limit': 18,
+        },
+        'skip': 'Requires login',
+    }]
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+
+        query = compat_urllib_parse.urlencode({
+            'username': username,
+            'password': password,
+            'url': 'http://www.eroprofile.com/',
+        })
+        login_url = self._LOGIN_URL + query
+        login_page = self._download_webpage(login_url, None, False)
+
+        m = re.search(r'Your username or password was incorrect\.', login_page)
+        if m:
+            raise ExtractorError(
+                'Wrong username and/or password.', expected=True)
+
+        self.report_login()
+        redirect_url = self._search_regex(
+            r'<script[^>]+?src="([^"]+)"', login_page, 'login redirect url')
+        self._download_webpage(redirect_url, None, False)
+
+    def _real_initialize(self):
+        self._login()
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
 
         webpage = self._download_webpage(url, display_id)
 
+        m = re.search(r'You must be logged in to view this video\.', webpage)
+        if m:
+            raise ExtractorError(
+                'This video requires login. Please specify a username and password and try again.', expected=True)
+
         video_id = self._search_regex(
             [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'],
             webpage, 'video id', default=None)
diff --git a/youtube_dl/extractor/gamersyde.py b/youtube_dl/extractor/gamersyde.py
new file mode 100644
index 000000000..d545e01bb
--- /dev/null
+++ b/youtube_dl/extractor/gamersyde.py
@@ -0,0 +1,70 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    js_to_json,
+    parse_duration,
+    remove_start,
+)
+
+
+class GamersydeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?gamersyde\.com/hqstream_(?P<display_id>[\da-z_]+)-(?P<id>\d+)_[a-z]{2}\.html'
+    _TEST = {
+        'url': 'http://www.gamersyde.com/hqstream_bloodborne_birth_of_a_hero-34371_en.html',
+        'md5': 'f38d400d32f19724570040d5ce3a505f',
+        'info_dict': {
+            'id': '34371',
+            'ext': 'mp4',
+            'duration': 372,
+            'title': 'Bloodborne - Birth of a hero',
+            'thumbnail': 're:^https?://.*\.jpg$',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        playlist = self._parse_json(
+            self._search_regex(
+                r'(?s)playlist: \[({.+?})\]\s*}\);', webpage, 'files'),
+            display_id, transform_source=js_to_json)
+
+        formats = []
+        for source in playlist['sources']:
+            video_url = source.get('file')
+            if not video_url:
+                continue
+            format_id = source.get('label')
+            f = {
+                'url': video_url,
+                'format_id': format_id,
+            }
+            m = re.search(r'^(?P<height>\d+)[pP](?P<fps>\d+)fps', format_id)
+            if m:
+                f.update({
+                    'height': int(m.group('height')),
+                    'fps': int(m.group('fps')),
+                })
+            formats.append(f)
+        self._sort_formats(formats)
+
+        title = remove_start(playlist['title'], '%s - ' % video_id)
+        thumbnail = playlist.get('image')
+        duration = parse_duration(self._search_regex(
+            r'Length:</label>([^<]+)<', webpage, 'duration', fatal=False))
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 8a49b0b54..2ff002643 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -29,6 +29,7 @@ from ..utils import (
     xpath_text,
 )
 from .brightcove import BrightcoveIE
+from .nbc import NBCSportsVPlayerIE
 from .ooyala import OoyalaIE
 from .rutv import RUTVIE
 from .smotri import SmotriIE
@@ -620,6 +621,16 @@ class GenericIE(InfoExtractor):
                 'age_limit': 0,
             },
         },
+        # 5min embed
+        {
+            'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
+            'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
+            'info_dict': {
+                'id': '518726732',
+                'ext': 'mp4',
+                'title': 'Facebook Creates "On This Day" | Crunch Report',
+            },
+        },
         # RSS feed with enclosure
         {
             'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
@@ -629,6 +640,16 @@
                 'upload_date': '20150228',
                 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
             }
+        },
+        # NBC Sports vplayer embed
+        {
+            'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
+            'info_dict': {
+                'id': 'ln7x1qSThw4k',
+                'ext': 'flv',
+                'title': "PFT Live: New leader in the 'new-look' defense",
+                'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
+            },
         }
     ]
@@ -1236,6 +1257,17 @@ class GenericIE(InfoExtractor):
         if mobj is not None:
             return self.url_result(mobj.group('url'), 'Pladform')
 
+        # Look for 5min embeds
+        mobj = re.search(
+            r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
+        if mobj is not None:
+            return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
+
+        # Look for NBC Sports VPlayer embeds
+        nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
+        if nbc_sports_url:
+            return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
+
         def check_video(vurl):
             if YoutubeIE.suitable(vurl):
                 return True
diff --git a/youtube_dl/extractor/miomio.py b/youtube_dl/extractor/miomio.py
new file mode 100644
index 000000000..cc3f27194
--- /dev/null
+++ b/youtube_dl/extractor/miomio.py
@@ -0,0 +1,93 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import random
+
+from .common import InfoExtractor
+from ..utils import (
+    xpath_text,
+    int_or_none,
+)
+
+
+class MioMioIE(InfoExtractor):
+    IE_NAME = 'miomio.tv'
+    _VALID_URL = r'https?://(?:www\.)?miomio\.tv/watch/cc(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.miomio.tv/watch/cc179734/',
+        'md5': '48de02137d0739c15b440a224ad364b9',
+        'info_dict': {
+            'id': '179734',
+            'ext': 'flv',
+            'title': '手绘动漫鬼泣但丁全程画法',
+            'duration': 354,
+        },
+    }, {
+        'url': 'http://www.miomio.tv/watch/cc184024/',
+        'info_dict': {
+            'id': '43729',
+            'title': '《动漫同人插画绘制》',
+        },
+        'playlist_mincount': 86,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_meta(
+            'description', webpage, 'title', fatal=True)
+
+        mioplayer_path = self._search_regex(
+            r'src="(/mioplayer/[^"]+)"', webpage, 'ref_path')
+
+        xml_config = self._search_regex(
+            r'flashvars="type=sina&(.+?)&',
+            webpage, 'xml config')
+
+        # skipping the following page causes lags and eventually connection drop-outs
+        self._request_webpage(
+            'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/xml.php?id=%s&r=%s' % (id, random.randint(100, 999)),
+            video_id)
+
+        # the following xml contains the actual configuration information on the video file(s)
+        vid_config = self._download_xml(
+            'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/sina.php?{0}'.format(xml_config),
+            video_id)
+
+        http_headers = {
+            'Referer': 'http://www.miomio.tv%s' % mioplayer_path,
+        }
+
+        entries = []
+        for f in vid_config.findall('./durl'):
+            segment_url = xpath_text(f, 'url', 'video url')
+            if not segment_url:
+                continue
+            order = xpath_text(f, 'order', 'order')
+            segment_id = video_id
+            segment_title = title
+            if order:
+                segment_id += '-%s' % order
+                segment_title += ' part %s' % order
+            entries.append({
+                'id': segment_id,
+                'url': segment_url,
+                'title': segment_title,
+                'duration': int_or_none(xpath_text(f, 'length', 'duration'), 1000),
+                'http_headers': http_headers,
+            })
+
+        if len(entries) == 1:
+            segment = entries[0]
+            segment['id'] = video_id
+            segment['title'] = title
+            return segment
+
+        return {
+            '_type': 'multi_video',
+            'id': video_id,
+            'entries': entries,
+            'title': title,
+            'http_headers': http_headers,
+        }
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index 21aea0c55..84f291558 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -97,7 +97,7 @@ class MixcloudIE(InfoExtractor):
             r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)
         description = self._og_search_description(webpage)
         like_count = str_to_int(self._search_regex(
-            r'\bbutton-favorite\b.+m-ajax-toggle-count="([^"]+)"',
+            r'\bbutton-favorite\b[^>]+m-ajax-toggle-count="([^"]+)"',
             webpage, 'like count', fatal=False))
         view_count = str_to_int(self._search_regex(
             [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index 3645d3033..ecd0ac8b1 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -14,7 +14,7 @@ from ..utils import (
 
 
 class NBCIE(InfoExtractor):
-    _VALID_URL = r'http://www\.nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)'
+    _VALID_URL = r'https?://www\.nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)'
 
     _TESTS = [
         {
@@ -50,6 +50,57 @@ class NBCIE(InfoExtractor):
         return self.url_result(theplatform_url)
 
 
+class NBCSportsVPlayerIE(InfoExtractor):
+    _VALID_URL = r'https?://vplayer\.nbcsports\.com/(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)'
+
+    _TESTS = [{
+        'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_share/select/9CsDKds0kvHI',
+        'info_dict': {
+            'id': '9CsDKds0kvHI',
+            'ext': 'flv',
+            'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d',
+            'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson',
+        }
+    }, {
+        'url': 'http://vplayer.nbcsports.com/p/BxmELC/nbc_embedshare/select/_hqLjQ95yx8Z',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        iframe_m = re.search(
+            r'<iframe[^>]+src="(?P<url>https?://vplayer\.nbcsports\.com/[^"]+)"', webpage)
+        if iframe_m:
+            return iframe_m.group('url')
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        theplatform_url = self._og_search_video_url(webpage)
+        return self.url_result(theplatform_url, 'ThePlatform')
+
+
+class NBCSportsIE(InfoExtractor):
+    # Does not include https because its certificate is invalid
+    _VALID_URL = r'http://www\.nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)'
+
+    _TEST = {
+        'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke',
+        'info_dict': {
+            'id': 'PHJSaFWbrTY9',
+            'ext': 'flv',
+            'title': 'Tom Izzo, Michigan St. has \'so much respect\' for Duke',
+            'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        return self.url_result(
+            NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer')
+
+
 class NBCNewsIE(InfoExtractor):
     _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/
         (?:video/.+?/(?P<id>\d+)|
diff --git a/youtube_dl/extractor/phoenix.py b/youtube_dl/extractor/phoenix.py
index a20672c0c..46cebc0d7 100644
--- a/youtube_dl/extractor/phoenix.py
+++ b/youtube_dl/extractor/phoenix.py
@@ -5,19 +5,33 @@ from .zdf import extract_from_xml_url
 
 
 class PhoenixIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?phoenix\.de/content/(?P<id>[0-9]+)'
-    _TEST = {
-        'url': 'http://www.phoenix.de/content/884301',
-        'md5': 'ed249f045256150c92e72dbb70eadec6',
-        'info_dict': {
-            'id': '884301',
-            'ext': 'mp4',
-            'title': 'Michael Krons mit Hans-Werner Sinn',
-            'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr',
-            'upload_date': '20141025',
-            'uploader': 'Im Dialog',
-        }
-    }
+    _VALID_URL = r'''(?x)https?://(?:www\.)?phoenix\.de/content/
+        (?:
+            phoenix/die_sendungen/(?:[^/]+/)?
+        )?
+        (?P<id>[0-9]+)'''
+    _TESTS = [
+        {
+            'url': 'http://www.phoenix.de/content/884301',
+            'md5': 'ed249f045256150c92e72dbb70eadec6',
+            'info_dict': {
+                'id': '884301',
+                'ext': 'mp4',
+                'title': 'Michael Krons mit Hans-Werner Sinn',
+                'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr',
+                'upload_date': '20141025',
+                'uploader': 'Im Dialog',
+            }
+        },
+        {
+            'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/869815',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/diskussionen/928234',
+            'only_matching': True,
+        },
+    ]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/playfm.py b/youtube_dl/extractor/playfm.py
index 9576aed0e..e766ccca3 100644
--- a/youtube_dl/extractor/playfm.py
+++ b/youtube_dl/extractor/playfm.py
@@ -4,85 +4,72 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse,
-    compat_urllib_request,
-)
+from ..compat import compat_str
 from ..utils import (
     ExtractorError,
-    float_or_none,
     int_or_none,
-    str_to_int,
+    parse_iso8601,
 )
 
 
 class PlayFMIE(InfoExtractor):
     IE_NAME = 'play.fm'
-    _VALID_URL = r'https?://(?:www\.)?play\.fm/[^?#]*(?P<upload_date>[0-9]{8})(?P<id>[0-9]{6})(?:$|[?#])'
+    _VALID_URL = r'https?://(?:www\.)?play\.fm/(?P<slug>(?:[^/]+/)+(?P<id>[^/]+))/?(?:$|[?#])'
 
     _TEST = {
-        'url': 'http://www.play.fm/recording/leipzigelectronicmusicbatofarparis_fr20140712137220',
+        'url': 'https://www.play.fm/dan-drastic/sven-tasnadi-leipzig-electronic-music-batofar-paris-fr-2014-07-12',
         'md5': 'c505f8307825a245d0c7ad1850001f22',
         'info_dict': {
-            'id': '137220',
+            'id': '71276',
             'ext': 'mp3',
-            'title': 'LEIPZIG ELECTRONIC MUSIC @ Batofar (Paris,FR) - 2014-07-12',
-            'uploader': 'Sven Tasnadi',
-            'uploader_id': 'sventasnadi',
-            'duration': 5627.428,
-            'upload_date': '20140712',
+            'title': 'Sven Tasnadi - LEIPZIG ELECTRONIC MUSIC @ Batofar (Paris,FR) - 2014-07-12',
+            'description': '',
+            'duration': 5627,
+            'timestamp': 1406033781,
+            'upload_date': '20140722',
+            'uploader': 'Dan Drastic',
+            'uploader_id': '71170',
             'view_count': int,
             'comment_count': int,
-            'thumbnail': 're:^https?://.*\.jpg$',
         },
     }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
-        upload_date = mobj.group('upload_date')
-
-        rec_data = compat_urllib_parse.urlencode({'rec_id': video_id})
-        req = compat_urllib_request.Request(
-            'http://www.play.fm/flexRead/recording', data=rec_data)
-        req.add_header('Content-Type', 'application/x-www-form-urlencoded')
-        rec_doc = self._download_xml(req, video_id)
+        slug = mobj.group('slug')
 
-        error_node = rec_doc.find('./error')
-        if error_node is not None:
-            raise ExtractorError('An error occured: %s (code %s)' % (
-                error_node.text, rec_doc.find('./status').text))
+        recordings = self._download_json(
+            'http://v2api.play.fm/recordings/slug/%s' % slug, video_id)
 
-        recording = rec_doc.find('./recording')
-        title = recording.find('./title').text
-        view_count = str_to_int(recording.find('./stats/playcount').text)
-        comment_count = str_to_int(recording.find('./stats/comments').text)
-        duration = float_or_none(recording.find('./duration').text, scale=1000)
-        thumbnail = recording.find('./image').text
+        error = recordings.get('error')
+        if isinstance(error, dict):
+            raise ExtractorError(
+                '%s returned error: %s' % (self.IE_NAME, error.get('message')),
+                expected=True)
 
-        artist = recording.find('./artists/artist')
-        uploader = artist.find('./name').text
-        uploader_id = artist.find('./slug').text
-
-        video_url = '%s//%s/%s/%s/offset/0/sh/%s/rec/%s/jingle/%s/loc/%s' % (
-            'http:', recording.find('./url').text,
-            recording.find('./_class').text, recording.find('./file_id').text,
-            rec_doc.find('./uuid').text, video_id,
-            rec_doc.find('./jingle/file_id').text,
-            'http%3A%2F%2Fwww.play.fm%2Fplayer',
-        )
+        audio_url = recordings['audio']
+        video_id = compat_str(recordings.get('id') or video_id)
+        title = recordings['title']
+        description = recordings.get('description')
+        duration = int_or_none(recordings.get('recordingDuration'))
+        timestamp = parse_iso8601(recordings.get('created_at'))
+        uploader = recordings.get('page', {}).get('title')
+        uploader_id = compat_str(recordings.get('page', {}).get('id'))
+        view_count = int_or_none(recordings.get('playCount'))
+        comment_count = int_or_none(recordings.get('commentCount'))
+        categories = [tag['name'] for tag in recordings.get('tags', []) if tag.get('name')]
 
         return {
             'id': video_id,
-            'url': video_url,
-            'ext': 'mp3',
-            'filesize': int_or_none(recording.find('./size').text),
+            'url': audio_url,
             'title': title,
-            'upload_date': upload_date,
-            'view_count': view_count,
-            'comment_count': comment_count,
+            'description': description,
             'duration': duration,
-            'thumbnail': thumbnail,
+            'timestamp': timestamp,
             'uploader': uploader,
             'uploader_id': uploader_id,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'categories': categories,
         }
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index 3a27e3789..0c8b731cf 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -33,10 +33,8 @@ class PornHubIE(InfoExtractor):
     }
 
     def _extract_count(self, pattern, webpage, name):
-        count = self._html_search_regex(pattern, webpage, '%s count' % name, fatal=False)
-        if count:
-            count = str_to_int(count)
-        return count
+        return str_to_int(self._search_regex(
+            pattern, webpage, '%s count' % name, fatal=False))
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -62,11 +60,14 @@ class PornHubIE(InfoExtractor):
         if thumbnail:
             thumbnail = compat_urllib_parse.unquote(thumbnail)
 
-        view_count = self._extract_count(r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
-        like_count = self._extract_count(r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
-        dislike_count = self._extract_count(r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
+        view_count = self._extract_count(
+            r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
+        like_count = self._extract_count(
+            r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
+        dislike_count = self._extract_count(
+            r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
         comment_count = self._extract_count(
-            r'All comments \(<var class="videoCommentCount">([\d,\.]+)</var>', webpage, 'comment')
+            r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
 
         video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
         if webpage.find('"encrypted":true') != -1:
diff --git a/youtube_dl/extractor/pornovoisines.py b/youtube_dl/extractor/pornovoisines.py
new file mode 100644
index 000000000..9688ed948
--- /dev/null
+++ b/youtube_dl/extractor/pornovoisines.py
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import random
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    float_or_none,
+    unified_strdate,
+)
+
+
+class PornoVoisinesIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?pornovoisines\.com/showvideo/(?P<id>\d+)/(?P<display_id>[^/]+)'
+
+    _VIDEO_URL_TEMPLATE = 'http://stream%d.pornovoisines.com' \
+        '/static/media/video/transcoded/%s-640x360-1000-trscded.mp4'
+
+    _SERVER_NUMBERS = (1, 2)
+
+    _TEST = {
+        'url': 'http://www.pornovoisines.com/showvideo/1285/recherche-appartement/',
+        'md5': '5ac670803bc12e9e7f9f662ce64cf1d1',
+        'info_dict': {
+            'id': '1285',
+            'display_id': 'recherche-appartement',
+            'ext': 'mp4',
+            'title': 'Recherche appartement',
+            'description': 'md5:819ea0b785e2a04667a1a01cdc89594e',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'upload_date': '20140925',
+            'duration': 120,
+            'view_count': int,
+            'average_rating': float,
+            'categories': ['Débutante', 'Scénario', 'Sodomie'],
+            'age_limit': 18,
+        }
+    }
+
+    @classmethod
+    def build_video_url(cls, num):
+        return cls._VIDEO_URL_TEMPLATE % (random.choice(cls._SERVER_NUMBERS), num)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self.build_video_url(video_id)
+
+        title = self._html_search_regex(
+            r'<h1>(.+?)</h1>', webpage, 'title', flags=re.DOTALL)
+        description = self._html_search_regex(
+            r'<article id="descriptif">(.+?)</article>',
+            webpage, "description", fatal=False, flags=re.DOTALL)
+
+        thumbnail = self._search_regex(
+            r'<div id="mediaspace%s">\s*<img src="/?([^"]+)"' % video_id,
+            webpage, 'thumbnail', fatal=False)
+        if thumbnail:
+            thumbnail = 'http://www.pornovoisines.com/%s' % thumbnail
+
+        upload_date = unified_strdate(self._search_regex(
+            r'Publié le ([\d-]+)', webpage, 'upload date', fatal=False))
+        duration = int_or_none(self._search_regex(
+            'Durée (\d+)', webpage, 'duration', fatal=False))
+        view_count = int_or_none(self._search_regex(
+            r'(\d+) vues', webpage, 'view count', fatal=False))
+        average_rating = self._search_regex(
+            r'Note : (\d+,\d+)', webpage, 'average rating', fatal=False)
+        if average_rating:
+            average_rating = float_or_none(average_rating.replace(',', '.'))
+
+        categories = self._html_search_meta(
+            'keywords', webpage, 'categories', fatal=False)
+        if categories:
+            categories = [category.strip() for category in categories.split(',')]
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'duration': duration,
+            'view_count': view_count,
+            'average_rating': average_rating,
+            'categories': categories,
+            'age_limit': 18,
+        }
diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py
index 385681d06..7cc799664 100644
--- a/youtube_dl/extractor/prosiebensat1.py
+++ b/youtube_dl/extractor/prosiebensat1.py
@@ -10,6 +10,7 @@ from ..compat import (
 )
 from ..utils import (
     unified_strdate,
+    int_or_none,
 )
 
 
@@ -24,7 +25,7 @@ class ProSiebenSat1IE(InfoExtractor):
             'info_dict': {
                 'id': '2104602',
                 'ext': 'mp4',
-                'title': 'Staffel 2, Episode 18 - Jahresrückblick',
+                'title': 'Episode 18 - Staffel 2',
                 'description': 'md5:8733c81b702ea472e069bc48bb658fc1',
                 'upload_date': '20131231',
                 'duration': 5845.04,
@@ -266,6 +267,9 @@ class ProSiebenSat1IE(InfoExtractor):
             urls_sources = urls_sources.values()
 
         def fix_bitrate(bitrate):
+            bitrate = int_or_none(bitrate)
+            if not bitrate:
+                return None
             return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate
 
         for source in urls_sources:
diff --git a/youtube_dl/extractor/radiojavan.py b/youtube_dl/extractor/radiojavan.py
new file mode 100644
index 000000000..884c28420
--- /dev/null
+++ b/youtube_dl/extractor/radiojavan.py
@@ -0,0 +1,67 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import(
+    unified_strdate,
+    str_to_int,
+)
+
+
+class RadioJavanIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?radiojavan\.com/videos/video/(?P<id>[^/]+)/?'
+    _TEST = { +        'url': 'http://www.radiojavan.com/videos/video/chaartaar-ashoobam', +        'md5': 'e85208ffa3ca8b83534fca9fe19af95b', +        'info_dict': { +            'id': 'chaartaar-ashoobam', +            'ext': 'mp4', +            'title': 'Chaartaar - Ashoobam', +            'thumbnail': 're:^https?://.*\.jpe?g$', +            'upload_date': '20150215', +            'view_count': int, +            'like_count': int, +            'dislike_count': int, +        } +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        formats = [{ +            'url': 'https://media.rdjavan.com/media/music_video/%s' % video_path, +            'format_id': '%sp' % height, +            'height': int(height), +        } for height, video_path in re.findall(r"RJ\.video(\d+)p\s*=\s*'/?([^']+)'", webpage)] +        self._sort_formats(formats) + +        title = self._og_search_title(webpage) +        thumbnail = self._og_search_thumbnail(webpage) + +        upload_date = unified_strdate(self._search_regex( +            r'class="date_added">Date added: ([^<]+)<', +            webpage, 'upload date', fatal=False)) + +        view_count = str_to_int(self._search_regex( +            r'class="views">Plays: ([\d,]+)', +            webpage, 'view count', fatal=False)) +        like_count = str_to_int(self._search_regex( +            r'class="rating">([\d,]+) likes', +            webpage, 'like count', fatal=False)) +        dislike_count = str_to_int(self._search_regex( +            r'class="rating">([\d,]+) dislikes', +            webpage, 'dislike count', fatal=False)) + +        return { +            'id': video_id, +            'title': title, +            'thumbnail': thumbnail, +            'upload_date': upload_date, +            'view_count': view_count, +            'like_count': like_count, +            'dislike_count': dislike_count, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 846b76c81..d6054d717 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -1,17 +1,19 @@  from __future__ import unicode_literals  from .common import InfoExtractor +from ..utils import ExtractorError  class RedTubeIE(InfoExtractor):      _VALID_URL = r'http://(?:www\.)?redtube\.com/(?P<id>[0-9]+)'      _TEST = {          'url': 'http://www.redtube.com/66418', +        'md5': '7b8c22b5e7098a3e1c09709df1126d2d',          'info_dict': {              'id': '66418',              'ext': 'mp4', -            "title": "Sucked on a toilet", -            "age_limit": 18, +            'title': 'Sucked on a toilet', +            'age_limit': 18,          }      } @@ -19,6 +21,9 @@ class RedTubeIE(InfoExtractor):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) +        if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']): +            raise ExtractorError('Video %s has been removed' % video_id, expected=True) +          video_url = self._html_search_regex(              r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')          video_title = self._html_search_regex( diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 13f071077..8d9be1b98 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -10,6 +10,7 @@ from ..compat import compat_urlparse  from ..utils import (      
float_or_none,      remove_end, +    std_headers,      struct_unpack,  ) @@ -84,13 +85,20 @@ class RTVEALaCartaIE(InfoExtractor):          'only_matching': True,      }] +    def _real_initialize(self): +        user_agent_b64 = base64.b64encode(std_headers['User-Agent'].encode('utf-8')).decode('utf-8') +        manager_info = self._download_json( +            'http://www.rtve.es/odin/loki/' + user_agent_b64, +            None, 'Fetching manager info') +        self._manager = manager_info['manager'] +      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id')          info = self._download_json(              'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id,              video_id)['page']['items'][0] -        png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % video_id +        png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id)          png = self._download_webpage(png_url, video_id, 'Downloading url information')          video_url = _decrypt_url(png)          if not video_url.endswith('.f4m'): diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py new file mode 100644 index 000000000..10251f29e --- /dev/null +++ b/youtube_dl/extractor/safari.py @@ -0,0 +1,157 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .brightcove import BrightcoveIE + +from ..compat import ( +    compat_urllib_parse, +    compat_urllib_request, +) +from ..utils import ( +    ExtractorError, +    smuggle_url, +    std_headers, +) + + +class SafariBaseIE(InfoExtractor): +    _LOGIN_URL = 'https://www.safaribooksonline.com/accounts/login/' +    _SUCCESSFUL_LOGIN_REGEX = r'<a href="/accounts/logout/"[^>]*>Sign Out</a>' +    _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to supply credentials for safaribooksonline.com' +    _NETRC_MACHINE = 'safari' + +    _API_BASE = 'https://www.safaribooksonline.com/api/v1/book' +    _API_FORMAT = 'json' + +    LOGGED_IN = False + +    def _real_initialize(self): +        # We only need to log in once for courses or individual videos +        if not self.LOGGED_IN: +            self._login() +            SafariBaseIE.LOGGED_IN = True + +    def _login(self): +        (username, password) = self._get_login_info() +        if username is None: +            raise ExtractorError( +                self._ACCOUNT_CREDENTIALS_HINT, +                expected=True) + +        headers = std_headers.copy()  # work on a copy so the global std_headers is not mutated +        if 'Referer' not in headers: +            headers['Referer'] = self._LOGIN_URL + +        login_page = self._download_webpage( +            self._LOGIN_URL, None, +            'Downloading login form') + +        csrf = self._html_search_regex( +            r"name='csrfmiddlewaretoken'\s+value='([^']+)'", +            login_page, 'csrf token') + +        login_form = { +            'csrfmiddlewaretoken': csrf, +            'email': username, +            'password1': password, +            'login': 'Sign In', +            'next': '', +        } + +        request = compat_urllib_request.Request( +            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'), headers=headers) +        login_page = self._download_webpage( +            request, None, 'Logging in as %s' % username) + +        if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None: +            raise ExtractorError( +                'Login failed; 
make sure your credentials are correct and try again.', +                expected=True) + +        self.to_screen('Login successful') + + +class SafariIE(SafariBaseIE): +    IE_NAME = 'safari' +    IE_DESC = 'safaribooksonline.com online video' +    _VALID_URL = r'''(?x)https?:// +                            (?:www\.)?safaribooksonline\.com/ +                                (?: +                                    library/view/[^/]+| +                                    api/v1/book +                                )/ +                                (?P<course_id>\d+)/ +                                    (?:chapter(?:-content)?/)? +                                (?P<part>part\d+)\.html +    ''' + +    _TESTS = [{ +        'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', +        'md5': '5b0c4cc1b3c1ba15dda7344085aa5592', +        'info_dict': { +            'id': '2842601850001', +            'ext': 'mp4', +            'title': 'Introduction', +        }, +        'skip': 'Requires safaribooksonline account credentials', +    }, { +        'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        course_id = mobj.group('course_id') +        part = mobj.group('part') + +        webpage = self._download_webpage( +            '%s/%s/chapter-content/%s.html' % (self._API_BASE, course_id, part), +            part) + +        bc_url = BrightcoveIE._extract_brightcove_url(webpage) +        if not bc_url: +            raise ExtractorError('Could not extract Brightcove URL from %s' % url, expected=True) + +        return self.url_result(smuggle_url(bc_url, {'Referer': url}), 'Brightcove') + + +class SafariCourseIE(SafariBaseIE): +    IE_NAME = 'safari:course' +    IE_DESC = 'safaribooksonline.com online courses' + +    _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P<id>\d+)/?(?:[#?]|$)' + +    _TESTS = [{ +        'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/', +        'info_dict': { +            'id': '9780133392838', +            'title': 'Hadoop Fundamentals LiveLessons', +        }, +        'playlist_count': 22, +        'skip': 'Requires safaribooksonline account credentials', +    }, { +        'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        course_id = self._match_id(url) + +        course_json = self._download_json( +            '%s/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT), +            course_id, 'Downloading course JSON') + +        if 'chapters' not in course_json: +            raise ExtractorError( +                'No chapters found for course %s' % course_id, expected=True) + +        entries = [ +            self.url_result(chapter, 'Safari') +            for chapter in course_json['chapters']] + +        course_title = course_json['title'] + +        return self.playlist_result(entries, course_id, course_title) diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py index 9f79ff5c1..0b717a1e4 100644 --- a/youtube_dl/extractor/slideshare.py +++ b/youtube_dl/extractor/slideshare.py @@ -30,7 +30,7 @@ class SlideshareIE(InfoExtractor):          page_title = 
mobj.group('title')          webpage = self._download_webpage(url, page_title)          slideshare_obj = self._search_regex( -            r'var\s+slideshare_object\s*=\s*({.*?});\s*var\s+user_info\s*=', +            r'\$\.extend\(slideshare_object,\s*(\{.*?\})\);',              webpage, 'slideshare object')          info = json.loads(slideshare_obj)          if info['slideshow']['type'] != 'video': diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 9d4505972..316b2c90f 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -242,7 +242,7 @@ class SoundcloudIE(InfoExtractor):  class SoundcloudSetIE(SoundcloudIE): -    _VALID_URL = r'https?://(?:www\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?' +    _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?'      IE_NAME = 'soundcloud:set'      _TESTS = [{          'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep', @@ -287,7 +287,7 @@ class SoundcloudSetIE(SoundcloudIE):  class SoundcloudUserIE(SoundcloudIE): -    _VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$' +    _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$'      IE_NAME = 'soundcloud:user'      _TESTS = [{          'url': 'https://soundcloud.com/the-concept-band', diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 7cb06f351..a46a7ecba 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -54,7 +54,7 @@ class TeamcocoIE(InfoExtractor):              embed_url, video_id, 'Downloading embed page')          player_data = self._parse_json(self._search_regex( -            r'Y\.Ginger\.Module\.Player\((\{.*?\})\);', embed, 'player data'), video_id) +            r'Y\.Ginger\.Module\.Player(?:;var\s*player\s*=\s*new\s*m)?\((\{.*?\})\);', embed, 'player data'), video_id)          data = self._parse_json(              base64.b64decode(player_data['preload'].encode('ascii')).decode('utf-8'), video_id) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index feac666f7..0e3e627f4 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -92,7 +92,7 @@ class ThePlatformIE(InfoExtractor):              error_msg = next(                  n.attrib['abstract']                  for n in meta.findall(_x('.//smil:ref')) -                if n.attrib.get('title') == 'Geographic Restriction') +                if n.attrib.get('title') in ('Geographic Restriction', 'Expired'))          except StopIteration:              pass          else: diff --git a/youtube_dl/extractor/twentytwotracks.py b/youtube_dl/extractor/twentytwotracks.py new file mode 100644 index 000000000..d6c0ab184 --- /dev/null +++ b/youtube_dl/extractor/twentytwotracks.py @@ -0,0 +1,86 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + +# 22Tracks regularly replaces the audio tracks that can be streamed on its +# site. The tracks usually expire after one month, so we can't add tests. 
+ + +class TwentyTwoTracksIE(InfoExtractor): +    _VALID_URL = r'https?://22tracks\.com/(?P<city>[a-z]+)/(?P<genre>[\da-z]+)/(?P<id>\d+)' +    IE_NAME = '22tracks:track' + +    _API_BASE = 'http://22tracks.com/api' + +    def _extract_info(self, city, genre_name, track_id=None): +        item_id = track_id if track_id else genre_name + +        cities = self._download_json( +            '%s/cities' % self._API_BASE, item_id, +            'Downloading cities info', +            'Unable to download cities info') +        city_id = [x['id'] for x in cities if x['slug'] == city][0] + +        genres = self._download_json( +            '%s/genres/%s' % (self._API_BASE, city_id), item_id, +            'Downloading %s genres info' % city, +            'Unable to download %s genres info' % city) +        genre = [x for x in genres if x['slug'] == genre_name][0] +        genre_id = genre['id'] + +        tracks = self._download_json( +            '%s/tracks/%s' % (self._API_BASE, genre_id), item_id, +            'Downloading %s genre tracks info' % genre_name, +            'Unable to download track info') + +        return [x for x in tracks if x['id'] == item_id][0] if track_id else [genre['title'], tracks] + +    def _get_track_url(self, filename, track_id): +        token = self._download_json( +            'http://22tracks.com/token.php?desktop=true&u=/128/%s' % filename, +            track_id, 'Downloading token', 'Unable to download token') +        return 'http://audio.22tracks.com%s?st=%s&e=%d' % (token['filename'], token['st'], token['e']) + +    def _extract_track_info(self, track_info, track_id): +        download_url = self._get_track_url(track_info['filename'], track_id) +        title = '%s - %s' % (track_info['artist'].strip(), track_info['title'].strip()) +        return { +            'id': track_id, +            'url': download_url, +            'ext': 'mp3', +            'title': title, +            'duration': int_or_none(track_info.get('duration')), +            'timestamp': int_or_none(track_info.get('published_at') or track_info.get('created')) +        } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) + +        city = mobj.group('city') +        genre = mobj.group('genre') +        track_id = mobj.group('id') + +        track_info = self._extract_info(city, genre, track_id) +        return self._extract_track_info(track_info, track_id) + + +class TwentyTwoTracksGenreIE(TwentyTwoTracksIE): +    _VALID_URL = r'https?://22tracks\.com/(?P<city>[a-z]+)/(?P<genre>[\da-z]+)/?$' +    IE_NAME = '22tracks:genre' + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) + +        city = mobj.group('city') +        genre = mobj.group('genre') + +        genre_title, tracks = self._extract_info(city, genre) + +        entries = [ +            self._extract_track_info(track_info, track_info['id']) +            for track_info in tracks] + +        return self.playlist_result(entries, genre, genre_title) diff --git a/youtube_dl/extractor/ultimedia.py b/youtube_dl/extractor/ultimedia.py index 06554a1be..96c809eaf 100644 --- a/youtube_dl/extractor/ultimedia.py +++ b/youtube_dl/extractor/ultimedia.py @@ -42,7 +42,6 @@ class UltimediaIE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url) -          webpage = self._download_webpage(url, video_id)          deliver_url = self._search_regex( @@ -81,8 +80,8 @@ class UltimediaIE(InfoExtractor):          title = clean_html((              
self._html_search_regex(                  r'(?s)<div\s+id="catArticle">.+?</div>(.+?)</h1>', -                webpage, 'title', default=None) -            or self._search_regex( +                webpage, 'title', default=None) or +            self._search_regex(                  r"var\s+nameVideo\s*=\s*'([^']+)'",                  deliver_page, 'title'))) diff --git a/youtube_dl/extractor/varzesh3.py b/youtube_dl/extractor/varzesh3.py new file mode 100644 index 000000000..9369abaf8 --- /dev/null +++ b/youtube_dl/extractor/varzesh3.py @@ -0,0 +1,45 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class Varzesh3IE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?video\.varzesh3\.com/(?:[^/]+/)+(?P<id>[^/]+)/?' +    _TEST = { +        'url': 'http://video.varzesh3.com/germany/bundesliga/5-%D9%88%D8%A7%DA%A9%D9%86%D8%B4-%D8%A8%D8%B1%D8%AA%D8%B1-%D8%AF%D8%B1%D9%88%D8%A7%D8%B2%D9%87%E2%80%8C%D8%A8%D8%A7%D9%86%D8%A7%D9%86%D8%9B%D9%87%D9%81%D8%AA%D9%87-26-%D8%A8%D9%88%D9%86%D8%AF%D8%B3/', +        'md5': '2a933874cb7dce4366075281eb49e855', +        'info_dict': { +            'id': '76337', +            'ext': 'mp4', +            'title': '۵ واکنش برتر دروازهبانان؛هفته ۲۶ بوندسلیگا', +            'description': 'فصل ۲۰۱۵-۲۰۱۴', +            'thumbnail': 're:^https?://.*\.jpg$', +        } +    } + +    def _real_extract(self, url): +        display_id = self._match_id(url) + +        webpage = self._download_webpage(url, display_id) + +        video_url = self._search_regex( +            r'<source[^>]+src="([^"]+)"', webpage, 'video url') + +        title = self._og_search_title(webpage) +        description = self._html_search_regex( +            r'(?s)<div class="matn">(.+?)</div>', +            webpage, 'description', fatal=False) +        thumbnail = self._og_search_thumbnail(webpage) + +        video_id = self._search_regex( +            r"<link[^>]+rel='(?:canonical|shortlink)'[^>]+href='/\?p=([^']+)'", +            webpage, display_id, default=display_id) + +        return { +            'url': video_url, +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +        } diff --git a/youtube_dl/extractor/vessel.py b/youtube_dl/extractor/vessel.py new file mode 100644 index 000000000..6215f0642 --- /dev/null +++ b/youtube_dl/extractor/vessel.py @@ -0,0 +1,127 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..compat import compat_urllib_request +from ..utils import ( +    ExtractorError, +    parse_iso8601, +) + + +class VesselIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?vessel\.com/videos/(?P<id>[0-9a-zA-Z]+)' +    _API_URL_TEMPLATE = 'https://www.vessel.com/api/view/items/%s' +    _LOGIN_URL = 'https://www.vessel.com/api/account/login' +    _NETRC_MACHINE = 'vessel' +    _TEST = { +        'url': 'https://www.vessel.com/videos/HDN7G5UMs', +        'md5': '455cdf8beb71c6dd797fd2f3818d05c4', +        'info_dict': { +            'id': 'HDN7G5UMs', +            'ext': 'mp4', +            'title': 'Nvidia GeForce GTX Titan X - The Best Video Card on the Market?', +            'thumbnail': 're:^https?://.*\.jpg$', +            'upload_date': '20150317', +            'description': 'Did Nvidia pull out all the stops on the Titan X, or does its performance leave something to be desired?', +            'timestamp': int, +        }, +    } + +    @staticmethod +    def 
make_json_request(url, data): +        payload = json.dumps(data).encode('utf-8') +        req = compat_urllib_request.Request(url, payload) +        req.add_header('Content-Type', 'application/json; charset=utf-8') +        return req + +    @staticmethod +    def find_assets(data, asset_type): +        for asset in data.get('assets', []): +            if asset.get('type') == asset_type: +                yield asset + +    def _check_access_rights(self, data): +        access_info = data.get('__view', {}) +        if not access_info.get('allow_access', True): +            err_code = access_info.get('error_code') or '' +            if err_code == 'ITEM_PAID_ONLY': +                raise ExtractorError( +                    'This video requires subscription.', expected=True) +            else: +                raise ExtractorError( +                    'Access to this content is restricted. (%s said: %s)' % (self.IE_NAME, err_code), expected=True) + +    def _login(self): +        (username, password) = self._get_login_info() +        if username is None: +            return +        self.report_login() +        data = { +            'client_id': 'web', +            'type': 'password', +            'user_key': username, +            'password': password, +        } +        login_request = VesselIE.make_json_request(self._LOGIN_URL, data) +        self._download_webpage(login_request, None, False, 'Wrong login info') + +    def _real_initialize(self): +        self._login() + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) +        data = self._parse_json(self._search_regex( +            r'App\.bootstrapData\((.*?)\);', webpage, 'data'), video_id) +        asset_id = data['model']['data']['id'] + +        req = VesselIE.make_json_request( +            self._API_URL_TEMPLATE % asset_id, {'client': 'web'}) +        data = self._download_json(req, video_id) + +        self._check_access_rights(data) + +        try: +            video_asset = next(VesselIE.find_assets(data, 'video')) +        except StopIteration: +            raise ExtractorError('No video assets found') + +        formats = [] +        for f in video_asset.get('sources', []): +            if f['name'] == 'hls-index': +                formats.extend(self._extract_m3u8_formats( +                    f['location'], video_id, ext='mp4', m3u8_id='m3u8')) +            else: +                formats.append({ +                    'format_id': f['name'], +                    'tbr': f.get('bitrate'), +                    'height': f.get('height'), +                    'width': f.get('width'), +                    'url': f['location'], +                }) +        self._sort_formats(formats) + +        thumbnails = [] +        for im_asset in VesselIE.find_assets(data, 'image'): +            thumbnails.append({ +                'url': im_asset['location'], +                'width': im_asset.get('width', 0), +                'height': im_asset.get('height', 0), +            }) + +        return { +            'id': video_id, +            'title': data['title'], +            'formats': formats, +            'thumbnails': thumbnails, +            'description': data.get('short_description'), +            'duration': data.get('duration'), +            'comment_count': data.get('comment_count'), +            'like_count': data.get('like_count'), +            'view_count': data.get('view_count'), +            'timestamp': 
parse_iso8601(data.get('released_at')), +        } diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index bd09652cd..28bcc89cd 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -244,6 +244,16 @@ class VimeoIE(VimeoBaseInfoExtractor):          # and latter we extract those that are Vimeo specific.          self.report_extraction(video_id) +        vimeo_config = self._search_regex( +            r'vimeo\.config\s*=\s*({.+?});', webpage, +            'vimeo config', default=None) +        if vimeo_config: +            seed_status = self._parse_json(vimeo_config, video_id).get('seed_status', {}) +            if seed_status.get('state') == 'failed': +                raise ExtractorError( +                    '%s returned error: %s' % (self.IE_NAME, seed_status['title']), +                    expected=True) +          # Extract the config JSON          try:              try: diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py index 4971965f9..81d885fdc 100644 --- a/youtube_dl/extractor/xuite.py +++ b/youtube_dl/extractor/xuite.py @@ -69,18 +69,26 @@ class XuiteIE(InfoExtractor):          'only_matching': True,      }] +    @staticmethod +    def base64_decode_utf8(data): +        return base64.b64decode(data.encode('utf-8')).decode('utf-8') + +    @staticmethod +    def base64_encode_utf8(data): +        return base64.b64encode(data.encode('utf-8')).decode('utf-8') +      def _extract_flv_config(self, media_id): -        base64_media_id = base64.b64encode(media_id.encode('utf-8')).decode('utf-8') +        base64_media_id = self.base64_encode_utf8(media_id)          flv_config = self._download_xml(              'http://vlog.xuite.net/flash/player?media=%s' % base64_media_id,              'flv config')          prop_dict = {}          for prop in flv_config.findall('./property'): -            prop_id = base64.b64decode(prop.attrib['id']).decode('utf-8') +            prop_id = self.base64_decode_utf8(prop.attrib['id'])              # CDATA may be empty in flv config              if not prop.text:                  continue -            encoded_content = base64.b64decode(prop.text).decode('utf-8') +            encoded_content = self.base64_decode_utf8(prop.text)              prop_dict[prop_id] = compat_urllib_parse_unquote(encoded_content)          return prop_dict diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 97dbac4cc..b777159c5 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -17,6 +17,8 @@ from ..utils import (      int_or_none,  ) +from .nbc import NBCSportsVPlayerIE +  class YahooIE(InfoExtractor):      IE_DESC = 'Yahoo screen and movies' @@ -129,6 +131,15 @@ class YahooIE(InfoExtractor):          }, {              'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html',              'only_matching': True, +        }, { +            'note': 'NBC Sports embeds', +            'url': 'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313', +            'info_dict': { +                'id': '9CsDKds0kvHI', +                'ext': 'flv', +                'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', +                'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', +            }          }      ] @@ -151,6 +162,10 @@ class YahooIE(InfoExtractor):                  items = 
json.loads(items_json)                  video_id = items[0]['id']                  return self._get_info(video_id, display_id, webpage) +        # Look for NBCSports iframes +        nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) +        if nbc_sports_url: +            return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')          items_json = self._search_regex(              r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE, diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index e4c855ee0..6abe72f73 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -52,7 +52,7 @@ class YouPornIE(InfoExtractor):              webpage, 'JSON parameters')          try:              params = json.loads(json_params) -        except: +        except ValueError:              raise ExtractorError('Invalid JSON')          self.report_extraction(video_id) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 27c8c4453..5488101e1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1263,27 +1263,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):          return self.playlist_result(url_results, playlist_id, title) -    def _real_extract(self, url): -        # Extract playlist id -        mobj = re.match(self._VALID_URL, url) -        if mobj is None: -            raise ExtractorError('Invalid URL: %s' % url) -        playlist_id = mobj.group(1) or mobj.group(2) - -        # Check if it's a video-specific URL -        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) -        if 'v' in query_dict: -            video_id = query_dict['v'][0] -            if self._downloader.params.get('noplaylist'): -                self.to_screen('Downloading just video %s because of --no-playlist' % video_id) -                return self.url_result(video_id, 'Youtube', video_id=video_id) -            else: -                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - -        if playlist_id.startswith('RD') or playlist_id.startswith('UL'): -            # Mixes require a custom extraction process -            return self._extract_mix(playlist_id) - +    def _extract_playlist(self, playlist_id):          url = self._TEMPLATE_URL % playlist_id          page = self._download_webpage(url, playlist_id)          more_widget_html = content_html = page @@ -1327,6 +1307,29 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):          url_results = self._ids_to_results(ids)          return self.playlist_result(url_results, playlist_id, playlist_title) +    def _real_extract(self, url): +        # Extract playlist id +        mobj = re.match(self._VALID_URL, url) +        if mobj is None: +            raise ExtractorError('Invalid URL: %s' % url) +        playlist_id = mobj.group(1) or mobj.group(2) + +        # Check if it's a video-specific URL +        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) +        if 'v' in query_dict: +            video_id = query_dict['v'][0] +            if self._downloader.params.get('noplaylist'): +                self.to_screen('Downloading just video %s because of --no-playlist' % video_id) +                return self.url_result(video_id, 'Youtube', video_id=video_id) +            else: +                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + +       
 if playlist_id.startswith('RD') or playlist_id.startswith('UL'): +            # Mixes require a custom extraction process +            return self._extract_mix(playlist_id) + +        return self._extract_playlist(playlist_id) +  class YoutubeChannelIE(InfoExtractor):      IE_DESC = 'YouTube.com channels' @@ -1643,21 +1646,26 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):  class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): +    IE_NAME = 'youtube:recommended'      IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'      _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'      _FEED_NAME = 'recommended'      _PLAYLIST_TITLE = 'Youtube Recommended videos' -class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): +class YoutubeWatchLaterIE(YoutubePlaylistIE): +    IE_NAME = 'youtube:watchlater'      IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' -    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater' -    _FEED_NAME = 'watch_later' -    _PLAYLIST_TITLE = 'Youtube Watch Later' -    _PERSONAL_FEED = True +    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater' + +    _TESTS = []  # override PlaylistIE tests + +    def _real_extract(self, url): +        return self._extract_playlist('WL')  class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): +    IE_NAME = 'youtube:history'      IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'      _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'      _FEED_NAME = 'history' diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 68193a271..9bded4521 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -13,6 +13,7 @@ from .compat import (      compat_kwargs,  )  from .utils import ( +    preferredencoding,      write_string,  )  from .version import __version__ @@ -797,7 +798,7 @@ def parseOpts(overrideArguments=None):          # Workaround for Python 2.x, where argv is a byte list          if sys.version_info < (3,):              command_line_conf = [ -                a.decode('utf-8', 'replace') for a in command_line_conf] +                a.decode(preferredencoding(), 'replace') for a in command_line_conf]          if '--ignore-config' in command_line_conf:              system_conf = [] diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index b6f51cfd5..0b60ac7e7 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals  import io  import os  import subprocess -import sys  import time @@ -118,6 +117,10 @@ class FFmpegPostProcessor(PostProcessor):          return self._paths[self.basename]      @property +    def probe_available(self): +        return self.probe_basename is not None + +    @property      def probe_executable(self):          return self._paths[self.probe_basename] @@ -169,7 +172,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):      def get_audio_codec(self, path): -        if not self.probe_executable: +        if not self.probe_available:              raise PostProcessingError('ffprobe or avprobe not found. 
Please install one.')          try:              cmd = [ @@ -269,19 +272,17 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):              else:                  self._downloader.to_screen('[' + self.basename + '] Destination: ' + new_path)                  self.run_ffmpeg(path, new_path, acodec, more_opts) -        except: -            etype, e, tb = sys.exc_info() -            if isinstance(e, AudioConversionError): -                msg = 'audio conversion failed: ' + e.msg -            else: -                msg = 'error running ' + self.basename -            raise PostProcessingError(msg) +        except AudioConversionError as e: +            raise PostProcessingError( +                'audio conversion failed: ' + e.msg) +        except Exception: +            raise PostProcessingError('error running ' + self.basename)          # Try to update the date time for extracted audio file.          if information.get('filetime') is not None:              try:                  os.utime(encodeFilename(new_path), (time.time(), information['filetime'])) -            except: +            except Exception:                  self._downloader.report_warning('Cannot update utime of audio file')          information['filepath'] = new_path diff --git a/youtube_dl/update.py b/youtube_dl/update.py index d8be4049f..de3169eef 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -65,7 +65,7 @@ def update_self(to_screen, verbose):      # Check if there is a new version      try:          newversion = opener.open(VERSION_URL).read().decode('utf-8').strip() -    except: +    except Exception:          if verbose:              to_screen(compat_str(traceback.format_exc()))          to_screen('ERROR: can\'t find the current version. Please try again later.') @@ -78,7 +78,7 @@ def update_self(to_screen, verbose):      try:          versions_info = opener.open(JSON_URL).read().decode('utf-8')          versions_info = json.loads(versions_info) -    except: +    except Exception:          if verbose:              to_screen(compat_str(traceback.format_exc()))          to_screen('ERROR: can\'t obtain versions info. 
Please try again later.') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 472d4df41..52f0dd09a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -75,7 +75,7 @@ def preferredencoding():      try:          pref = locale.getpreferredencoding()          'TEST'.encode(pref) -    except: +    except Exception:          pref = 'UTF-8'      return pref @@ -127,7 +127,7 @@ def write_json_file(obj, fn):              except OSError:                  pass          os.rename(tf.name, fn) -    except: +    except Exception:          try:              os.remove(tf.name)          except OSError: @@ -348,7 +348,7 @@ def _htmlentity_transform(entity):      if entity in compat_html_entities.name2codepoint:          return compat_chr(compat_html_entities.name2codepoint[entity]) -    mobj = re.match(r'#(x?[0-9]+)', entity) +    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)      if mobj is not None:          numstr = mobj.group(1)          if numstr.startswith('x'): @@ -730,7 +730,8 @@ def unified_strdate(date_str, day_first=True):      # Replace commas      date_str = date_str.replace(',', ' ')      # %z (UTC offset) is only supported in python>=3.2 -    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str) +    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str): +        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)      # Remove AM/PM + timezone      date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) @@ -759,6 +760,7 @@ def unified_strdate(date_str, day_first=True):      ]      if day_first:          format_expressions.extend([ +            '%d-%m-%Y',              '%d.%m.%Y',              '%d/%m/%Y',              '%d/%m/%y', @@ -766,6 +768,7 @@ def unified_strdate(date_str, day_first=True):          ])      else:          format_expressions.extend([ +            '%m-%d-%Y',              '%m.%d.%Y',              '%m/%d/%Y',              '%m/%d/%y', @@ -1577,7 +1580,7 @@ def js_to_json(code):          '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|          [a-zA-Z_][.a-zA-Z_0-9]*          ''', fix_kv, code) -    res = re.sub(r',(\s*\])', lambda m: m.group(1), res) +    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)      return res diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 039ceadf2..e1c385bec 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@  from __future__ import unicode_literals -__version__ = '2015.03.24' +__version__ = '2015.04.03' | 
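Editor's note (not part of the release diff): the utils.py hunks above change three parsing helpers. As a quick, hypothetical sanity check — assuming the patched 2015.04.03 youtube_dl tree is on the import path — the new behaviour can be exercised like this:

    import re

    from youtube_dl.utils import unified_strdate

    # js_to_json now strips trailing commas before '}' as well as ']'
    # (same substitution as the changed line in js_to_json).
    assert re.sub(r',(\s*[\]}])', lambda m: m.group(1), '{"a": [1, 2,],}') == '{"a": [1, 2]}'

    # _htmlentity_transform now consumes hex digits after 'x', so an entity
    # like '&#x2F;' resolves fully instead of being cut off at the first
    # non-decimal digit ('x2F' -> 0x2F -> '/').
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', '#x2F')
    assert mobj is not None and int(mobj.group(1)[1:], 16) == 0x2F

    # unified_strdate: DD-MM-YYYY dates are no longer mangled by the UTC-offset
    # stripper, and the new '%d-%m-%Y' / '%m-%d-%Y' expressions parse them.
    assert unified_strdate('15-03-2015') == '20150315'
    assert unified_strdate('03-15-2015', day_first=False) == '20150315'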
