aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r--youtube_dl/extractor/__init__.py14
-rw-r--r--youtube_dl/extractor/abc.py3
-rw-r--r--youtube_dl/extractor/anysex.py4
-rw-r--r--youtube_dl/extractor/ard.py2
-rw-r--r--youtube_dl/extractor/common.py54
-rw-r--r--youtube_dl/extractor/crunchyroll.py8
-rw-r--r--youtube_dl/extractor/divxstage.py4
-rw-r--r--youtube_dl/extractor/dropbox.py15
-rw-r--r--youtube_dl/extractor/eitb.py24
-rw-r--r--youtube_dl/extractor/extremetube.py11
-rw-r--r--youtube_dl/extractor/flickr.py20
-rw-r--r--youtube_dl/extractor/generic.py31
-rw-r--r--youtube_dl/extractor/golem.py71
-rw-r--r--youtube_dl/extractor/heise.py81
-rw-r--r--youtube_dl/extractor/mgoon.py87
-rw-r--r--youtube_dl/extractor/muenchentv.py7
-rw-r--r--youtube_dl/extractor/nbc.py4
-rw-r--r--youtube_dl/extractor/nfl.py103
-rw-r--r--youtube_dl/extractor/noco.py73
-rw-r--r--youtube_dl/extractor/npo.py30
-rw-r--r--youtube_dl/extractor/oktoberfesttv.py47
-rw-r--r--youtube_dl/extractor/playfm.py6
-rw-r--r--youtube_dl/extractor/sbs.py8
-rw-r--r--youtube_dl/extractor/sport5.py92
-rw-r--r--youtube_dl/extractor/theplatform.py53
-rw-r--r--youtube_dl/extractor/thvideo.py59
-rw-r--r--youtube_dl/extractor/tube8.py37
-rw-r--r--youtube_dl/extractor/vbox7.py3
-rw-r--r--youtube_dl/extractor/vevo.py67
-rw-r--r--youtube_dl/extractor/vube.py33
-rw-r--r--youtube_dl/extractor/wat.py11
-rw-r--r--youtube_dl/extractor/wistia.py15
-rw-r--r--youtube_dl/extractor/ynet.py54
-rw-r--r--youtube_dl/extractor/youku.py100
-rw-r--r--youtube_dl/extractor/yourupload.py58
-rw-r--r--youtube_dl/extractor/youtube.py326
36 files changed, 1289 insertions, 326 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 9ee3f9190..6ab3eeaf5 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -135,12 +135,14 @@ from .gametrailers import GametrailersIE
from .gdcvault import GDCVaultIE
from .generic import GenericIE
from .godtube import GodTubeIE
+from .golem import GolemIE
from .googleplus import GooglePlusIE
from .googlesearch import GoogleSearchIE
from .gorillavid import GorillaVidIE
from .goshgay import GoshgayIE
from .grooveshark import GroovesharkIE
from .hark import HarkIE
+from .heise import HeiseIE
from .helsinki import HelsinkiIE
from .hentaistigma import HentaiStigmaIE
from .hornbunny import HornBunnyIE
@@ -199,6 +201,7 @@ from .malemotion import MalemotionIE
from .mdr import MDRIE
from .metacafe import MetacafeIE
from .metacritic import MetacriticIE
+from .mgoon import MgoonIE
from .ministrygrid import MinistryGridIE
from .mit import TechTVMITIE, MITIE, OCWMITIE
from .mitele import MiTeleIE
@@ -239,6 +242,7 @@ from .ndtv import NDTVIE
from .newgrounds import NewgroundsIE
from .newstube import NewstubeIE
from .nfb import NFBIE
+from .nfl import NFLIE
from .nhl import NHLIE, NHLVideocenterIE
from .niconico import NiconicoIE
from .ninegag import NineGagIE
@@ -248,7 +252,10 @@ from .nosvideo import NosVideoIE
from .novamov import NovaMovIE
from .nowness import NownessIE
from .nowvideo import NowVideoIE
-from .npo import NPOIE
+from .npo import (
+ NPOIE,
+ TegenlichtVproIE,
+)
from .nrk import (
NRKIE,
NRKTVIE,
@@ -256,6 +263,7 @@ from .nrk import (
from .ntv import NTVIE
from .nytimes import NYTimesIE
from .nuvid import NuvidIE
+from .oktoberfesttv import OktoberfestTVIE
from .ooyala import OoyalaIE
from .orf import (
ORFTVthekIE,
@@ -335,6 +343,7 @@ from .spankwire import SpankwireIE
from .spiegel import SpiegelIE, SpiegelArticleIE
from .spiegeltv import SpiegeltvIE
from .spike import SpikeIE
+from .sport5 import Sport5IE
from .sportdeutschland import SportDeutschlandIE
from .stanfordoc import StanfordOpenClassroomIE
from .steam import SteamIE
@@ -362,6 +371,7 @@ from .thisav import ThisAVIE
from .tinypic import TinyPicIE
from .tlc import TlcIE, TlcDeIE
from .tnaflix import TNAFlixIE
+from .thvideo import THVideoIE
from .toutv import TouTvIE
from .toypics import ToypicsUserIE, ToypicsIE
from .traileraddict import TrailerAddictIE
@@ -445,9 +455,11 @@ from .yahoo import (
YahooNewsIE,
YahooSearchIE,
)
+from .ynet import YnetIE
from .youjizz import YouJizzIE
from .youku import YoukuIE
from .youporn import YouPornIE
+from .yourupload import YourUploadIE
from .youtube import (
YoutubeIE,
YoutubeChannelIE,
diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py
index 7d89f44ee..69f89320c 100644
--- a/youtube_dl/extractor/abc.py
+++ b/youtube_dl/extractor/abc.py
@@ -22,8 +22,7 @@ class ABCIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
urls_info_json = self._search_regex(
diff --git a/youtube_dl/extractor/anysex.py b/youtube_dl/extractor/anysex.py
index bc64423a3..ad86d6e58 100644
--- a/youtube_dl/extractor/anysex.py
+++ b/youtube_dl/extractor/anysex.py
@@ -35,7 +35,7 @@ class AnySexIE(InfoExtractor):
title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title')
description = self._html_search_regex(
- r'<div class="description">([^<]+)</div>', webpage, 'description', fatal=False)
+ r'<div class="description"[^>]*>([^<]+)</div>', webpage, 'description', fatal=False)
thumbnail = self._html_search_regex(
r'preview_url\s*:\s*\'(.*?)\'', webpage, 'thumbnail', fatal=False)
@@ -43,7 +43,7 @@ class AnySexIE(InfoExtractor):
r'<a href="http://anysex\.com/categories/[^"]+" title="[^"]*">([^<]+)</a>', webpage)
duration = parse_duration(self._search_regex(
- r'<b>Duration:</b> (\d+:\d+)', webpage, 'duration', fatal=False))
+ r'<b>Duration:</b> (?:<q itemprop="duration">)?(\d+:\d+)', webpage, 'duration', fatal=False))
view_count = int_or_none(self._html_search_regex(
r'<b>Views:</b> (\d+)', webpage, 'view count', fatal=False))
diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py
index 54cec1c2f..8de9c11ea 100644
--- a/youtube_dl/extractor/ard.py
+++ b/youtube_dl/extractor/ard.py
@@ -8,8 +8,6 @@ from ..utils import (
determine_ext,
ExtractorError,
qualities,
- compat_urllib_parse_urlparse,
- compat_urllib_parse,
int_or_none,
parse_duration,
unified_strdate,
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 9c30a1d33..f43a0a569 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1,6 +1,7 @@
from __future__ import unicode_literals
import base64
+import datetime
import hashlib
import json
import netrc
@@ -15,11 +16,13 @@ from ..utils import (
compat_http_client,
compat_urllib_error,
compat_urllib_parse_urlparse,
+ compat_urlparse,
compat_str,
clean_html,
compiled_regex_type,
ExtractorError,
+ float_or_none,
int_or_none,
RegexNotFoundError,
sanitize_filename,
@@ -164,6 +167,14 @@ class InfoExtractor(object):
return cls._VALID_URL_RE.match(url) is not None
@classmethod
+ def _match_id(cls, url):
+ if '_VALID_URL_RE' not in cls.__dict__:
+ cls._VALID_URL_RE = re.compile(cls._VALID_URL)
+ m = cls._VALID_URL_RE.match(url)
+ assert m
+ return m.group('id')
+
+ @classmethod
def working(cls):
"""Getter method for _WORKING."""
return cls._WORKING
@@ -640,7 +651,9 @@ class InfoExtractor(object):
return formats
- def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None):
+ def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
+ entry_protocol='m3u8', preference=None):
+
formats = [{
'format_id': 'm3u8-meta',
'url': m3u8_url,
@@ -651,6 +664,11 @@ class InfoExtractor(object):
'format_note': 'Quality selection URL',
}]
+ format_url = lambda u: (
+ u
+ if re.match(r'^https?://', u)
+ else compat_urlparse.urljoin(m3u8_url, u))
+
m3u8_doc = self._download_webpage(m3u8_url, video_id)
last_info = None
kv_rex = re.compile(
@@ -667,15 +685,17 @@ class InfoExtractor(object):
continue
else:
if last_info is None:
- formats.append({'url': line})
+ formats.append({'url': format_url(line)})
continue
tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
f = {
'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
- 'url': line.strip(),
+ 'url': format_url(line.strip()),
'tbr': tbr,
'ext': ext,
+ 'protocol': entry_protocol,
+ 'preference': preference,
}
codecs = last_info.get('CODECS')
if codecs:
@@ -695,6 +715,34 @@ class InfoExtractor(object):
self._sort_formats(formats)
return formats
+ def _live_title(self, name):
+ """ Generate the title for a live video """
+ now = datetime.datetime.now()
+ now_str = now.strftime("%Y-%m-%d %H:%M")
+ return name + ' ' + now_str
+
+ def _int(self, v, name, fatal=False, **kwargs):
+ res = int_or_none(v, **kwargs)
+ if 'get_attr' in kwargs:
+ print(getattr(v, kwargs['get_attr']))
+ if res is None:
+ msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
+ if fatal:
+ raise ExtractorError(msg)
+ else:
+ self._downloader.report_warning(msg)
+ return res
+
+ def _float(self, v, name, fatal=False, **kwargs):
+ res = float_or_none(v, **kwargs)
+ if res is None:
+ msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
+ if fatal:
+ raise ExtractorError(msg)
+ else:
+ self._downloader.report_warning(msg)
+ return res
+
class SearchInfoExtractor(InfoExtractor):
"""
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index 4903764f7..f99888ecc 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -9,7 +9,7 @@ import xml.etree.ElementTree
from hashlib import sha1
from math import pow, sqrt, floor
-from .common import InfoExtractor
+from .subtitles import SubtitlesInfoExtractor
from ..utils import (
ExtractorError,
compat_urllib_parse,
@@ -26,7 +26,7 @@ from ..aes import (
)
-class CrunchyrollIE(InfoExtractor):
+class CrunchyrollIE(SubtitlesInfoExtractor):
_VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
_TEST = {
'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
@@ -271,6 +271,10 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
else:
subtitles[lang_code] = self._convert_subtitles_to_srt(subtitle)
+ if self._downloader.params.get('listsubtitles', False):
+ self._list_available_subtitles(video_id, subtitles)
+ return
+
return {
'id': video_id,
'title': video_title,
diff --git a/youtube_dl/extractor/divxstage.py b/youtube_dl/extractor/divxstage.py
index 4ca3f37a2..b88379e06 100644
--- a/youtube_dl/extractor/divxstage.py
+++ b/youtube_dl/extractor/divxstage.py
@@ -7,7 +7,7 @@ class DivxStageIE(NovaMovIE):
IE_NAME = 'divxstage'
IE_DESC = 'DivxStage'
- _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'divxstage\.(?:eu|net|ch|co|at|ag)'}
+ _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'divxstage\.(?:eu|net|ch|co|at|ag|to)'}
_HOST = 'www.divxstage.eu'
@@ -24,4 +24,4 @@ class DivxStageIE(NovaMovIE):
'title': 'youtubedl test video',
'description': 'This is a test video for youtubedl.',
}
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py
index 1e1763abf..817a9bd61 100644
--- a/youtube_dl/extractor/dropbox.py
+++ b/youtube_dl/extractor/dropbox.py
@@ -5,24 +5,29 @@ import os.path
import re
from .common import InfoExtractor
-from ..utils import compat_urllib_parse_unquote
+from ..utils import compat_urllib_parse_unquote, url_basename
class DropboxIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?dropbox[.]com/s/(?P<id>[a-zA-Z0-9]{15})/(?P<title>[^?#]*)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?dropbox[.]com/sh?/(?P<id>[a-zA-Z0-9]{15})/.*'
+ _TESTS = [{
'url': 'https://www.dropbox.com/s/nelirfsxnmcfbfh/youtube-dl%20test%20video%20%27%C3%A4%22BaW_jenozKc.mp4?dl=0',
'info_dict': {
'id': 'nelirfsxnmcfbfh',
'ext': 'mp4',
'title': 'youtube-dl test video \'ä"BaW_jenozKc'
}
- }
+ },
+ {
+ 'url': 'https://www.dropbox.com/sh/662glsejgzoj9sr/AAByil3FGH9KFNZ13e08eSa1a/Pregame%20Ceremony%20Program%20PA%2020140518.m4v',
+ 'only_matching': True,
+ },
+ ]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- fn = compat_urllib_parse_unquote(mobj.group('title'))
+ fn = compat_urllib_parse_unquote(url_basename(url))
title = os.path.splitext(fn)[0]
video_url = (
re.sub(r'[?&]dl=0', '', url) +
diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py
index 4ba323148..2cba82532 100644
--- a/youtube_dl/extractor/eitb.py
+++ b/youtube_dl/extractor/eitb.py
@@ -1,4 +1,6 @@
# encoding: utf-8
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
@@ -7,20 +9,20 @@ from ..utils import ExtractorError
class EitbIE(InfoExtractor):
- IE_NAME = u'eitb.tv'
+ IE_NAME = 'eitb.tv'
_VALID_URL = r'https?://www\.eitb\.tv/(eu/bideoa|es/video)/[^/]+/(?P<playlist_id>\d+)/(?P<chapter_id>\d+)'
_TEST = {
- u'add_ie': ['Brightcove'],
- u'url': u'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/2677100210001/2743577154001/lasa-y-zabala-30-anos/',
- u'md5': u'edf4436247185adee3ea18ce64c47998',
- u'info_dict': {
- u'id': u'2743577154001',
- u'ext': u'mp4',
- u'title': u'60 minutos (Lasa y Zabala, 30 años)',
+ 'add_ie': ['Brightcove'],
+ 'url': 'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/2677100210001/2743577154001/lasa-y-zabala-30-anos/',
+ 'md5': 'edf4436247185adee3ea18ce64c47998',
+ 'info_dict': {
+ 'id': '2743577154001',
+ 'ext': 'mp4',
+ 'title': '60 minutos (Lasa y Zabala, 30 años)',
# All videos from eitb has this description in the brightcove info
- u'description': u'.',
- u'uploader': u'Euskal Telebista',
+ 'description': '.',
+ 'uploader': 'Euskal Telebista',
},
}
@@ -30,7 +32,7 @@ class EitbIE(InfoExtractor):
webpage = self._download_webpage(url, chapter_id)
bc_url = BrightcoveIE._extract_brightcove_url(webpage)
if bc_url is None:
- raise ExtractorError(u'Could not extract the Brightcove url')
+ raise ExtractorError('Could not extract the Brightcove url')
# The BrightcoveExperience object doesn't contain the video id, we set
# it manually
bc_url += '&%40videoPlayer={0}'.format(chapter_id)
diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py
index 14a196ffc..aacbf1414 100644
--- a/youtube_dl/extractor/extremetube.py
+++ b/youtube_dl/extractor/extremetube.py
@@ -7,6 +7,7 @@ from ..utils import (
compat_urllib_parse_urlparse,
compat_urllib_request,
compat_urllib_parse,
+ str_to_int,
)
@@ -20,6 +21,7 @@ class ExtremeTubeIE(InfoExtractor):
'ext': 'mp4',
'title': 'Music Video 14 british euro brit european cumshots swallow',
'uploader': 'unknown',
+ 'view_count': int,
'age_limit': 18,
}
}, {
@@ -39,8 +41,12 @@ class ExtremeTubeIE(InfoExtractor):
video_title = self._html_search_regex(
r'<h1 [^>]*?title="([^"]+)"[^>]*>', webpage, 'title')
uploader = self._html_search_regex(
- r'>Posted by:(?=<)(?:\s|<[^>]*>)*(.+?)\|', webpage, 'uploader',
- fatal=False)
+ r'Uploaded by:\s*</strong>\s*(.+?)\s*</div>',
+ webpage, 'uploader', fatal=False)
+ view_count = str_to_int(self._html_search_regex(
+ r'Views:\s*</strong>\s*<span>([\d,\.]+)</span>',
+ webpage, 'view count', fatal=False))
+
video_url = compat_urllib_parse.unquote(self._html_search_regex(
r'video_url=(.+?)&amp;', webpage, 'video_url'))
path = compat_urllib_parse_urlparse(video_url).path
@@ -51,6 +57,7 @@ class ExtremeTubeIE(InfoExtractor):
'id': video_id,
'title': video_title,
'uploader': uploader,
+ 'view_count': view_count,
'url': video_url,
'format': format,
'format_id': format,
diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py
index 21ea5ec2b..e09982e88 100644
--- a/youtube_dl/extractor/flickr.py
+++ b/youtube_dl/extractor/flickr.py
@@ -10,13 +10,13 @@ from ..utils import (
class FlickrIE(InfoExtractor):
- """Information Extractor for Flickr videos"""
- _VALID_URL = r'(?:https?://)?(?:www\.|secure\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
+ _VALID_URL = r'https?://(?:www\.|secure\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
_TEST = {
'url': 'http://www.flickr.com/photos/forestwander-nature-pictures/5645318632/in/photostream/',
- 'file': '5645318632.mp4',
'md5': '6fdc01adbc89d72fc9c4f15b4a4ba87b',
'info_dict': {
+ 'id': '5645318632',
+ 'ext': 'mp4',
"description": "Waterfalls in the Springtime at Dark Hollow Waterfalls. These are located just off of Skyline Drive in Virginia. They are only about 6/10 of a mile hike but it is a pretty steep hill and a good climb back up.",
"uploader_id": "forestwander-nature-pictures",
"title": "Dark Hollow Waterfalls"
@@ -49,12 +49,12 @@ class FlickrIE(InfoExtractor):
raise ExtractorError('Unable to extract video url')
video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
- return [{
- 'id': video_id,
- 'url': video_url,
- 'ext': 'mp4',
- 'title': self._og_search_title(webpage),
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'ext': 'mp4',
+ 'title': self._og_search_title(webpage),
'description': self._og_search_description(webpage),
- 'thumbnail': self._og_search_thumbnail(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
'uploader_id': video_uploader_id,
- }]
+ }
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 40eeaad16..367f930dd 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -382,6 +382,19 @@ class GenericIE(InfoExtractor):
'thumbnail': 're:^https?://.*\.jpg$',
},
},
+ # Wistia embed
+ {
+ 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
+ 'md5': '8788b683c777a5cf25621eaf286d0c23',
+ 'info_dict': {
+ 'id': '1cfaf6b7ea',
+ 'ext': 'mov',
+ 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
+ 'duration': 643.0,
+ 'filesize': 182808282,
+ 'uploader': 'education-portal.com',
+ },
+ },
]
def report_download_webpage(self, video_id):
@@ -584,7 +597,9 @@ class GenericIE(InfoExtractor):
# Helper method
def _playlist_from_matches(matches, getter, ie=None):
- urlrs = orderedSet(self.url_result(getter(m), ie) for m in matches)
+ urlrs = orderedSet(
+ self.url_result(self._proto_relative_url(getter(m)), ie)
+ for m in matches)
return self.playlist_result(
urlrs, playlist_id=video_id, playlist_title=video_title)
@@ -629,11 +644,11 @@ class GenericIE(InfoExtractor):
)
(["\'])
(?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
- (?:embed|v)/.+?)
+ (?:embed|v|p)/.+?)
\1''', webpage)
if matches:
return _playlist_from_matches(
- matches, lambda m: unescapeHTML(m[1]), ie='Youtube')
+ matches, lambda m: unescapeHTML(m[1]))
# Look for embedded Dailymotion player
matches = re.findall(
@@ -654,6 +669,16 @@ class GenericIE(InfoExtractor):
'title': video_title,
'id': video_id,
}
+ match = re.search(r'(?:id=["\']wistia_|data-wistiaid=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
+ if match:
+ return {
+ '_type': 'url_transparent',
+ 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
+ 'ie_key': 'Wistia',
+ 'uploader': video_uploader,
+ 'title': video_title,
+ 'id': match.group('id')
+ }
# Look for embedded blip.tv player
mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
diff --git a/youtube_dl/extractor/golem.py b/youtube_dl/extractor/golem.py
new file mode 100644
index 000000000..bebfe8568
--- /dev/null
+++ b/youtube_dl/extractor/golem.py
@@ -0,0 +1,71 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urlparse,
+ determine_ext,
+)
+
+
+class GolemIE(InfoExtractor):
+ _VALID_URL = r'^https?://video\.golem\.de/.+?/(?P<id>.+?)/'
+ _TEST = {
+ 'url': 'http://video.golem.de/handy/14095/iphone-6-und-6-plus-test.html',
+ 'md5': 'c1a2c0a3c863319651c7c992c5ee29bf',
+ 'info_dict': {
+ 'id': '14095',
+ 'format_id': 'high',
+ 'ext': 'mp4',
+ 'title': 'iPhone 6 und 6 Plus - Test',
+ 'duration': 300.44,
+ 'filesize': 65309548,
+ }
+ }
+
+ _PREFIX = 'http://video.golem.de'
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ config = self._download_xml(
+ 'https://video.golem.de/xml/{0}.xml'.format(video_id), video_id)
+
+ info = {
+ 'id': video_id,
+ 'title': config.findtext('./title', 'golem'),
+ 'duration': self._float(config.findtext('./playtime'), 'duration'),
+ }
+
+ formats = []
+ for e in config.findall('./*[url]'):
+ url = e.findtext('./url')
+ if not url:
+ self._downloader.report_warning(
+ "{0}: url: empty, skipping".format(e.tag))
+ continue
+
+ formats.append({
+ 'format_id': e.tag,
+ 'url': compat_urlparse.urljoin(self._PREFIX, url),
+ 'height': self._int(e.get('height'), 'height'),
+ 'width': self._int(e.get('width'), 'width'),
+ 'filesize': self._int(e.findtext('filesize'), 'filesize'),
+ 'ext': determine_ext(e.findtext('./filename')),
+ })
+ self._sort_formats(formats)
+ info['formats'] = formats
+
+ thumbnails = []
+ for e in config.findall('.//teaser[url]'):
+ url = e.findtext('./url')
+ if not url:
+ continue
+ thumbnails.append({
+ 'url': compat_urlparse.urljoin(self._PREFIX, url),
+ 'width': self._int(e.get('width'), 'thumbnail width'),
+ 'height': self._int(e.get('height'), 'thumbnail height'),
+ })
+ info['thumbnails'] = thumbnails
+
+ return info
diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py
new file mode 100644
index 000000000..f97b1e085
--- /dev/null
+++ b/youtube_dl/extractor/heise.py
@@ -0,0 +1,81 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ get_meta_content,
+ parse_iso8601,
+)
+
+
+class HeiseIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ https?://(?:www\.)?heise\.de/video/artikel/
+ .+?(?P<id>[0-9]+)\.html(?:$|[?#])
+ '''
+ _TEST = {
+ 'url': (
+ 'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html'
+ ),
+ 'md5': 'ffed432483e922e88545ad9f2f15d30e',
+ 'info_dict': {
+ 'id': '2404147',
+ 'ext': 'mp4',
+ 'title': (
+ "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / Peilsender Smartphone"
+ ),
+ 'format_id': 'mp4_720',
+ 'timestamp': 1411812600,
+ 'upload_date': '20140927',
+ 'description': 'In uplink-Episode 3.3 geht es darum, wie man sich von Cloud-Anbietern emanzipieren kann, worauf man beim Kauf einer Tastatur achten sollte und was Smartphones über uns verraten.',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+ json_url = self._search_regex(
+ r'json_url:\s*"([^"]+)"', webpage, 'json URL')
+ config = self._download_json(json_url, video_id)
+
+ info = {
+ 'id': video_id,
+ 'thumbnail': config.get('poster'),
+ 'timestamp': parse_iso8601(get_meta_content('date', webpage)),
+ 'description': self._og_search_description(webpage),
+ }
+
+ title = get_meta_content('fulltitle', webpage)
+ if title:
+ info['title'] = title
+ elif config.get('title'):
+ info['title'] = config['title']
+ else:
+ info['title'] = self._og_search_title(webpage)
+
+ formats = []
+ for t, rs in config['formats'].items():
+ if not rs or not hasattr(rs, 'items'):
+ self._downloader.report_warning(
+ 'formats: {0}: no resolutions'.format(t))
+ continue
+
+ for height_str, obj in rs.items():
+ format_id = '{0}_{1}'.format(t, height_str)
+
+ if not obj or not obj.get('url'):
+ self._downloader.report_warning(
+ 'formats: {0}: no url'.format(format_id))
+ continue
+
+ formats.append({
+ 'url': obj['url'],
+ 'format_id': format_id,
+ 'height': self._int(height_str, 'height'),
+ })
+
+ self._sort_formats(formats)
+ info['formats'] = formats
+
+ return info
diff --git a/youtube_dl/extractor/mgoon.py b/youtube_dl/extractor/mgoon.py
new file mode 100644
index 000000000..94bc87b00
--- /dev/null
+++ b/youtube_dl/extractor/mgoon.py
@@ -0,0 +1,87 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ qualities,
+ unified_strdate,
+)
+
+
+class MgoonIE(InfoExtractor):
+ _VALID_URL = r'''(?x)https?://(?:www\.)?
+ (?:(:?m\.)?mgoon\.com/(?:ch/(?:.+)/v|play/view)|
+ video\.mgoon\.com)/(?P<id>[0-9]+)'''
+ _API_URL = 'http://mpos.mgoon.com/player/video?id={0:}'
+ _TESTS = [
+ {
+ 'url': 'http://m.mgoon.com/ch/hi6618/v/5582148',
+ 'md5': 'dd46bb66ab35cf6d51cc812fd82da79d',
+ 'info_dict': {
+ 'id': '5582148',
+ 'uploader_id': 'hi6618',
+ 'duration': 240.419,
+ 'upload_date': '20131220',
+ 'ext': 'mp4',
+ 'title': 'md5:543aa4c27a4931d371c3f433e8cebebc',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ },
+ {
+ 'url': 'http://www.mgoon.com/play/view/5582148',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'http://video.mgoon.com/5582148',
+ 'only_matching': True,
+ },
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ data = self._download_json(self._API_URL.format(video_id), video_id)
+
+ if data.get('errorInfo', {}).get('code') != 'NONE':
+ raise ExtractorError('%s encountered an error: %s' % (
+ self.IE_NAME, data['errorInfo']['message']), expected=True)
+
+ v_info = data['videoInfo']
+ title = v_info.get('v_title')
+ thumbnail = v_info.get('v_thumbnail')
+ duration = v_info.get('v_duration')
+ upload_date = unified_strdate(v_info.get('v_reg_date'))
+ uploader_id = data.get('userInfo', {}).get('u_alias')
+ if duration:
+ duration /= 1000.0
+
+ age_limit = None
+ if data.get('accessInfo', {}).get('code') == 'VIDEO_STATUS_ADULT':
+ age_limit = 18
+
+ formats = []
+ get_quality = qualities(['360p', '480p', '720p', '1080p'])
+ for fmt in data['videoFiles']:
+ formats.append({
+ 'format_id': fmt['label'],
+ 'quality': get_quality(fmt['label']),
+ 'url': fmt['url'],
+ 'ext': fmt['format'],
+
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'upload_date': upload_date,
+ 'uploader_id': uploader_id,
+ 'age_limit': age_limit,
+ }
diff --git a/youtube_dl/extractor/muenchentv.py b/youtube_dl/extractor/muenchentv.py
index 3a938861b..c7f6beb9c 100644
--- a/youtube_dl/extractor/muenchentv.py
+++ b/youtube_dl/extractor/muenchentv.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import datetime
import json
from .common import InfoExtractor
@@ -23,6 +22,7 @@ class MuenchenTVIE(InfoExtractor):
'ext': 'mp4',
'title': 're:^münchen.tv-Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'is_live': True,
+ 'thumbnail': 're:^https?://.*\.jpg$'
},
'params': {
'skip_download': True,
@@ -33,9 +33,7 @@ class MuenchenTVIE(InfoExtractor):
display_id = 'live'
webpage = self._download_webpage(url, display_id)
- now = datetime.datetime.now()
- now_str = now.strftime("%Y-%m-%d %H:%M")
- title = self._og_search_title(webpage) + ' ' + now_str
+ title = self._live_title(self._og_search_title(webpage))
data_js = self._search_regex(
r'(?s)\nplaylist:\s*(\[.*?}\]),related:',
@@ -73,5 +71,6 @@ class MuenchenTVIE(InfoExtractor):
'title': title,
'formats': formats,
'is_live': True,
+ 'thumbnail': thumbnail,
}
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index d2e4acbad..e75ab7c39 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -16,9 +16,9 @@ class NBCIE(InfoExtractor):
_TEST = {
'url': 'http://www.nbc.com/chicago-fire/video/i-am-a-firefighter/2734188',
- 'md5': '54d0fbc33e0b853a65d7b4de5c06d64e',
+ # md5 checksum is not stable
'info_dict': {
- 'id': 'u1RInQZRN7QJ',
+ 'id': 'bTmnLCvIbaaH',
'ext': 'flv',
'title': 'I Am a Firefighter',
'description': 'An emergency puts Dawson\'sf irefighter skills to the ultimate test in this four-part digital series.',
diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py
new file mode 100644
index 000000000..963c4587c
--- /dev/null
+++ b/youtube_dl/extractor/nfl.py
@@ -0,0 +1,103 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ remove_end,
+)
+
+
+class NFLIE(InfoExtractor):
+ IE_NAME = 'nfl.com'
+ _VALID_URL = r'(?x)https?://(?:www\.)?nfl\.com/(?:videos/(?:.+)/|.*?\#video=)(?P<id>\d..[0-9]+)'
+ _PLAYER_CONFIG_URL = 'http://www.nfl.com/static/content/static/config/video/config.json'
+ _TEST = {
+ 'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
+ # 'md5': '5eb8c40a727dda106d510e5d6ffa79e5', # md5 checksum fluctuates
+ 'info_dict': {
+ 'id': '0ap3000000398478',
+ 'ext': 'mp4',
+ 'title': 'Week 3: Washington Redskins vs. Philadelphia Eagles highlights',
+ 'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478',
+ 'upload_date': '20140921',
+ 'timestamp': 1411337580,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ config = self._download_json(self._PLAYER_CONFIG_URL, video_id,
+ note='Downloading player config')
+ url_template = 'http://nfl.com{contentURLTemplate:s}'.format(**config)
+ video_data = self._download_json(url_template.format(id=video_id), video_id)
+
+ cdns = config.get('cdns')
+ if not cdns:
+ raise ExtractorError('Failed to get CDN data', expected=True)
+
+ formats = []
+ streams = video_data.get('cdnData', {}).get('bitrateInfo', [])
+ for name, cdn in cdns.items():
+ # LimeLight streams don't seem to work
+ if cdn.get('name') == 'LIMELIGHT':
+ continue
+
+ protocol = cdn.get('protocol')
+ host = remove_end(cdn.get('host', ''), '/')
+ if not (protocol and host):
+ continue
+
+ path_prefix = cdn.get('pathprefix', '')
+ if path_prefix and not path_prefix.endswith('/'):
+ path_prefix = '%s/' % path_prefix
+
+ get_url = lambda p: '{protocol:s}://{host:s}/{prefix:s}{path:}'.format(
+ protocol=protocol,
+ host=host,
+ prefix=path_prefix,
+ path=p,
+ )
+
+ if protocol == 'rtmp':
+ preference = -2
+ elif 'prog' in name.lower():
+ preference = -1
+ else:
+ preference = 0
+
+ for stream in streams:
+ path = stream.get('path')
+ if not path:
+ continue
+
+ formats.append({
+ 'url': get_url(path),
+ 'vbr': int_or_none(stream.get('rate', 0), 1000),
+ 'preference': preference,
+ 'format_note': name,
+ })
+
+ self._sort_formats(formats)
+
+ thumbnail = None
+ for q in ('xl', 'l', 'm', 's', 'xs'):
+ thumbnail = video_data.get('imagePaths', {}).get(q)
+ if thumbnail:
+ break
+
+ return {
+ 'id': video_id,
+ 'title': video_data.get('storyHeadline'),
+ 'formats': formats,
+ 'description': video_data.get('caption'),
+ 'duration': video_data.get('duration'),
+ 'thumbnail': thumbnail,
+ 'timestamp': int_or_none(video_data.get('posted'), 1000),
+ }
diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py
index 959fdf590..7f1bc6377 100644
--- a/youtube_dl/extractor/noco.py
+++ b/youtube_dl/extractor/noco.py
@@ -2,6 +2,8 @@
from __future__ import unicode_literals
import re
+import time
+import hashlib
from .common import InfoExtractor
from ..utils import (
@@ -17,6 +19,7 @@ from ..utils import (
class NocoIE(InfoExtractor):
_VALID_URL = r'http://(?:(?:www\.)?noco\.tv/emission/|player\.noco\.tv/\?idvideo=)(?P<id>\d+)'
_LOGIN_URL = 'http://noco.tv/do.php'
+ _API_URL_TEMPLATE = 'https://api.noco.tv/1.1/%s?ts=%s&tk=%s'
_NETRC_MACHINE = 'noco'
_TEST = {
@@ -55,33 +58,52 @@ class NocoIE(InfoExtractor):
login = self._download_json(request, None, 'Logging in as %s' % username)
if 'erreur' in login:
- raise ExtractorError('Unable to login: %s' % clean_html(login['erreur']), expected=True)
+ raise ExtractorError('Unable to login: %s' % clean_html(login['erreur']), expected=True)
+
+ def _call_api(self, path, video_id, note):
+ ts = compat_str(int(time.time() * 1000))
+ tk = hashlib.md5((hashlib.md5(ts.encode('ascii')).hexdigest() + '#8S?uCraTedap6a').encode('ascii')).hexdigest()
+ url = self._API_URL_TEMPLATE % (path, ts, tk)
+
+ resp = self._download_json(url, video_id, note)
+
+ if isinstance(resp, dict) and resp.get('error'):
+ self._raise_error(resp['error'], resp['description'])
+
+ return resp
+
+ def _raise_error(self, error, description):
+ raise ExtractorError(
+ '%s returned error: %s - %s' % (self.IE_NAME, error, description),
+ expected=True)
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- medias = self._download_json(
- 'https://api.noco.tv/1.0/video/medias/%s' % video_id, video_id, 'Downloading video JSON')
+ medias = self._call_api(
+ 'shows/%s/medias' % video_id,
+ video_id, 'Downloading video JSON')
+
+ qualities = self._call_api(
+ 'qualities',
+ video_id, 'Downloading qualities JSON')
formats = []
- for fmt in medias['fr']['video_list']['default']['quality_list']:
- format_id = fmt['quality_key']
+ for format_id, fmt in medias['fr']['video_list']['none']['quality_list'].items():
- file = self._download_json(
- 'https://api.noco.tv/1.0/video/file/%s/fr/%s' % (format_id.lower(), video_id),
+ video = self._call_api(
+ 'shows/%s/video/%s/fr' % (video_id, format_id.lower()),
video_id, 'Downloading %s video JSON' % format_id)
- file_url = file['file']
+ file_url = video['file']
if not file_url:
continue
- if file_url == 'forbidden':
- raise ExtractorError(
- '%s returned error: %s - %s' % (
- self.IE_NAME, file['popmessage']['title'], file['popmessage']['message']),
- expected=True)
+ if file_url in ['forbidden', 'not found']:
+ popmessage = video['popmessage']
+ self._raise_error(popmessage['title'], popmessage['message'])
formats.append({
'url': file_url,
@@ -91,20 +113,31 @@ class NocoIE(InfoExtractor):
'abr': fmt['audiobitrate'],
'vbr': fmt['videobitrate'],
'filesize': fmt['filesize'],
- 'format_note': fmt['quality_name'],
- 'preference': fmt['priority'],
+ 'format_note': qualities[format_id]['quality_name'],
+ 'preference': qualities[format_id]['priority'],
})
self._sort_formats(formats)
- show = self._download_json(
- 'https://api.noco.tv/1.0/shows/show/%s' % video_id, video_id, 'Downloading show JSON')[0]
+ show = self._call_api(
+ 'shows/by_id/%s' % video_id,
+ video_id, 'Downloading show JSON')[0]
- upload_date = unified_strdate(show['indexed'])
+ upload_date = unified_strdate(show['online_date_start_utc'])
uploader = show['partner_name']
uploader_id = show['partner_key']
duration = show['duration_ms'] / 1000.0
- thumbnail = show['screenshot']
+
+ thumbnails = []
+ for thumbnail_key, thumbnail_url in show.items():
+ m = re.search(r'^screenshot_(?P<width>\d+)x(?P<height>\d+)$', thumbnail_key)
+ if not m:
+ continue
+ thumbnails.append({
+ 'url': thumbnail_url,
+ 'width': int(m.group('width')),
+ 'height': int(m.group('height')),
+ })
episode = show.get('show_TT') or show.get('show_OT')
family = show.get('family_TT') or show.get('family_OT')
@@ -124,7 +157,7 @@ class NocoIE(InfoExtractor):
'id': video_id,
'title': title,
'description': description,
- 'thumbnail': thumbnail,
+ 'thumbnails': thumbnails,
'upload_date': upload_date,
'uploader': uploader,
'uploader_id': uploader_id,
diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py
index 7a154e94a..f36d446d2 100644
--- a/youtube_dl/extractor/npo.py
+++ b/youtube_dl/extractor/npo.py
@@ -7,6 +7,7 @@ from ..utils import (
unified_strdate,
parse_duration,
qualities,
+ url_basename,
)
@@ -55,7 +56,9 @@ class NPOIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
+ return self._get_info(video_id)
+ def _get_info(self, video_id):
metadata = self._download_json(
'http://e.omroep.nl/metadata/aflevering/%s' % video_id,
video_id,
@@ -106,3 +109,30 @@ class NPOIE(InfoExtractor):
'duration': parse_duration(metadata.get('tijdsduur')),
'formats': formats,
}
+
+
+class TegenlichtVproIE(NPOIE):
+    """Extractor for episode pages on tegenlicht.vpro.nl.
+
+    The episode page is only used to find the media URN; the URN is
+    resolved to a ``mid`` through the rs.vpro.nl API and the extraction
+    itself is delegated to ``_get_info``.
+    """
+    IE_NAME = 'tegenlicht.vpro.nl'
+    _VALID_URL = r'https?://tegenlicht\.vpro\.nl/afleveringen/.*?'
+
+    _TESTS = [
+        {
+            'url': 'http://tegenlicht.vpro.nl/afleveringen/2012-2013/de-toekomst-komt-uit-afrika.html',
+            'md5': 'f8065e4e5a7824068ed3c7e783178f2c',
+            'info_dict': {
+                'id': 'VPWON_1169289',
+                'ext': 'm4v',
+                'title': 'Tegenlicht',
+                'description': 'md5:d6476bceb17a8c103c76c3b708f05dd1',
+                'upload_date': '20130225',
+            },
+        },
+    ]
+
+    def _real_extract(self, url):
+        # No media id is known yet, so the URL basename stands in for it
+        # in both download calls.
+        page_name = url_basename(url)
+        episode_page = self._download_webpage(url, page_name)
+        media_urn = self._html_search_meta('mediaurn', episode_page)
+        api_url = 'http://rs.vpro.nl/v2/api/media/%s.json' % media_urn
+        media_info = self._download_json(api_url, page_name)
+        return self._get_info(media_info['mid'])
diff --git a/youtube_dl/extractor/oktoberfesttv.py b/youtube_dl/extractor/oktoberfesttv.py
new file mode 100644
index 000000000..4a41c0542
--- /dev/null
+++ b/youtube_dl/extractor/oktoberfesttv.py
@@ -0,0 +1,47 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class OktoberfestTVIE(InfoExtractor):
+    """Extractor for the live camera pages on oktoberfest-tv.de.
+
+    All information comes from the camera page itself: the heading gives
+    the title and the embedded player configuration gives the stream
+    location and the thumbnail.
+    """
+    _VALID_URL = r'https?://www\.oktoberfest-tv\.de/[^/]+/[^/]+/video/(?P<id>[^/?#]+)'
+
+    _TEST = {
+        'url': 'http://www.oktoberfest-tv.de/de/kameras/video/hb-zelt',
+        'info_dict': {
+            'id': 'hb-zelt',
+            'ext': 'mp4',
+            'title': 're:^Live-Kamera: Hofbräuzelt [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            # Raw string: '\.' is not a valid escape in a normal literal.
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._live_title(self._html_search_regex(
+            r'<h1><strong>.*?</strong>(.*?)</h1>', webpage, 'title'))
+
+        # The stream URL is the player's connection base URL with the
+        # clip path appended.
+        clip = self._search_regex(
+            r"clip:\s*\{\s*url:\s*'([^']+)'", webpage, 'clip')
+        ncurl = self._search_regex(
+            r"netConnectionUrl:\s*'([^']+)'", webpage, 'rtmp base')
+        video_url = ncurl + clip
+        # The thumbnail is optional: a page without a canvas background
+        # must not abort the extraction.
+        thumbnail = self._search_regex(
+            r"canvas:\s*\{\s*backgroundImage:\s*'url\(([^)]+)\)'", webpage,
+            'thumbnail', fatal=False)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'ext': 'mp4',
+            'is_live': True,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/playfm.py b/youtube_dl/extractor/playfm.py
index 72df4d842..ebc046804 100644
--- a/youtube_dl/extractor/playfm.py
+++ b/youtube_dl/extractor/playfm.py
@@ -10,6 +10,7 @@ from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
+ str_to_int,
)
@@ -29,6 +30,7 @@ class PlayFMIE(InfoExtractor):
'duration': 5627.428,
'upload_date': '20140712',
'view_count': int,
+ 'comment_count': int,
'thumbnail': 're:^https?://.*\.jpg$',
},
}
@@ -51,7 +53,8 @@ class PlayFMIE(InfoExtractor):
recording = rec_doc.find('./recording')
title = recording.find('./title').text
- view_count = int_or_none(recording.find('./stats/playcount').text)
+ view_count = str_to_int(recording.find('./stats/playcount').text)
+ comment_count = str_to_int(recording.find('./stats/comments').text)
duration = float_or_none(recording.find('./duration').text, scale=1000)
thumbnail = recording.find('./image').text
@@ -75,6 +78,7 @@ class PlayFMIE(InfoExtractor):
'title': title,
'upload_date': upload_date,
'view_count': view_count,
+ 'comment_count': comment_count,
'duration': duration,
'thumbnail': thumbnail,
'uploader': uploader,
diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py
index 34058fd4b..409f8540a 100644
--- a/youtube_dl/extractor/sbs.py
+++ b/youtube_dl/extractor/sbs.py
@@ -12,7 +12,7 @@ from ..utils import (
class SBSIE(InfoExtractor):
IE_DESC = 'sbs.com.au'
- _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/ondemand/video/single/(?P<id>[0-9]+)/'
+ _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/ondemand/video/(?:single/)?(?P<id>[0-9]+)'
_TESTS = [{
# Original URL is handled by the generic IE which finds the iframe:
@@ -21,12 +21,16 @@ class SBSIE(InfoExtractor):
'md5': '3150cf278965eeabb5b4cea1c963fe0a',
'info_dict': {
'id': '320403011771',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Dingo Conservation',
'description': 'Dingoes are on the brink of extinction; most of the animals we think are dingoes are in fact crossbred with wild dogs. This family run a dingo conservation park to prevent their extinction',
'thumbnail': 're:http://.*\.jpg',
},
'add_ies': ['generic'],
+ },
+ {
+ 'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/sport5.py b/youtube_dl/extractor/sport5.py
new file mode 100644
index 000000000..3f680bfc6
--- /dev/null
+++ b/youtube_dl/extractor/sport5.py
@@ -0,0 +1,92 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class Sport5IE(InfoExtractor):
+    """Extractor for videos on sport5.co.il.
+
+    The page is only used to find the player's clip id; title,
+    description, duration, posters, categories and playback links are all
+    read from that clip's metadata XML.
+    """
+    # The host prefix and its dot are optional together, so the bare
+    # domain is accepted and a lone leading dot is not.
+    _VALID_URL = r'http://(?:(?:www|vod)\.)?sport5\.co\.il/.*\b(?:Vi|docID)=(?P<id>\d+)'
+    _TESTS = [
+        {
+            'url': 'http://vod.sport5.co.il/?Vc=147&Vi=176331&Page=1',
+            'info_dict': {
+                'id': 's5-Y59xx1-GUh2',
+                'ext': 'mp4',
+                'title': 'ולנסיה-קורדובה 0:3',
+                'description': 'אלקאסר, גאייה ופגולי סידרו לקבוצה של נונו ניצחון על קורדובה ואת המקום הראשון בליגה',
+                'duration': 228,
+                'categories': list,
+            },
+            'skip': 'Blocked outside of Israel',
+        }, {
+            'url': 'http://www.sport5.co.il/articles.aspx?FolderID=3075&docID=176372&lang=HE',
+            'info_dict': {
+                'id': 's5-SiXxx1-hKh2',
+                'ext': 'mp4',
+                'title': 'GOALS_CELTIC_270914.mp4',
+                'description': '',
+                'duration': 87,
+                'categories': list,
+            },
+            'skip': 'Blocked outside of Israel',
+        }
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        media_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, media_id)
+
+        # The id in the URL only identifies the page; the metadata lookup
+        # is keyed by the clipId found in the page. Raw string, because
+        # '\w' is not a valid escape in a normal literal.
+        video_id = self._html_search_regex(r'clipId=([\w-]+)', webpage, 'video id')
+
+        metadata = self._download_xml(
+            'http://sport5-metadata-rr-d.nsacdn.com/vod/vod/%s/HDS/metadata.xml' % video_id,
+            video_id)
+
+        # An <Error> element replaces the normal payload; report it as an
+        # expected failure rather than a bug in the extractor.
+        error = metadata.find('./Error')
+        if error is not None:
+            raise ExtractorError(
+                '%s returned error: %s - %s' % (
+                    self.IE_NAME,
+                    error.find('./Name').text,
+                    error.find('./Description').text),
+                expected=True)
+
+        title = metadata.find('./Title').text
+        description = metadata.find('./Description').text
+        duration = int(metadata.find('./Duration').text)
+
+        # Posters and categories are optional sections of the document.
+        posters_el = metadata.find('./PosterLinks')
+        thumbnails = [{
+            'url': thumbnail.text,
+            'width': int(thumbnail.get('width')),
+            'height': int(thumbnail.get('height')),
+        } for thumbnail in posters_el.findall('./PosterIMG')] if posters_el is not None else []
+
+        categories_el = metadata.find('./Categories')
+        categories = [
+            cat.get('name') for cat in categories_el.findall('./Category')
+        ] if categories_el is not None else []
+
+        formats = [{
+            'url': fmt.text,
+            'ext': 'mp4',
+            'vbr': int(fmt.get('bitrate')),
+            'width': int(fmt.get('width')),
+            'height': int(fmt.get('height')),
+        } for fmt in metadata.findall('./PlaybackLinks/FileURL')]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnails': thumbnails,
+            'duration': duration,
+            'categories': categories,
+            'formats': formats,
+        }
\ No newline at end of file
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index b6b2dba9c..0be793b1c 100644
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -5,6 +5,7 @@ import json
from .common import InfoExtractor
from ..utils import (
+ compat_str,
ExtractorError,
xpath_with_ns,
)
@@ -55,36 +56,44 @@ class ThePlatformIE(InfoExtractor):
body = meta.find(_x('smil:body'))
f4m_node = body.find(_x('smil:seq//smil:video'))
- if f4m_node is not None:
+ if f4m_node is not None and '.f4m' in f4m_node.attrib['src']:
f4m_url = f4m_node.attrib['src']
if 'manifest.f4m?' not in f4m_url:
f4m_url += '?'
# the parameters are from syfy.com, other sites may use others,
# they also work for nbc.com
f4m_url += '&g=UXWGVKRWHFSP&hdcore=3.0.3'
- formats = [{
- 'ext': 'flv',
- 'url': f4m_url,
- }]
+ formats = self._extract_f4m_formats(f4m_url, video_id)
else:
- base_url = head.find(_x('smil:meta')).attrib['base']
- switch = body.find(_x('smil:switch'))
formats = []
- for f in switch.findall(_x('smil:video')):
- attr = f.attrib
- width = int(attr['width'])
- height = int(attr['height'])
- vbr = int(attr['system-bitrate']) // 1000
- format_id = '%dx%d_%dk' % (width, height, vbr)
- formats.append({
- 'format_id': format_id,
- 'url': base_url,
- 'play_path': 'mp4:' + attr['src'],
- 'ext': 'flv',
- 'width': width,
- 'height': height,
- 'vbr': vbr,
- })
+ switch = body.find(_x('smil:switch'))
+ if switch is not None:
+ base_url = head.find(_x('smil:meta')).attrib['base']
+ for f in switch.findall(_x('smil:video')):
+ attr = f.attrib
+ width = int(attr['width'])
+ height = int(attr['height'])
+ vbr = int(attr['system-bitrate']) // 1000
+ format_id = '%dx%d_%dk' % (width, height, vbr)
+ formats.append({
+ 'format_id': format_id,
+ 'url': base_url,
+ 'play_path': 'mp4:' + attr['src'],
+ 'ext': 'flv',
+ 'width': width,
+ 'height': height,
+ 'vbr': vbr,
+ })
+ else:
+ switch = body.find(_x('smil:seq//smil:switch'))
+ for f in switch.findall(_x('smil:video')):
+ attr = f.attrib
+ vbr = int(attr['system-bitrate']) // 1000
+ formats.append({
+ 'format_id': compat_str(vbr),
+ 'url': attr['src'],
+ 'vbr': vbr,
+ })
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/thvideo.py b/youtube_dl/extractor/thvideo.py
new file mode 100644
index 000000000..607e947bb
--- /dev/null
+++ b/youtube_dl/extractor/thvideo.py
@@ -0,0 +1,59 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ unified_strdate
+)
+
+
+class THVideoIE(InfoExtractor):
+    """Extractor for thvideo.tv.
+
+    The media URL is taken from the mobile player page, while title,
+    thumbnail, description and upload date come from the regular video
+    page.
+    """
+    _VALID_URL = r'http://(?:www\.)?thvideo\.tv/(?:v/th|mobile\.php\?cid=)(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://thvideo.tv/v/th1987/',
+        'md5': 'fa107b1f73817e325e9433505a70db50',
+        'info_dict': {
+            'id': '1987',
+            'ext': 'mp4',
+            'title': '【动画】秘封活动记录 ~ The Sealed Esoteric History.分镜稿预览',
+            'display_id': 'th1987',
+            'thumbnail': 'http://thvideo.tv/uploadfile/2014/0722/20140722013459856.jpg',
+            'description': '社团京都幻想剧团的第一个东方二次同人动画作品「秘封活动记录 ~ The Sealed Esoteric History.」 本视频是该动画第一期的分镜草稿...',
+            'upload_date': '20140722'
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        # extract download link from mobile player page
+        webpage_player = self._download_webpage(
+            'http://thvideo.tv/mobile.php?cid=%s-0' % (video_id),
+            video_id, note='Downloading video source page')
+        video_url = self._html_search_regex(
+            r'<source src="(.*?)" type', webpage_player, 'video url')
+
+        # extract video info from main page
+        webpage = self._download_webpage(
+            'http://thvideo.tv/v/th%s' % (video_id), video_id)
+        title = self._og_search_title(webpage)
+        display_id = 'th%s' % video_id
+        thumbnail = self._og_search_thumbnail(webpage)
+        description = self._og_search_description(webpage)
+        # The date lookup is non-fatal, so it may come back empty; only
+        # convert a value that was actually found instead of relying on
+        # how unified_strdate treats a missing one.
+        date_published = self._html_search_regex(
+            r'span itemprop="datePublished" content="(.*?)">', webpage,
+            'upload date', fatal=False)
+        upload_date = unified_strdate(date_published) if date_published else None
+
+        return {
+            'id': video_id,
+            'ext': 'mp4',
+            'url': video_url,
+            'title': title,
+            'display_id': display_id,
+            'thumbnail': thumbnail,
+            'description': description,
+            'upload_date': upload_date
+        }
diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py
index 08a48c05a..64a1e9030 100644
--- a/youtube_dl/extractor/tube8.py
+++ b/youtube_dl/extractor/tube8.py
@@ -14,27 +14,35 @@ from ..aes import aes_decrypt_text
class Tube8IE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/){2}(?P<id>\d+)'
- _TEST = {
- 'url': 'http://www.tube8.com/teen/kasia-music-video/229795/',
- 'md5': '44bf12b98313827dd52d35b8706a4ea0',
- 'info_dict': {
- 'id': '229795',
- 'ext': 'mp4',
- 'description': 'hot teen Kasia grinding',
- 'uploader': 'unknown',
- 'title': 'Kasia music video',
- 'age_limit': 18,
- }
- }
+ _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/(?P<id>\d+)'
+ _TESTS = [
+ {
+ 'url': 'http://www.tube8.com/teen/kasia-music-video/229795/',
+ 'md5': '44bf12b98313827dd52d35b8706a4ea0',
+ 'info_dict': {
+ 'id': '229795',
+ 'display_id': 'kasia-music-video',
+ 'ext': 'mp4',
+ 'description': 'hot teen Kasia grinding',
+ 'uploader': 'unknown',
+ 'title': 'Kasia music video',
+ 'age_limit': 18,
+ }
+ },
+ {
+ 'url': 'http://www.tube8.com/shemale/teen/blonde-cd-gets-kidnapped-by-two-blacks-and-punished-for-being-a-slutty-girl/19569151/',
+ 'only_matching': True,
+ },
+ ]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
req = compat_urllib_request.Request(url)
req.add_header('Cookie', 'age_verified=1')
- webpage = self._download_webpage(req, video_id)
+ webpage = self._download_webpage(req, display_id)
flashvars = json.loads(self._html_search_regex(
r'var flashvars\s*=\s*({.+?})', webpage, 'flashvars'))
@@ -70,6 +78,7 @@ class Tube8IE(InfoExtractor):
return {
'id': video_id,
+ 'display_id': display_id,
'url': video_url,
'title': title,
'description': description,
diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py
index df115d251..ebd64f0f5 100644
--- a/youtube_dl/extractor/vbox7.py
+++ b/youtube_dl/extractor/vbox7.py
@@ -19,7 +19,7 @@ class Vbox7IE(InfoExtractor):
'md5': '99f65c0c9ef9b682b97313e052734c3f',
'info_dict': {
'id': '249bb972c2',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Смях! Чудо - чист за секунди - Скрита камера',
},
}
@@ -50,7 +50,6 @@ class Vbox7IE(InfoExtractor):
return {
'id': video_id,
'url': final_url,
- 'ext': 'flv',
'title': title,
'thumbnail': thumbnail_url,
}
diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py
index d2ffd1b6b..5b1a3ec78 100644
--- a/youtube_dl/extractor/vevo.py
+++ b/youtube_dl/extractor/vevo.py
@@ -5,7 +5,7 @@ import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
- compat_HTTPError,
+ compat_urllib_request,
ExtractorError,
)
@@ -24,7 +24,7 @@ class VevoIE(InfoExtractor):
_TESTS = [{
'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
- "md5": "06bea460acb744eab74a9d7dcb4bfd61",
+ "md5": "95ee28ee45e70130e3ab02b0f579ae23",
'info_dict': {
'id': 'GB1101300280',
'ext': 'mp4',
@@ -40,7 +40,7 @@ class VevoIE(InfoExtractor):
}, {
'note': 'v3 SMIL format',
'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923',
- 'md5': '893ec0e0d4426a1d96c01de8f2bdff58',
+ 'md5': 'f6ab09b034f8c22969020b042e5ac7fc',
'info_dict': {
'id': 'USUV71302923',
'ext': 'mp4',
@@ -69,6 +69,21 @@ class VevoIE(InfoExtractor):
}]
_SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/'
+    def _real_initialize(self):
+        """Request an OAuth access token and store it in ``self._oauth_token``.
+
+        Nothing here is fatal: the token is set to None when the download
+        returns False, and otherwise to whatever the non-fatal regex
+        search over the response yields.
+        """
+        auth_request = compat_urllib_request.Request(
+            'http://www.vevo.com/auth', data=b'')
+        auth_page = self._download_webpage(
+            auth_request, None,
+            note='Retrieving oauth token',
+            errnote='Unable to retrieve oauth token',
+            fatal=False)
+        if auth_page is False:
+            self._oauth_token = None
+            return
+        self._oauth_token = self._search_regex(
+            r'access_token":\s*"([^"]+)"',
+            auth_page, 'access token', fatal=False)
+
def _formats_from_json(self, video_info):
last_version = {'version': -1}
for version in video_info['videoVersions']:
@@ -129,6 +144,26 @@ class VevoIE(InfoExtractor):
})
return formats
+    def _download_api_formats(self, video_id):
+        """Return the HLS formats listed by the v2 API for *video_id*.
+
+        An empty list is returned (after a warning) when no OAuth token is
+        available, and also when the stream list could not be downloaded
+        or comes back empty.
+        """
+        if not self._oauth_token:
+            self._downloader.report_warning(
+                'No oauth token available, skipping API HLS download')
+            return []
+
+        api_url = 'https://apiv2.vevo.com/video/%s/streams/hls?token=%s' % (
+            video_id, self._oauth_token)
+        api_data = self._download_json(
+            api_url, video_id,
+            note='Downloading HLS formats',
+            errnote='Failed to download HLS format list', fatal=False)
+        # Covers a failed (non-fatal) download as well as an empty stream
+        # list, which would otherwise fail on the [0] lookup below.
+        if not api_data:
+            return []
+
+        m3u8_url = api_data[0]['url']
+        return self._extract_m3u8_formats(
+            m3u8_url, video_id, entry_protocol='m3u8_native', ext='mp4',
+            preference=0)
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
@@ -152,30 +187,8 @@ class VevoIE(InfoExtractor):
else:
age_limit = None
- # Download SMIL
- smil_blocks = sorted((
- f for f in video_info['videoVersions']
- if f['sourceType'] == 13),
- key=lambda f: f['version'])
-
- smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % (
- self._SMIL_BASE_URL, video_id, video_id.lower())
- if smil_blocks:
- smil_url_m = self._search_regex(
- r'url="([^"]+)"', smil_blocks[-1]['data'], 'SMIL URL',
- fatal=False)
- if smil_url_m is not None:
- smil_url = smil_url_m
-
- try:
- smil_xml = self._download_webpage(smil_url, video_id,
- 'Downloading SMIL info')
- formats.extend(self._formats_from_smil(smil_xml))
- except ExtractorError as ee:
- if not isinstance(ee.cause, compat_HTTPError):
- raise
- self._downloader.report_warning(
- 'Cannot download SMIL information, falling back to JSON ..')
+ # Download via HLS API
+ formats.extend(self._download_api_formats(video_id))
self._sort_formats(formats)
timestamp_ms = int(self._search_regex(
diff --git a/youtube_dl/extractor/vube.py b/youtube_dl/extractor/vube.py
index 2544c24bd..1b2f731e9 100644
--- a/youtube_dl/extractor/vube.py
+++ b/youtube_dl/extractor/vube.py
@@ -6,6 +6,7 @@ from .common import InfoExtractor
from ..utils import (
int_or_none,
compat_str,
+ ExtractorError,
)
@@ -16,6 +17,24 @@ class VubeIE(InfoExtractor):
_TESTS = [
{
+ 'url': 'http://vube.com/trending/William+Wei/Y8NUZ69Tf7?t=s',
+ 'md5': 'e7aabe1f8f1aa826b9e4735e1f9cee42',
+ 'info_dict': {
+ 'id': 'Y8NUZ69Tf7',
+ 'ext': 'mp4',
+ 'title': 'Best Drummer Ever [HD]',
+ 'description': 'md5:2d63c4b277b85c2277761c2cf7337d71',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'uploader': 'William',
+ 'timestamp': 1406876915,
+ 'upload_date': '20140801',
+ 'duration': 258.051,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'comment_count': int,
+ 'categories': ['amazing', 'hd', 'best drummer ever', 'william wei', 'bucket drumming', 'street drummer', 'epic street drumming'],
+ },
+ }, {
'url': 'http://vube.com/Chiara+Grispo+Video+Channel/YL2qNPkqon',
'md5': 'db7aba89d4603dadd627e9d1973946fe',
'info_dict': {
@@ -32,7 +51,8 @@ class VubeIE(InfoExtractor):
'dislike_count': int,
'comment_count': int,
'categories': ['pop', 'music', 'cover', 'singing', 'jessie j', 'price tag', 'chiara grispo'],
- }
+ },
+ 'skip': 'Removed due to DMCA',
},
{
'url': 'http://vube.com/SerainaMusic/my-7-year-old-sister-and-i-singing-alive-by-krewella/UeBhTudbfS?t=s&n=1',
@@ -51,7 +71,8 @@ class VubeIE(InfoExtractor):
'dislike_count': int,
'comment_count': int,
'categories': ['seraina', 'jessica', 'krewella', 'alive'],
- }
+ },
+ 'skip': 'Removed due to DMCA',
}, {
'url': 'http://vube.com/vote/Siren+Gene/0nmsMY5vEq?n=2&t=s',
'md5': '0584fc13b50f887127d9d1007589d27f',
@@ -69,7 +90,8 @@ class VubeIE(InfoExtractor):
'dislike_count': int,
'comment_count': int,
'categories': ['let it go', 'cover', 'idina menzel', 'frozen', 'singing', 'disney', 'siren gene'],
- }
+ },
+ 'skip': 'Removed due to DMCA',
}
]
@@ -102,6 +124,11 @@ class VubeIE(InfoExtractor):
self._sort_formats(formats)
+ if not formats and video.get('vst') == 'dmca':
+ raise ExtractorError(
+ 'This video has been removed in response to a complaint received under the US Digital Millennium Copyright Act.',
+ expected=True)
+
title = video['title']
description = video.get('description')
thumbnail = self._proto_relative_url(video.get('thumbnail_src'), scheme='http:')
diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py
index 46b4d9133..bf9e40bad 100644
--- a/youtube_dl/extractor/wat.py
+++ b/youtube_dl/extractor/wat.py
@@ -5,7 +5,10 @@ import re
import hashlib
from .common import InfoExtractor
-from ..utils import unified_strdate
+from ..utils import (
+ ExtractorError,
+ unified_strdate,
+)
class WatIE(InfoExtractor):
@@ -37,6 +40,7 @@ class WatIE(InfoExtractor):
'upload_date': '20140816',
'duration': 2910,
},
+ 'skip': "Ce contenu n'est pas disponible pour l'instant.",
},
]
@@ -57,6 +61,11 @@ class WatIE(InfoExtractor):
video_info = self.download_video_info(real_id)
+ error_desc = video_info.get('error_desc')
+ if error_desc:
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, error_desc), expected=True)
+
geo_list = video_info.get('geoList')
country = geo_list[0] if geo_list else ''
diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py
index e6bfa9e14..748443f81 100644
--- a/youtube_dl/extractor/wistia.py
+++ b/youtube_dl/extractor/wistia.py
@@ -1,13 +1,14 @@
from __future__ import unicode_literals
-import json
import re
from .common import InfoExtractor
+from ..utils import ExtractorError, compat_urllib_request
class WistiaIE(InfoExtractor):
_VALID_URL = r'https?://(?:fast\.)?wistia\.net/embed/iframe/(?P<id>[a-z0-9]+)'
+ _API_URL = 'http://fast.wistia.com/embed/medias/{0:}.json'
_TEST = {
'url': 'http://fast.wistia.net/embed/iframe/sh7fpupwlt',
@@ -24,11 +25,13 @@ class WistiaIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- webpage = self._download_webpage(url, video_id)
- data_json = self._html_search_regex(
- r'Wistia\.iframeInit\((.*?), {}\);', webpage, 'video data')
-
- data = json.loads(data_json)
+ request = compat_urllib_request.Request(self._API_URL.format(video_id))
+ request.add_header('Referer', url) # Some videos require this.
+ data_json = self._download_json(request, video_id)
+ if data_json.get('error'):
+ raise ExtractorError('Error while getting the playlist',
+ expected=True)
+ data = data_json['media']
formats = []
thumbnails = []
diff --git a/youtube_dl/extractor/ynet.py b/youtube_dl/extractor/ynet.py
new file mode 100644
index 000000000..24872861a
--- /dev/null
+++ b/youtube_dl/extractor/ynet.py
@@ -0,0 +1,54 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import compat_urllib_parse
+
+
+class YnetIE(InfoExtractor):
+    """Extractor for video pages on ynet.co.il.
+
+    The player configuration is carried, URL-encoded, in the page's
+    og:video URL as a ``config={...}`` JSON value; its clip URL is the
+    f4m manifest the formats are read from.
+    """
+    _VALID_URL = r'http://(?:.+?\.)?ynet\.co\.il/(?:.+?/)?0,7340,(?P<id>L(?:-[0-9]+)+),00\.html'
+    _TESTS = [
+        {
+            'url': 'http://hot.ynet.co.il/home/0,7340,L-11659-99244,00.html',
+            'md5': '002b44ee2f33d50363a1c153bed524cf',
+            'info_dict': {
+                'id': 'L-11659-99244',
+                'ext': 'flv',
+                'title': 'איש לא יודע מאיפה באנו',
+                # Raw string: '\.' is not a valid escape in a normal literal.
+                'thumbnail': r're:^https?://.*\.jpg',
+            }
+        }, {
+            'url': 'http://hot.ynet.co.il/home/0,7340,L-8859-84418,00.html',
+            'md5': '6455046ae1b48cf7e2b7cae285e53a16',
+            'info_dict': {
+                'id': 'L-8859-84418',
+                'ext': 'flv',
+                'title': "צפו: הנשיקה הלוהטת של תורגי' ויוליה פלוטקין",
+                'thumbnail': r're:^https?://.*\.jpg',
+            }
+        }
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        content = compat_urllib_parse.unquote_plus(self._og_search_video_url(webpage))
+        config = json.loads(self._search_regex(r'config=({.+?})$', content, 'video config'))
+        f4m_url = config['clip']['url']
+        title = self._og_search_title(webpage)
+        # Strip the site prefix and the quotes wrapped around the title.
+        m = re.search(r'ynet - HOT -- (["\']+)(?P<title>.+?)\1', title)
+        if m:
+            title = m.group('title')
+
+        # Sort the manifest's formats before returning them, as the other
+        # f4m-based extractors do.
+        formats = self._extract_f4m_formats(f4m_url, video_id)
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
\ No newline at end of file
diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py
index a8fd40c83..48d47a245 100644
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@@ -1,6 +1,7 @@
# coding: utf-8
-import json
+from __future__ import unicode_literals
+
import math
import random
import re
@@ -13,18 +14,25 @@ from ..utils import (
class YoukuIE(InfoExtractor):
- _VALID_URL = r'(?:(?:http://)?(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|youku:)(?P<ID>[A-Za-z0-9]+)(?:\.html|/v\.swf|)'
- _TEST = {
- u"url": u"http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html",
- u"file": u"XNDgyMDQ2NTQw_part00.flv",
- u"md5": u"ffe3f2e435663dc2d1eea34faeff5b5b",
- u"params": {u"test": False},
- u"info_dict": {
- u"title": u"youtube-dl test video \"'/\\ä↭𝕐"
+ _VALID_URL = r'''(?x)
+ (?:
+ http://(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|
+ youku:)
+ (?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|)
+ '''
+ _TEST = {
+ 'url': 'http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html',
+ 'md5': 'ffe3f2e435663dc2d1eea34faeff5b5b',
+ 'params': {
+ 'test': False
+ },
+ 'info_dict': {
+ 'id': 'XNDgyMDQ2NTQw_part00',
+ 'ext': 'flv',
+ 'title': 'youtube-dl test video "\'/\\ä↭𝕐'
}
}
-
def _gen_sid(self):
nowTime = int(time.time() * 1000)
random1 = random.randint(1000,1998)
@@ -55,49 +63,42 @@ class YoukuIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
- video_id = mobj.group('ID')
+ video_id = mobj.group('id')
info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
- jsondata = self._download_webpage(info_url, video_id)
-
- self.report_extraction(video_id)
- try:
- config = json.loads(jsondata)
- error_code = config['data'][0].get('error_code')
- if error_code:
- # -8 means blocked outside China.
- error = config['data'][0].get('error') # Chinese and English, separated by newline.
- raise ExtractorError(error or u'Server reported error %i' % error_code,
- expected=True)
-
- video_title = config['data'][0]['title']
- seed = config['data'][0]['seed']
-
- format = self._downloader.params.get('format', None)
- supported_format = list(config['data'][0]['streamfileids'].keys())
-
- if format is None or format == 'best':
- if 'hd2' in supported_format:
- format = 'hd2'
- else:
- format = 'flv'
- ext = u'flv'
- elif format == 'worst':
- format = 'mp4'
- ext = u'mp4'
- else:
- format = 'flv'
- ext = u'flv'
+ config = self._download_json(info_url, video_id)
+
+ error_code = config['data'][0].get('error_code')
+ if error_code:
+ # -8 means blocked outside China.
+ error = config['data'][0].get('error') # Chinese and English, separated by newline.
+ raise ExtractorError(error or 'Server reported error %i' % error_code,
+ expected=True)
+ video_title = config['data'][0]['title']
+ seed = config['data'][0]['seed']
- fileid = config['data'][0]['streamfileids'][format]
- keys = [s['k'] for s in config['data'][0]['segs'][format]]
- # segs is usually a dictionary, but an empty *list* if an error occured.
- except (UnicodeDecodeError, ValueError, KeyError):
- raise ExtractorError(u'Unable to extract info section')
+ format = self._downloader.params.get('format', None)
+ supported_format = list(config['data'][0]['streamfileids'].keys())
+
+ # TODO proper format selection
+ if format is None or format == 'best':
+ if 'hd2' in supported_format:
+ format = 'hd2'
+ else:
+ format = 'flv'
+ ext = 'flv'
+ elif format == 'worst':
+ format = 'mp4'
+ ext = 'mp4'
+ else:
+ format = 'flv'
+ ext = 'flv'
+
+ fileid = config['data'][0]['streamfileids'][format]
+ keys = [s['k'] for s in config['data'][0]['segs'][format]]
+ # segs is usually a dictionary, but an empty *list* if an error occured.
files_info=[]
sid = self._gen_sid()
@@ -106,9 +107,8 @@ class YoukuIE(InfoExtractor):
#column 8,9 of fileid represent the segment number
#fileid[7:9] should be changed
for index, key in enumerate(keys):
-
temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
- download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
+ download_url = 'http://k.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
info = {
'id': '%s_part%02d' % (video_id, index),
diff --git a/youtube_dl/extractor/yourupload.py b/youtube_dl/extractor/yourupload.py
new file mode 100644
index 000000000..40fc4165f
--- /dev/null
+++ b/youtube_dl/extractor/yourupload.py
@@ -0,0 +1,58 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class YourUploadIE(InfoExtractor):
+    """Extractor for yourupload.com watch pages and their embed hosts.
+
+    Whatever URL form matched, the data is always read from the Open Graph
+    tags of the embed page on embed.yucache.net.
+    """
+    _VALID_URL = r'''(?x)https?://(?:www\.)?
+        (?:yourupload\.com/watch|
+           embed\.yourupload\.com|
+           embed\.yucache\.net
+        )/(?P<id>[A-Za-z0-9]+)
+        '''
+    _TESTS = [
+        {
+            'url': 'http://yourupload.com/watch/14i14h',
+            'md5': 'bf5c2f95c4c917536e80936af7bc51e1',
+            'info_dict': {
+                'id': '14i14h',
+                'ext': 'mp4',
+                'title': 'BigBuckBunny_320x180.mp4',
+                # Raw string: '\.' is not a valid escape in a normal literal.
+                'thumbnail': r're:^https?://.*\.jpe?g',
+            }
+        },
+        {
+            'url': 'http://embed.yourupload.com/14i14h',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://embed.yucache.net/14i14h?client_file_id=803349',
+            'only_matching': True,
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        # Separate names for the embed page and the media URL, so the
+        # incoming url argument is never overwritten.
+        embed_url = 'http://embed.yucache.net/%s' % video_id
+        webpage = self._download_webpage(embed_url, video_id)
+
+        title = self._og_search_title(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+        video_url = self._og_search_video_url(webpage)
+
+        formats = [{
+            'format_id': 'sd',
+            'url': video_url,
+        }]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index b54c69122..99198e380 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -46,7 +46,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
def _set_language(self):
return bool(self._download_webpage(
self._LANG_URL, None,
- note=u'Setting language', errnote='unable to set language',
+ note='Setting language', errnote='unable to set language',
fatal=False))
def _login(self):
@@ -61,13 +61,13 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
# No authentication to be performed
if username is None:
if self._LOGIN_REQUIRED:
- raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
+ raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
return True
login_page = self._download_webpage(
self._LOGIN_URL, None,
- note=u'Downloading login page',
- errnote=u'unable to fetch login page', fatal=False)
+ note='Downloading login page',
+ errnote='unable to fetch login page', fatal=False)
if login_page is False:
return
@@ -105,12 +105,12 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
login_results = self._download_webpage(
req, None,
- note=u'Logging in', errnote=u'unable to log in', fatal=False)
+ note='Logging in', errnote='unable to log in', fatal=False)
if login_results is False:
return False
if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
- raise ExtractorError(u'Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
+ raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
# Two-Factor
# TODO add SMS and phone call support - these require making a request and then prompting the user
@@ -119,19 +119,19 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
tfa_code = self._get_tfa_info()
if tfa_code is None:
- self._downloader.report_warning(u'Two-factor authentication required. Provide it with --twofactor <code>')
- self._downloader.report_warning(u'(Note that only TOTP (Google Authenticator App) codes work at this time.)')
+ self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
+ self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
return False
# Unlike the first login form, secTok and timeStmp are both required for the TFA form
match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
if match is None:
- self._downloader.report_warning(u'Failed to get secTok - did the page structure change?')
+ self._downloader.report_warning('Failed to get secTok - did the page structure change?')
secTok = match.group(1)
match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
if match is None:
- self._downloader.report_warning(u'Failed to get timeStmp - did the page structure change?')
+ self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
timeStmp = match.group(1)
tfa_form_strs = {
@@ -155,23 +155,23 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
tfa_results = self._download_webpage(
tfa_req, None,
- note=u'Submitting TFA code', errnote=u'unable to submit tfa', fatal=False)
+ note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)
if tfa_results is False:
return False
if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
- self._downloader.report_warning(u'Two-factor code expired. Please try again, or use a one-use backup code instead.')
+ self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
return False
if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
- self._downloader.report_warning(u'unable to log in - did the page structure change?')
+ self._downloader.report_warning('unable to log in - did the page structure change?')
return False
if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
- self._downloader.report_warning(u'Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
+ self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
return False
if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
- self._downloader.report_warning(u'unable to log in: bad username or password')
+ self._downloader.report_warning('unable to log in: bad username or password')
return False
return True
@@ -185,7 +185,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
self._download_webpage(
req, None,
- note=u'Confirming age', errnote=u'Unable to confirm age')
+ note='Confirming age', errnote='Unable to confirm age')
return True
def _real_initialize(self):
@@ -211,7 +211,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID:
- (?:(?:v|embed|e)/) # v/ or embed/ or e/
+ (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
|(?: # or the v= param in all its forms
(?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
(?:\?|\#!?) # the params delimiter ? or # or #!
@@ -307,69 +307,74 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
IE_NAME = 'youtube'
_TESTS = [
{
- u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
- u"file": u"BaW_jenozKc.mp4",
- u"info_dict": {
- u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
- u"uploader": u"Philipp Hagemeister",
- u"uploader_id": u"phihag",
- u"upload_date": u"20121002",
- u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
- u"categories": [u'Science & Technology'],
+ 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
+ 'info_dict': {
+ 'id': 'BaW_jenozKc',
+ 'ext': 'mp4',
+ 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
+ 'uploader': 'Philipp Hagemeister',
+ 'uploader_id': 'phihag',
+ 'upload_date': '20121002',
+ 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
+ 'categories': ['Science & Technology'],
'like_count': int,
'dislike_count': int,
}
},
{
- u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
- u"file": u"UxxajLWwzqY.mp4",
- u"note": u"Test generic use_cipher_signature video (#897)",
- u"info_dict": {
- u"upload_date": u"20120506",
- u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
- u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
- u"uploader": u"Icona Pop",
- u"uploader_id": u"IconaPop"
+ 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
+ 'note': 'Test generic use_cipher_signature video (#897)',
+ 'info_dict': {
+ 'id': 'UxxajLWwzqY',
+ 'ext': 'mp4',
+ 'upload_date': '20120506',
+ 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
+ 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
+ 'uploader': 'Icona Pop',
+ 'uploader_id': 'IconaPop',
}
},
{
- u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
- u"file": u"07FYdnEawAQ.mp4",
- u"note": u"Test VEVO video with age protection (#956)",
- u"info_dict": {
- u"upload_date": u"20130703",
- u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
- u"description": u"md5:64249768eec3bc4276236606ea996373",
- u"uploader": u"justintimberlakeVEVO",
- u"uploader_id": u"justintimberlakeVEVO"
+ 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
+ 'note': 'Test VEVO video with age protection (#956)',
+ 'info_dict': {
+ 'id': '07FYdnEawAQ',
+ 'ext': 'mp4',
+ 'upload_date': '20130703',
+ 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
+ 'description': 'md5:64249768eec3bc4276236606ea996373',
+ 'uploader': 'justintimberlakeVEVO',
+ 'uploader_id': 'justintimberlakeVEVO',
}
},
{
- u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
- u"file": u"yZIXLfi8CZQ.mp4",
- u"note": u"Embed-only video (#1746)",
- u"info_dict": {
- u"upload_date": u"20120608",
- u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
- u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
- u"uploader": u"SET India",
- u"uploader_id": u"setindia"
+ 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
+ 'note': 'Embed-only video (#1746)',
+ 'info_dict': {
+ 'id': 'yZIXLfi8CZQ',
+ 'ext': 'mp4',
+ 'upload_date': '20120608',
+ 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
+ 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
+ 'uploader': 'SET India',
+ 'uploader_id': 'setindia'
}
},
{
- u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
- u"file": u"a9LDPn-MO4I.m4a",
- u"note": u"256k DASH audio (format 141) via DASH manifest",
- u"info_dict": {
- u"upload_date": "20121002",
- u"uploader_id": "8KVIDEO",
- u"description": '',
- u"uploader": "8KVIDEO",
- u"title": "UHDTV TEST 8K VIDEO.mp4"
+ 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
+ 'note': '256k DASH audio (format 141) via DASH manifest',
+ 'info_dict': {
+ 'id': 'a9LDPn-MO4I',
+ 'ext': 'm4a',
+ 'upload_date': '20121002',
+ 'uploader_id': '8KVIDEO',
+ 'description': '',
+ 'uploader': '8KVIDEO',
+ 'title': 'UHDTV TEST 8K VIDEO.mp4'
},
- u"params": {
- u"youtube_include_dash_manifest": True,
- u"format": "141",
+ 'params': {
+ 'youtube_include_dash_manifest': True,
+ 'format': '141',
},
},
# DASH manifest with encrypted signature
@@ -384,7 +389,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'uploader_id': 'AfrojackVEVO',
'upload_date': '20131011',
},
- u"params": {
+ 'params': {
'youtube_include_dash_manifest': True,
'format': '141',
},
@@ -397,19 +402,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
def report_video_info_webpage_download(self, video_id):
"""Report attempt to download video info webpage."""
- self.to_screen(u'%s: Downloading video info webpage' % video_id)
+ self.to_screen('%s: Downloading video info webpage' % video_id)
def report_information_extraction(self, video_id):
"""Report attempt to extract video information."""
- self.to_screen(u'%s: Extracting video information' % video_id)
+ self.to_screen('%s: Extracting video information' % video_id)
def report_unavailable_format(self, video_id, format):
"""Report extracted video URL."""
- self.to_screen(u'%s: Format %s not available' % (video_id, format))
+ self.to_screen('%s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
"""Indicate the download will use the RTMP protocol."""
- self.to_screen(u'RTMP download detected')
+ self.to_screen('RTMP download detected')
def _signature_cache_id(self, example_sig):
""" Return a string representation of a signature """
@@ -429,21 +434,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
player_type, player_id, self._signature_cache_id(example_sig))
assert os.path.basename(func_id) == func_id
- cache_spec = self._downloader.cache.load(u'youtube-sigfuncs', func_id)
+ cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
if cache_spec is not None:
return lambda s: ''.join(s[i] for i in cache_spec)
if player_type == 'js':
code = self._download_webpage(
player_url, video_id,
- note=u'Downloading %s player %s' % (player_type, player_id),
- errnote=u'Download of %s failed' % player_url)
+ note='Downloading %s player %s' % (player_type, player_id),
+ errnote='Download of %s failed' % player_url)
res = self._parse_sig_js(code)
elif player_type == 'swf':
urlh = self._request_webpage(
player_url, video_id,
- note=u'Downloading %s player %s' % (player_type, player_id),
- errnote=u'Download of %s failed' % player_url)
+ note='Downloading %s player %s' % (player_type, player_id),
+ errnote='Download of %s failed' % player_url)
code = urlh.read()
res = self._parse_sig_swf(code)
else:
@@ -454,15 +459,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
cache_res = res(test_string)
cache_spec = [ord(c) for c in cache_res]
- self._downloader.cache.store(u'youtube-sigfuncs', func_id, cache_spec)
+ self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
return res
def _print_sig_code(self, func, example_sig):
def gen_sig_code(idxs):
def _genslice(start, end, step):
starts = '' if start == 0 else str(start)
- ends = (u':%d' % (end+step)) if end + step >= 0 else ':'
- steps = '' if step == 1 else (u':%d' % step)
+ ends = (':%d' % (end+step)) if end + step >= 0 else ':'
+ steps = '' if step == 1 else (':%d' % step)
return 's[%s%s%s]' % (starts, ends, steps)
step = None
@@ -492,9 +497,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
expr_code = ' + '.join(gen_sig_code(cache_spec))
signature_id_tuple = '(%s)' % (
', '.join(compat_str(len(p)) for p in example_sig.split('.')))
- code = (u'if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
+ code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
' return %s\n') % (signature_id_tuple, expr_code)
- self.to_screen(u'Extracted signature function:\n' + code)
+ self.to_screen('Extracted signature function:\n' + code)
def _parse_sig_js(self, jscode):
funcname = self._search_regex(
@@ -516,9 +521,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
"""Turn the encrypted s field into a working signature"""
if player_url is None:
- raise ExtractorError(u'Cannot decrypt signature without player_url')
+ raise ExtractorError('Cannot decrypt signature without player_url')
- if player_url.startswith(u'//'):
+ if player_url.startswith('//'):
player_url = 'https:' + player_url
try:
player_id = (player_url, self._signature_cache_id(s))
@@ -542,7 +547,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
video_id, note=False)
except ExtractorError as err:
- self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
+ self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
return {}
lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
@@ -560,7 +565,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
url = 'https://www.youtube.com/api/timedtext?' + params
sub_lang_list[lang] = url
if not sub_lang_list:
- self._downloader.report_warning(u'video doesn\'t have subtitles')
+ self._downloader.report_warning('video doesn\'t have subtitles')
return {}
return sub_lang_list
@@ -568,7 +573,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
"""We need the webpage for getting the captions url, pass it as an
argument to speed up the process."""
sub_format = self._downloader.params.get('subtitlesformat', 'srt')
- self.to_screen(u'%s: Looking for automatic captions' % video_id)
+ self.to_screen('%s: Looking for automatic captions' % video_id)
mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
err_msg = 'Couldn\'t find automatic captions for %s' % video_id
if mobj is None:
@@ -589,7 +594,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
caption_list = self._download_xml(list_url, video_id)
original_lang_node = caption_list.find('track')
if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
- self._downloader.report_warning(u'Video doesn\'t have automatic captions')
+ self._downloader.report_warning('Video doesn\'t have automatic captions')
return {}
original_lang = original_lang_node.attrib['lang_code']
@@ -615,7 +620,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
def extract_id(cls, url):
mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
+ raise ExtractorError('Invalid URL: %s' % url)
video_id = mobj.group(2)
return video_id
@@ -635,7 +640,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
def _extract_annotations(self, video_id):
url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
- return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
+ return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
def _real_extract(self, url):
proto = (
@@ -705,14 +710,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# Check for "rental" videos
if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
- raise ExtractorError(u'"rental" videos not supported')
+ raise ExtractorError('"rental" videos not supported')
# Start extracting information
self.report_information_extraction(video_id)
# uploader
if 'author' not in video_info:
- raise ExtractorError(u'Unable to extract uploader name')
+ raise ExtractorError('Unable to extract uploader name')
video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
# uploader_id
@@ -721,13 +726,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
if mobj is not None:
video_uploader_id = mobj.group(1)
else:
- self._downloader.report_warning(u'unable to extract uploader nickname')
+ self._downloader.report_warning('unable to extract uploader nickname')
# title
if 'title' in video_info:
video_title = video_info['title'][0]
else:
- self._downloader.report_warning(u'Unable to extract video title')
+ self._downloader.report_warning('Unable to extract video title')
video_title = '_'
# thumbnail image
@@ -737,7 +742,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
if m_thumb is not None:
video_thumbnail = m_thumb.group(1)
elif 'thumbnail_url' not in video_info:
- self._downloader.report_warning(u'unable to extract video thumbnail')
+ self._downloader.report_warning('unable to extract video thumbnail')
video_thumbnail = None
else: # don't panic if we can't find it
video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
@@ -791,8 +796,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
if count is not None:
return int(count.replace(',', ''))
return None
- like_count = _extract_count(u'like')
- dislike_count = _extract_count(u'dislike')
+ like_count = _extract_count('like')
+ dislike_count = _extract_count('dislike')
# subtitles
video_subtitles = self.extract_subtitles(video_id, video_webpage)
@@ -802,7 +807,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
return
if 'length_seconds' not in video_info:
- self._downloader.report_warning(u'unable to extract video duration')
+ self._downloader.report_warning('unable to extract video duration')
video_duration = None
else:
video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
@@ -823,11 +828,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# Easy way to know if the 's' value is in url_encoded_fmt_stream_map
# this signatures are encrypted
if 'url_encoded_fmt_stream_map' not in args:
- raise ValueError(u'No stream_map present') # caught below
+ raise ValueError('No stream_map present') # caught below
re_signature = re.compile(r'[&,]s=')
m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
if m_s is not None:
- self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
+ self.to_screen('%s: Encrypted signatures detected.' % video_id)
video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
m_s = re_signature.search(args.get('adaptive_fmts', ''))
if m_s is not None:
@@ -905,7 +910,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
player_desc = 'html5 player %s' % player_version
parts_sizes = self._signature_cache_id(encrypted_sig)
- self.to_screen(u'{%s} signature length %s, %s' %
+ self.to_screen('{%s} signature length %s, %s' %
(format_id, parts_sizes, player_desc))
signature = self._decrypt_signature(
@@ -920,7 +925,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
url_map = self._extract_from_m3u8(manifest_url, video_id)
formats = _map_to_format_list(url_map)
else:
- raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
+ raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
# Look for the DASH manifest
if (self._downloader.params.get('youtube_include_dash_manifest', False)):
@@ -941,9 +946,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
dash_doc = self._download_xml(
dash_manifest_url, video_id,
- note=u'Downloading DASH manifest',
- errnote=u'Could not download DASH manifest')
- for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
+ note='Downloading DASH manifest',
+ errnote='Could not download DASH manifest')
+ for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
if url_el is None:
continue
@@ -969,7 +974,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
existing_format.update(f)
except (ExtractorError, KeyError) as e:
- self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)
+ self.report_warning('Skipping DASH manifest: %s' % e, video_id)
self._sort_formats(formats)
@@ -1000,7 +1005,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
(?:\w+\.)?
youtube\.com/
(?:
- (?:course|view_play_list|my_playlists|artist|playlist|watch)
+ (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
\? (?:.*?&)*? (?:p|a|list)=
| p/
)
@@ -1056,6 +1061,20 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
'title': 'YDL_safe_search',
},
'playlist_count': 2,
+ }, {
+ 'note': 'embedded',
+ 'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
+ 'playlist_count': 4,
+ 'info_dict': {
+ 'title': 'JODA15',
+ }
+ }, {
+ 'note': 'Embedded SWF player',
+ 'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
+ 'playlist_count': 4,
+ 'info_dict': {
+ 'title': 'JODA7',
+ }
}]
def _real_initialize(self):
@@ -1090,7 +1109,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
# Extract playlist id
mobj = re.match(self._VALID_URL, url)
if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
+ raise ExtractorError('Invalid URL: %s' % url)
playlist_id = mobj.group(1) or mobj.group(2)
# Check if it's a video-specific URL
@@ -1098,16 +1117,16 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
if 'v' in query_dict:
video_id = query_dict['v'][0]
if self._downloader.params.get('noplaylist'):
- self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
+ self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
return self.url_result(video_id, 'Youtube', video_id=video_id)
else:
- self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
+ self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
if playlist_id.startswith('RD'):
# Mixes require a custom extraction process
return self._extract_mix(playlist_id)
if playlist_id.startswith('TL'):
- raise ExtractorError(u'For downloading YouTube.com top lists, use '
+ raise ExtractorError('For downloading YouTube.com top lists, use '
'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
url = self._TEMPLATE_URL % playlist_id
@@ -1152,19 +1171,28 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
class YoutubeTopListIE(YoutubePlaylistIE):
IE_NAME = 'youtube:toplist'
- IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
+ IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
' (Example: "yttoplist:music:Top Tracks")')
_VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
- _TESTS = []
+ _TESTS = [{
+ 'url': 'yttoplist:music:Trending',
+ 'playlist_mincount': 5,
+ 'skip': 'Only works for logged-in users',
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
channel = mobj.group('chann')
title = mobj.group('title')
query = compat_urllib_parse.urlencode({'title': title})
- playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query)
- channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
- link = self._html_search_regex(playlist_re, channel_page, 'list')
+ channel_page = self._download_webpage(
+ 'https://www.youtube.com/%s' % channel, title)
+ link = self._html_search_regex(
+ r'''(?x)
+ <a\s+href="([^"]+)".*?>\s*
+ <span\s+class="branded-page-module-title-text">\s*
+ <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
+ channel_page, 'list')
url = compat_urlparse.urljoin('https://www.youtube.com/', link)
video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
@@ -1190,6 +1218,11 @@ class YoutubeChannelIE(InfoExtractor):
_MORE_PAGES_INDICATOR = 'yt-uix-load-more'
_MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
IE_NAME = 'youtube:channel'
+ _TESTS = [{
+ 'note': 'paginated channel',
+ 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'playlist_mincount': 91,
+ }]
def extract_videos_from_page(self, page):
ids_in_page = []
@@ -1202,7 +1235,7 @@ class YoutubeChannelIE(InfoExtractor):
# Extract channel id
mobj = re.match(self._VALID_URL, url)
if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
+ raise ExtractorError('Invalid URL: %s' % url)
# Download channel page
channel_id = mobj.group(1)
@@ -1224,7 +1257,7 @@ class YoutubeChannelIE(InfoExtractor):
for pagenum in itertools.count(1):
url = self._MORE_PAGES_URL % (pagenum, channel_id)
page = self._download_json(
- url, channel_id, note=u'Downloading page #%s' % pagenum,
+ url, channel_id, note='Downloading page #%s' % pagenum,
transform_source=uppercase_escape)
ids_in_page = self.extract_videos_from_page(page['content_html'])
@@ -1233,7 +1266,7 @@ class YoutubeChannelIE(InfoExtractor):
if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
break
- self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
+ self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
for video_id in video_ids]
@@ -1248,6 +1281,17 @@ class YoutubeUserIE(InfoExtractor):
_GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
IE_NAME = 'youtube:user'
+ _TESTS = [{
+ 'url': 'https://www.youtube.com/user/TheLinuxFoundation',
+ 'playlist_mincount': 320,
+ 'info_dict': {
+ 'title': 'TheLinuxFoundation',
+ }
+ }, {
+ 'url': 'ytuser:phihag',
+ 'only_matching': True,
+ }]
+
@classmethod
def suitable(cls, url):
# Don't return True if the url can be extracted with other youtube
@@ -1260,7 +1304,7 @@ class YoutubeUserIE(InfoExtractor):
# Extract username
mobj = re.match(self._VALID_URL, url)
if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
+ raise ExtractorError('Invalid URL: %s' % url)
username = mobj.group(1)
@@ -1281,7 +1325,7 @@ class YoutubeUserIE(InfoExtractor):
try:
response = json.loads(page)
except ValueError as err:
- raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
+ raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
if 'entry' not in response['feed']:
return
@@ -1322,9 +1366,9 @@ class YoutubeSearchIE(SearchInfoExtractor):
compat_urllib_parse.quote_plus(query.encode('utf-8')),
(PAGE_SIZE * pagenum) + 1)
data_json = self._download_webpage(
- result_url, video_id=u'query "%s"' % query,
- note=u'Downloading page %s' % (pagenum + 1),
- errnote=u'Unable to download API page')
+ result_url, video_id='query "%s"' % query,
+ note='Downloading page %s' % (pagenum + 1),
+ errnote='Unable to download API page')
data = json.loads(data_json)
api_response = data['data']
@@ -1356,6 +1400,13 @@ class YoutubeSearchURLIE(InfoExtractor):
IE_DESC = 'YouTube.com search URLs'
IE_NAME = 'youtube:search_url'
_VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
+ _TESTS = [{
+ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
+ 'playlist_mincount': 5,
+ 'info_dict': {
+ 'title': 'youtube-dl test video',
+ }
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -1390,17 +1441,38 @@ class YoutubeSearchURLIE(InfoExtractor):
class YoutubeShowIE(InfoExtractor):
IE_DESC = 'YouTube.com (multi-season) shows'
- _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
+ _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
IE_NAME = 'youtube:show'
+ _TESTS = [{
+ 'url': 'http://www.youtube.com/show/airdisasters',
+ 'playlist_mincount': 3,
+ 'info_dict': {
+ 'id': 'airdisasters',
+ 'title': 'Air Disasters',
+ }
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- show_name = mobj.group(1)
- webpage = self._download_webpage(url, show_name, 'Downloading show webpage')
+ playlist_id = mobj.group('id')
+ webpage = self._download_webpage(
+ url, playlist_id, 'Downloading show webpage')
# There's one playlist for each season of the show
m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
- self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
- return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
+ self.to_screen('%s: Found %s seasons' % (playlist_id, len(m_seasons)))
+ entries = [
+ self.url_result(
+ 'https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
+ for season in m_seasons
+ ]
+ title = self._og_search_title(webpage, fatal=False)
+
+ return {
+ '_type': 'playlist',
+ 'id': playlist_id,
+ 'title': title,
+ 'entries': entries,
+ }
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):