about summary refs log tree commit diff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- youtube_dl/extractor/__init__.py 1
-rw-r--r-- youtube_dl/extractor/addanime.py 5
-rw-r--r-- youtube_dl/extractor/brightcove.py 19
-rw-r--r-- youtube_dl/extractor/channel9.py 35
-rw-r--r-- youtube_dl/extractor/cinemassacre.py 81
-rw-r--r-- youtube_dl/extractor/cloudy.py 10
-rw-r--r-- youtube_dl/extractor/cnn.py 6
-rw-r--r-- youtube_dl/extractor/common.py 24
-rw-r--r-- youtube_dl/extractor/crunchyroll.py 1
-rw-r--r-- youtube_dl/extractor/dropbox.py 3
-rw-r--r-- youtube_dl/extractor/facebook.py 4
-rw-r--r-- youtube_dl/extractor/gamespot.py 15
-rw-r--r-- youtube_dl/extractor/generic.py 21
-rw-r--r-- youtube_dl/extractor/globo.py 8
-rw-r--r-- youtube_dl/extractor/goshgay.py 43
-rw-r--r-- youtube_dl/extractor/heise.py 16
-rw-r--r-- youtube_dl/extractor/imdb.py 12
-rw-r--r-- youtube_dl/extractor/izlesene.py 15
-rw-r--r-- youtube_dl/extractor/laola1tv.py 1
-rw-r--r-- youtube_dl/extractor/myvideo.py 5
-rw-r--r-- youtube_dl/extractor/niconico.py 4
-rw-r--r-- youtube_dl/extractor/played.py 7
-rw-r--r-- youtube_dl/extractor/ro220.py 2
-rw-r--r-- youtube_dl/extractor/trutube.py 2
-rw-r--r-- youtube_dl/extractor/ustream.py 3
-rw-r--r-- youtube_dl/extractor/vice.py 38
-rw-r--r-- youtube_dl/extractor/vimeo.py 4
-rw-r--r-- youtube_dl/extractor/wimp.py 2
-rw-r--r-- youtube_dl/extractor/youtube.py 10
29 files changed, 245 insertions, 152 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 3f85c99cd..3c1807f15 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -421,6 +421,7 @@ from .vesti import VestiIE
from .vevo import VevoIE
from .vgtv import VGTVIE
from .vh1 import VH1IE
+from .vice import ViceIE
from .viddler import ViddlerIE
from .videobam import VideoBamIE
from .videodetective import VideoDetectiveIE
diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py
index fcf296057..11f149f9e 100644
--- a/youtube_dl/extractor/addanime.py
+++ b/youtube_dl/extractor/addanime.py
@@ -3,12 +3,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_HTTPError,
compat_str,
compat_urllib_parse,
compat_urllib_parse_urlparse,
-
+)
+from ..utils import (
ExtractorError,
)
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index ad22cbafd..a6920685e 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -14,6 +14,7 @@ from ..utils import (
compat_str,
compat_urllib_request,
compat_parse_qs,
+ compat_urllib_parse_urlparse,
determine_ext,
ExtractorError,
@@ -23,7 +24,7 @@ from ..utils import (
class BrightcoveIE(InfoExtractor):
- _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)'
+ _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*?\?(?P<query>.*)'
_FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
_TESTS = [
@@ -260,11 +261,19 @@ class BrightcoveIE(InfoExtractor):
formats = []
for rend in renditions:
url = rend['defaultURL']
+ if not url:
+ continue
if rend['remote']:
- # This type of renditions are served through akamaihd.net,
- # but they don't use f4m manifests
- url = url.replace('control/', '') + '?&v=3.3.0&fp=13&r=FEEFJ&g=RTSJIMBMPFPB'
- ext = 'flv'
+ url_comp = compat_urllib_parse_urlparse(url)
+ if url_comp.path.endswith('.m3u8'):
+ formats.extend(
+ self._extract_m3u8_formats(url, info['id'], 'mp4'))
+ continue
+ elif 'akamaihd.net' in url_comp.netloc:
+ # This type of renditions are served through
+ # akamaihd.net, but they don't use f4m manifests
+ url = url.replace('control/', '') + '?&v=3.3.0&fp=13&r=FEEFJ&g=RTSJIMBMPFPB'
+ ext = 'flv'
else:
ext = determine_ext(url)
size = rend.get('size')
diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py
index 4f000292b..16d800512 100644
--- a/youtube_dl/extractor/channel9.py
+++ b/youtube_dl/extractor/channel9.py
@@ -27,7 +27,7 @@ class Channel9IE(InfoExtractor):
'title': 'Developer Kick-Off Session: Stuff We Love',
'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
'duration': 4576,
- 'thumbnail': 'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
+ 'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
'session_code': 'KOS002',
'session_day': 'Day 1',
'session_room': 'Arena 1A',
@@ -43,7 +43,7 @@ class Channel9IE(InfoExtractor):
'title': 'Self-service BI with Power BI - nuclear testing',
'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
'duration': 1540,
- 'thumbnail': 'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
+ 'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
'authors': [ 'Mike Wilmot' ],
},
}
@@ -94,7 +94,7 @@ class Channel9IE(InfoExtractor):
def _extract_title(self, html):
title = self._html_search_meta('title', html, 'title')
- if title is None:
+ if title is None:
title = self._og_search_title(html)
TITLE_SUFFIX = ' (Channel 9)'
if title is not None and title.endswith(TITLE_SUFFIX):
@@ -115,7 +115,7 @@ class Channel9IE(InfoExtractor):
return self._html_search_meta('description', html, 'description')
def _extract_duration(self, html):
- m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
+ m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
def _extract_slides(self, html):
@@ -167,7 +167,7 @@ class Channel9IE(InfoExtractor):
return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
def _extract_content(self, html, content_path):
- # Look for downloadable content
+ # Look for downloadable content
formats = self._formats_from_html(html)
slides = self._extract_slides(html)
zip_ = self._extract_zip(html)
@@ -258,16 +258,17 @@ class Channel9IE(InfoExtractor):
webpage = self._download_webpage(url, content_path, 'Downloading web page')
- page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
- if page_type_m is None:
- raise ExtractorError('Search.PageType not found, don\'t know how to process this page', expected=True)
-
- page_type = page_type_m.group('pagetype')
- if page_type == 'List': # List page, may contain list of 'item'-like objects
+ page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
+ if page_type_m is not None:
+ page_type = page_type_m.group('pagetype')
+ if page_type == 'Entry': # Any 'item'-like page, may contain downloadable content
+ return self._extract_entry_item(webpage, content_path)
+ elif page_type == 'Session': # Event session page, may contain downloadable content
+ return self._extract_session(webpage, content_path)
+ elif page_type == 'Event':
+ return self._extract_list(content_path)
+ else:
+ raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
+
+ else: # Assuming list
return self._extract_list(content_path)
- elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content
- return self._extract_entry_item(webpage, content_path)
- elif page_type == 'Session': # Event session page, may contain downloadable content
- return self._extract_session(webpage, content_path)
- else:
- raise ExtractorError('Unexpected Search.PageType %s' % page_type, expected=True)
\ No newline at end of file
diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py
index d064a28f9..31fe906b4 100644
--- a/youtube_dl/extractor/cinemassacre.py
+++ b/youtube_dl/extractor/cinemassacre.py
@@ -42,11 +42,12 @@ class CinemassacreIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d')
- mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)
+ mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?P<full_video_id>(?:Cinemassacre-)?(?P<video_id>.+?)))"', webpage)
if not mobj:
raise ExtractorError('Can\'t extract embed url and video id')
playerdata_url = mobj.group('embed_url')
video_id = mobj.group('video_id')
+ full_video_id = mobj.group('full_video_id')
video_title = self._html_search_regex(
r'<title>(?P<title>.+?)\|', webpage, 'title')
@@ -59,41 +60,53 @@ class CinemassacreIE(InfoExtractor):
vidurl = self._search_regex(
r'\'vidurl\'\s*:\s*"([^\']+)"', playerdata, 'vidurl').replace('\\/', '/')
- vidid = self._search_regex(
- r'\'vidid\'\s*:\s*"([^\']+)"', playerdata, 'vidid')
- videoserver = self._html_search_regex(
- r"'videoserver'\s*:\s*'([^']+)'", playerdata, 'videoserver')
- videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid)
- videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML')
+ videolist_url = None
- formats = []
- baseurl = vidurl[:vidurl.rfind('/')+1]
- for video in videolist.findall('.//video'):
- src = video.get('src')
- if not src:
- continue
- file_ = src.partition(':')[-1]
- width = int_or_none(video.get('width'))
- height = int_or_none(video.get('height'))
- bitrate = int_or_none(video.get('system-bitrate'))
- format = {
- 'url': baseurl + file_,
- 'format_id': src.rpartition('.')[0].rpartition('_')[-1],
- }
- if width or height:
- format.update({
- 'tbr': bitrate // 1000 if bitrate else None,
- 'width': width,
- 'height': height,
- })
- else:
- format.update({
- 'abr': bitrate // 1000 if bitrate else None,
- 'vcodec': 'none',
- })
- formats.append(format)
- self._sort_formats(formats)
+ mobj = re.search(r"'videoserver'\s*:\s*'(?P<videoserver>[^']+)'", playerdata)
+ if mobj:
+ videoserver = mobj.group('videoserver')
+ mobj = re.search(r'\'vidid\'\s*:\s*"(?P<vidid>[^\']+)"', playerdata)
+ vidid = mobj.group('vidid') if mobj else full_video_id
+ videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid)
+ else:
+ mobj = re.search(r"file\s*:\s*'(?P<smil>http.+?/jwplayer\.smil)'", playerdata)
+ if mobj:
+ videolist_url = mobj.group('smil')
+
+ if videolist_url:
+ videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML')
+ formats = []
+ baseurl = vidurl[:vidurl.rfind('/')+1]
+ for video in videolist.findall('.//video'):
+ src = video.get('src')
+ if not src:
+ continue
+ file_ = src.partition(':')[-1]
+ width = int_or_none(video.get('width'))
+ height = int_or_none(video.get('height'))
+ bitrate = int_or_none(video.get('system-bitrate'))
+ format = {
+ 'url': baseurl + file_,
+ 'format_id': src.rpartition('.')[0].rpartition('_')[-1],
+ }
+ if width or height:
+ format.update({
+ 'tbr': bitrate // 1000 if bitrate else None,
+ 'width': width,
+ 'height': height,
+ })
+ else:
+ format.update({
+ 'abr': bitrate // 1000 if bitrate else None,
+ 'vcodec': 'none',
+ })
+ formats.append(format)
+ self._sort_formats(formats)
+ else:
+ formats = [{
+ 'url': vidurl,
+ }]
return {
'id': video_id,
diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py
index 386f080d2..abf8cc280 100644
--- a/youtube_dl/extractor/cloudy.py
+++ b/youtube_dl/extractor/cloudy.py
@@ -4,14 +4,16 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
+from ..compat import (
compat_parse_qs,
compat_urllib_parse,
- remove_end,
- HEADRequest,
compat_HTTPError,
)
+from ..utils import (
+ ExtractorError,
+ HEADRequest,
+ remove_end,
+)
class CloudyIE(InfoExtractor):
diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py
index 78877b1cf..3826ce7e1 100644
--- a/youtube_dl/extractor/cnn.py
+++ b/youtube_dl/extractor/cnn.py
@@ -16,9 +16,10 @@ class CNNIE(InfoExtractor):
_TESTS = [{
'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
- 'file': 'sports_2013_06_09_nadal-1-on-1.cnn.mp4',
'md5': '3e6121ea48df7e2259fe73a0628605c4',
'info_dict': {
+ 'id': 'sports_2013_06_09_nadal-1-on-1.cnn',
+ 'ext': 'mp4',
'title': 'Nadal wins 8th French Open title',
'description': 'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.',
'duration': 135,
@@ -27,9 +28,10 @@ class CNNIE(InfoExtractor):
},
{
"url": "http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29",
- "file": "us_2013_08_21_sot-student-gives-epic-speech.georgia-institute-of-technology.mp4",
"md5": "b5cc60c60a3477d185af8f19a2a26f4e",
"info_dict": {
+ 'id': 'us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology',
+ 'ext': 'mp4',
"title": "Student's epic speech stuns new freshmen",
"description": "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"",
"upload_date": "20130821",
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 7e4113213..b77f0e519 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -12,13 +12,14 @@ import sys
import time
import xml.etree.ElementTree
-from ..utils import (
+from ..compat import (
compat_http_client,
compat_urllib_error,
compat_urllib_parse_urlparse,
compat_urlparse,
compat_str,
-
+)
+from ..utils import (
clean_html,
compiled_regex_type,
ExtractorError,
@@ -403,7 +404,7 @@ class InfoExtractor(object):
video_info['title'] = playlist_title
return video_info
- def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
+ def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
"""
Perform a regex search on the given string, using a single or a list of
patterns returning the first matching group.
@@ -424,8 +425,11 @@ class InfoExtractor(object):
_name = name
if mobj:
- # return the first matching group
- return next(g for g in mobj.groups() if g is not None)
+ if group is None:
+ # return the first matching group
+ return next(g for g in mobj.groups() if g is not None)
+ else:
+ return mobj.group(group)
elif default is not _NO_DEFAULT:
return default
elif fatal:
@@ -435,11 +439,11 @@ class InfoExtractor(object):
'please report this issue on http://yt-dl.org/bug' % _name)
return None
- def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
+ def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
"""
Like _search_regex, but strips HTML tags and unescapes entities.
"""
- res = self._search_regex(pattern, string, name, default, fatal, flags)
+ res = self._search_regex(pattern, string, name, default, fatal, flags, group)
if res:
return clean_html(res).strip()
else:
@@ -533,9 +537,9 @@ class InfoExtractor(object):
display_name = name
return self._html_search_regex(
r'''(?ix)<meta
- (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
- [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
- html, display_name, fatal=fatal, **kwargs)
+ (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
+ [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name),
+ html, display_name, fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
return self._html_search_meta('dc.creator', html, 'uploader')
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index cc612d08e..0bd0eccba 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -17,7 +17,6 @@ from ..utils import (
bytes_to_intlist,
intlist_to_bytes,
unified_strdate,
- clean_html,
urlencode_postdata,
)
from ..aes import (
diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py
index 5f24ac721..aefca848a 100644
--- a/youtube_dl/extractor/dropbox.py
+++ b/youtube_dl/extractor/dropbox.py
@@ -5,7 +5,8 @@ import os.path
import re
from .common import InfoExtractor
-from ..utils import compat_urllib_parse_unquote, url_basename
+from ..compat import compat_urllib_parse_unquote
+from ..utils import url_basename
class DropboxIE(InfoExtractor):
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index 3ad993751..104803563 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -5,12 +5,14 @@ import re
import socket
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_http_client,
compat_str,
compat_urllib_error,
compat_urllib_parse,
compat_urllib_request,
+)
+from ..utils import (
urlencode_postdata,
ExtractorError,
limit_length,
diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py
index 3d67b9d60..d570e3f6a 100644
--- a/youtube_dl/extractor/gamespot.py
+++ b/youtube_dl/extractor/gamespot.py
@@ -8,12 +8,11 @@ from ..utils import (
compat_urllib_parse,
compat_urlparse,
unescapeHTML,
- get_meta_content,
)
class GameSpotIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<page_id>\d+)/?'
+ _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?'
_TEST = {
'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/',
'md5': 'b2a30deaa8654fcccd43713a6b6a4825',
@@ -26,10 +25,10 @@ class GameSpotIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- page_id = mobj.group('page_id')
+ page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id)
- data_video_json = self._search_regex(r'data-video=["\'](.*?)["\']', webpage, 'data video')
+ data_video_json = self._search_regex(
+ r'data-video=["\'](.*?)["\']', webpage, 'data video')
data_video = json.loads(unescapeHTML(data_video_json))
# Transform the manifest url to a link to the mp4 files
@@ -41,7 +40,8 @@ class GameSpotIE(InfoExtractor):
http_path = f4m_path[1:].split('/', 1)[1]
http_template = re.sub(QUALITIES_RE, r'%s', http_path)
http_template = http_template.replace('.csmil/manifest.f4m', '')
- http_template = compat_urlparse.urljoin('http://video.gamespotcdn.com/', http_template)
+ http_template = compat_urlparse.urljoin(
+ 'http://video.gamespotcdn.com/', http_template)
formats = []
for q in qualities:
formats.append({
@@ -52,8 +52,9 @@ class GameSpotIE(InfoExtractor):
return {
'id': data_video['guid'],
+ 'display_id': page_id,
'title': compat_urllib_parse.unquote(data_video['title']),
'formats': formats,
- 'description': get_meta_content('description', webpage),
+ 'description': self._html_search_meta('description', webpage),
'thumbnail': self._og_search_thumbnail(webpage),
}
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 8abc340b4..01d6a57f8 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -7,11 +7,12 @@ import re
from .common import InfoExtractor
from .youtube import YoutubeIE
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urlparse,
compat_xml_parse_error,
-
+)
+from ..utils import (
determine_ext,
ExtractorError,
float_or_none,
@@ -99,6 +100,22 @@ class GenericIE(InfoExtractor):
'uploader': 'Championat',
},
},
+ {
+ # https://github.com/rg3/youtube-dl/issues/3541
+ 'add_ie': ['Brightcove'],
+ 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
+ 'info_dict': {
+ 'id': '3866516442001',
+ 'ext': 'mp4',
+ 'title': 'Leer mij vrouwen kennen: Aflevering 1',
+ 'description': 'Leer mij vrouwen kennen: Aflevering 1',
+ 'uploader': 'SBS Broadcasting',
+ },
+ 'skip': 'Restricted to Netherlands',
+ 'params': {
+ 'skip_download': True, # m3u8 download
+ },
+ },
# Direct link to a video
{
'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py
index 77c3ad4fc..66ca37918 100644
--- a/youtube_dl/extractor/globo.py
+++ b/youtube_dl/extractor/globo.py
@@ -5,13 +5,15 @@ import random
import math
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- float_or_none,
+from ..compat import (
compat_str,
compat_chr,
compat_ord,
)
+from ..utils import (
+ ExtractorError,
+ float_or_none,
+)
class GloboIE(InfoExtractor):
diff --git a/youtube_dl/extractor/goshgay.py b/youtube_dl/extractor/goshgay.py
index 7bca21ad0..18474cbb7 100644
--- a/youtube_dl/extractor/goshgay.py
+++ b/youtube_dl/extractor/goshgay.py
@@ -1,15 +1,11 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..utils import (
compat_urlparse,
- str_to_int,
ExtractorError,
)
-import json
class GoshgayIE(InfoExtractor):
@@ -27,36 +23,27 @@ class GoshgayIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = self._search_regex(r'class="video-title"><h1>(.+?)<', webpage, 'title')
+ title = self._og_search_title(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+ family_friendly = self._html_search_meta(
+ 'isFamilyFriendly', webpage, default='false')
+ config_url = self._search_regex(
+ r"'config'\s*:\s*'([^']+)'", webpage, 'config URL')
- player_config = self._search_regex(
- r'(?s)jwplayer\("player"\)\.setup\(({.+?})\)', webpage, 'config settings')
- player_vars = json.loads(player_config.replace("'", '"'))
- width = str_to_int(player_vars.get('width'))
- height = str_to_int(player_vars.get('height'))
- config_uri = player_vars.get('config')
+ config = self._download_xml(
+ config_url, video_id, 'Downloading player config XML')
- if config_uri is None:
- raise ExtractorError('Missing config URI')
- node = self._download_xml(config_uri, video_id, 'Downloading player config XML',
- errnote='Unable to download XML')
- if node is None:
+ if config is None:
raise ExtractorError('Missing config XML')
- if node.tag != 'config':
+ if config.tag != 'config':
raise ExtractorError('Missing config attribute')
- fns = node.findall('file')
- imgs = node.findall('image')
- if len(fns) != 1:
+ fns = config.findall('file')
+ if len(fns) < 1:
raise ExtractorError('Missing media URI')
video_url = fns[0].text
- if len(imgs) < 1:
- thumbnail = None
- else:
- thumbnail = imgs[0].text
url_comp = compat_urlparse.urlparse(url)
ref = "%s://%s%s" % (url_comp[0], url_comp[1], url_comp[2])
@@ -65,9 +52,7 @@ class GoshgayIE(InfoExtractor):
'id': video_id,
'url': video_url,
'title': title,
- 'width': width,
- 'height': height,
'thumbnail': thumbnail,
'http_referer': ref,
- 'age_limit': 18,
+ 'age_limit': 0 if family_friendly == 'true' else 18,
}
diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py
index d41c0413f..278d9f527 100644
--- a/youtube_dl/extractor/heise.py
+++ b/youtube_dl/extractor/heise.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
- get_meta_content,
+ determine_ext,
int_or_none,
parse_iso8601,
)
@@ -25,11 +25,11 @@ class HeiseIE(InfoExtractor):
'title': (
"Podcast: c't uplink 3.3 – Owncloud / Tastaturen / Peilsender Smartphone"
),
- 'format_id': 'mp4_720',
+ 'format_id': 'mp4_720p',
'timestamp': 1411812600,
'upload_date': '20140927',
'description': 'In uplink-Episode 3.3 geht es darum, wie man sich von Cloud-Anbietern emanzipieren kann, worauf man beim Kauf einer Tastatur achten sollte und was Smartphones über uns verraten.',
- 'thumbnail': 're:https?://.*\.jpg$',
+ 'thumbnail': 're:^https?://.*\.jpe?g$',
}
}
@@ -49,11 +49,12 @@ class HeiseIE(InfoExtractor):
info = {
'id': video_id,
'thumbnail': self._og_search_thumbnail(webpage),
- 'timestamp': parse_iso8601(get_meta_content('date', webpage)),
+ 'timestamp': parse_iso8601(
+ self._html_search_meta('date', webpage)),
'description': self._og_search_description(webpage),
}
- title = get_meta_content('fulltitle', webpage)
+ title = self._html_search_meta('fulltitle', webpage)
if title:
info['title'] = title
else:
@@ -64,9 +65,12 @@ class HeiseIE(InfoExtractor):
label = source_node.attrib['label']
height = int_or_none(self._search_regex(
r'^(.*?_)?([0-9]+)p$', label, 'height', default=None))
+ video_url = source_node.attrib['file']
+ ext = determine_ext(video_url, '')
formats.append({
- 'url': source_node.attrib['file'],
+ 'url': video_url,
'format_note': label,
+ 'format_id': '%s_%s' % (ext, label),
'height': height,
})
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
index 4536db3bf..6108ed552 100644
--- a/youtube_dl/extractor/imdb.py
+++ b/youtube_dl/extractor/imdb.py
@@ -6,7 +6,6 @@ import json
from .common import InfoExtractor
from ..utils import (
compat_urlparse,
- get_element_by_attribute,
)
@@ -27,10 +26,11 @@ class ImdbIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage('http://www.imdb.com/video/imdb/vi%s' % video_id, video_id)
- descr = get_element_by_attribute('itemprop', 'description', webpage)
+ descr = self._html_search_regex(
+ r'(?s)<span itemprop="description">(.*?)</span>',
+ webpage, 'description', fatal=False)
available_formats = re.findall(
r'case \'(?P<f_id>.*?)\' :$\s+url = \'(?P<path>.*?)\'', webpage,
flags=re.MULTILINE)
@@ -73,9 +73,7 @@ class ImdbListIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- list_id = mobj.group('id')
-
+ list_id = self._match_id(url)
webpage = self._download_webpage(url, list_id)
entries = [
self.url_result('http://www.imdb.com' + m, 'Imdb')
diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py
index 07ef682ee..d16d483ee 100644
--- a/youtube_dl/extractor/izlesene.py
+++ b/youtube_dl/extractor/izlesene.py
@@ -5,11 +5,11 @@ import re
from .common import InfoExtractor
from ..utils import (
- get_element_by_id,
- parse_iso8601,
determine_ext,
- int_or_none,
float_or_none,
+ get_element_by_id,
+ int_or_none,
+ parse_iso8601,
str_to_int,
)
@@ -30,7 +30,7 @@ class IzleseneIE(InfoExtractor):
'description': 'md5:253753e2655dde93f59f74b572454f6d',
'thumbnail': 're:^http://.*\.jpg',
'uploader_id': 'pelikzzle',
- 'timestamp': 1404298698,
+ 'timestamp': 1404302298,
'upload_date': '20140702',
'duration': 95.395,
'age_limit': 0,
@@ -46,7 +46,7 @@ class IzleseneIE(InfoExtractor):
'description': 'Tarkan Dortmund 2006 Konseri',
'thumbnail': 're:^http://.*\.jpg',
'uploader_id': 'parlayankiz',
- 'timestamp': 1163318593,
+ 'timestamp': 1163322193,
'upload_date': '20061112',
'duration': 253.666,
'age_limit': 0,
@@ -55,10 +55,9 @@ class IzleseneIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- url = 'http://www.izlesene.com/video/%s' % video_id
+ video_id = self._match_id(url)
+ url = 'http://www.izlesene.com/video/%s' % video_id
webpage = self._download_webpage(url, video_id)
title = self._og_search_title(webpage)
diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py
index 263f68773..102e29f7a 100644
--- a/youtube_dl/extractor/laola1tv.py
+++ b/youtube_dl/extractor/laola1tv.py
@@ -4,6 +4,7 @@ import random
import re
from .common import InfoExtractor
+from ..utils import ExtractorError
class Laola1TvIE(InfoExtractor):
diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py
index ccb5959c4..a89153985 100644
--- a/youtube_dl/extractor/myvideo.py
+++ b/youtube_dl/extractor/myvideo.py
@@ -7,11 +7,12 @@ import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_ord,
compat_urllib_parse,
compat_urllib_request,
-
+)
+from ..utils import (
ExtractorError,
)
diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py
index 62d5707fe..45cbd4ee9 100644
--- a/youtube_dl/extractor/niconico.py
+++ b/youtube_dl/extractor/niconico.py
@@ -12,6 +12,7 @@ from ..utils import (
unified_strdate,
parse_duration,
int_or_none,
+ ExtractorError,
)
@@ -108,6 +109,9 @@ class NiconicoIE(InfoExtractor):
flv_info_request, video_id,
note='Downloading flv info', errnote='Unable to download flv info')
+ if 'deleted=' in flv_info_webpage:
+ raise ExtractorError('The video has been deleted.',
+ expected=True)
video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0]
# Start extracting information
diff --git a/youtube_dl/extractor/played.py b/youtube_dl/extractor/played.py
index 645a1e06d..17880471d 100644
--- a/youtube_dl/extractor/played.py
+++ b/youtube_dl/extractor/played.py
@@ -6,6 +6,7 @@ import os.path
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
compat_urllib_parse,
compat_urllib_request,
)
@@ -29,6 +30,12 @@ class PlayedIE(InfoExtractor):
video_id = self._match_id(url)
orig_webpage = self._download_webpage(url, video_id)
+
+ m_error = re.search(
+ r'(?s)Reason for deletion:.*?<b class="err"[^>]*>(?P<msg>[^<]+)</b>', orig_webpage)
+ if m_error:
+ raise ExtractorError(m_error.group('msg'), expected=True)
+
fields = re.findall(
r'type="hidden" name="([^"]+)"\s+value="([^"]+)">', orig_webpage)
data = dict(fields)
diff --git a/youtube_dl/extractor/ro220.py b/youtube_dl/extractor/ro220.py
index 0a3a71448..962b524e9 100644
--- a/youtube_dl/extractor/ro220.py
+++ b/youtube_dl/extractor/ro220.py
@@ -1,7 +1,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import compat_urllib_parse_unquote
+from ..compat import compat_urllib_parse_unquote
class Ro220IE(InfoExtractor):
diff --git a/youtube_dl/extractor/trutube.py b/youtube_dl/extractor/trutube.py
index a73f3c43a..e7b79243a 100644
--- a/youtube_dl/extractor/trutube.py
+++ b/youtube_dl/extractor/trutube.py
@@ -29,7 +29,7 @@ class TruTubeIE(InfoExtractor):
# filehd is always 404
video_url = xpath_text(config, './file', 'video URL', fatal=True)
- title = xpath_text(config, './title', 'title')
+ title = xpath_text(config, './title', 'title').strip()
thumbnail = xpath_text(config, './image', ' thumbnail')
return {
diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py
index cee1ea8f6..875450908 100644
--- a/youtube_dl/extractor/ustream.py
+++ b/youtube_dl/extractor/ustream.py
@@ -5,7 +5,6 @@ import re
from .common import InfoExtractor
from ..utils import (
compat_urlparse,
- get_meta_content,
)
@@ -79,7 +78,7 @@ class UstreamChannelIE(InfoExtractor):
m = re.match(self._VALID_URL, url)
display_id = m.group('slug')
webpage = self._download_webpage(url, display_id)
- channel_id = get_meta_content('ustream:channel_id', webpage)
+ channel_id = self._html_search_meta('ustream:channel_id', webpage)
BASE = 'http://www.ustream.tv'
next_url = '/ajax/socialstream/videos/%s/1.json' % channel_id
diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py
new file mode 100644
index 000000000..f11ca8217
--- /dev/null
+++ b/youtube_dl/extractor/vice.py
@@ -0,0 +1,38 @@
+from __future__ import unicode_literals
+import re
+
+from .common import InfoExtractor
+from .ooyala import OoyalaIE
+from ..utils import ExtractorError
+
+
+class ViceIE(InfoExtractor):
+ _VALID_URL = r'http://www\.vice\.com/.*?/(?P<name>.+)'
+
+ _TEST = {
+ 'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1',
+ 'info_dict': {
+ 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
+ 'ext': 'mp4',
+ 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
+ },
+ 'params': {
+ # Requires ffmpeg (m3u8 manifest)
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ name = mobj.group('name')
+ webpage = self._download_webpage(url, name)
+ try:
+ embed_code = self._search_regex(
+ r'embedCode=([^&\'"]+)', webpage,
+ 'ooyala embed code')
+ ooyala_url = OoyalaIE._url_for_embed_code(embed_code)
+ print(ooyala_url)
+ except ExtractorError:
+ raise ExtractorError('The page doesn\'t contain a video', expected=True)
+ return self.url_result(ooyala_url, ie='Ooyala')
+
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index d9cad0ea5..c744d4f04 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -7,11 +7,13 @@ import itertools
from .common import InfoExtractor
from .subtitles import SubtitlesInfoExtractor
-from ..utils import (
+from ..compat import (
compat_HTTPError,
compat_urllib_parse,
compat_urllib_request,
compat_urlparse,
+)
+from ..utils import (
ExtractorError,
InAdvancePagedList,
int_or_none,
diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py
index 3377a543e..d6dec25ca 100644
--- a/youtube_dl/extractor/wimp.py
+++ b/youtube_dl/extractor/wimp.py
@@ -37,7 +37,7 @@ class WimpIE(InfoExtractor):
video_id = mobj.group(1)
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(
- r"'file'\s*:\s*'([^']+)'", webpage, 'video URL')
+ r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", webpage, 'video URL')
if YoutubeIE.suitable(video_url):
self.to_screen('Found YouTube video')
return {
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index aad8ffbf4..c77d4056f 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -684,7 +684,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# Get video info
self.report_video_info_webpage_download(video_id)
if re.search(r'player-age-gate-content">', video_webpage) is not None:
- self.report_age_confirmation()
age_gate = True
# We simulate the access to the video from www.youtube.com/v/{video_id}
# this can be viewed without login into Youtube
@@ -692,12 +691,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'video_id': video_id,
'eurl': 'https://youtube.googleapis.com/v/' + video_id,
'sts': self._search_regex(
- r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
+ r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
})
video_info_url = proto + '://www.youtube.com/get_video_info?' + data
- video_info_webpage = self._download_webpage(video_info_url, video_id,
- note=False,
- errnote='unable to download video info webpage')
+ video_info_webpage = self._download_webpage(
+ video_info_url, video_id,
+ note='Refetching age-gated info webpage',
+ errnote='unable to download video info webpage')
video_info = compat_parse_qs(video_info_webpage)
else:
age_gate = False