about | summary | refs | log | tree | commit | diff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- youtube_dl/extractor/__init__.py | 13
-rw-r--r-- youtube_dl/extractor/appletrailers.py | 3
-rw-r--r-- youtube_dl/extractor/bandcamp.py | 69
-rw-r--r-- youtube_dl/extractor/blinkx.py | 7
-rw-r--r-- youtube_dl/extractor/bliptv.py | 162
-rw-r--r-- youtube_dl/extractor/brightcove.py | 77
-rw-r--r-- youtube_dl/extractor/channel9.py | 20
-rw-r--r-- youtube_dl/extractor/cmt.py | 19
-rw-r--r-- youtube_dl/extractor/cnn.py | 71
-rw-r--r-- youtube_dl/extractor/collegehumor.py | 106
-rw-r--r-- youtube_dl/extractor/comedycentral.py | 4
-rw-r--r-- youtube_dl/extractor/common.py | 86
-rw-r--r-- youtube_dl/extractor/cspan.py | 65
-rw-r--r-- youtube_dl/extractor/dreisat.py | 15
-rw-r--r-- youtube_dl/extractor/generic.py | 147
-rw-r--r-- youtube_dl/extractor/imdb.py | 29
-rw-r--r-- youtube_dl/extractor/internetvideoarchive.py | 13
-rw-r--r-- youtube_dl/extractor/ivi.py | 18
-rw-r--r-- youtube_dl/extractor/jpopsukitv.py | 73
-rw-r--r-- youtube_dl/extractor/lynda.py | 142
-rw-r--r-- youtube_dl/extractor/macgamestore.py | 43
-rw-r--r-- youtube_dl/extractor/mdr.py | 3
-rw-r--r-- youtube_dl/extractor/mit.py | 17
-rw-r--r-- youtube_dl/extractor/mixcloud.py | 2
-rw-r--r-- youtube_dl/extractor/mtv.py | 2
-rw-r--r-- youtube_dl/extractor/myvideo.py | 6
-rw-r--r-- youtube_dl/extractor/orf.py | 120
-rw-r--r-- youtube_dl/extractor/pornhd.py | 2
-rw-r--r-- youtube_dl/extractor/smotri.py | 57
-rw-r--r-- youtube_dl/extractor/soundcloud.py | 4
-rw-r--r-- youtube_dl/extractor/spiegel.py | 3
-rw-r--r-- youtube_dl/extractor/theplatform.py | 14
-rw-r--r-- youtube_dl/extractor/veehd.py | 39
-rw-r--r-- youtube_dl/extractor/veoh.py | 46
-rw-r--r-- youtube_dl/extractor/vimeo.py | 146
-rw-r--r-- youtube_dl/extractor/wistia.py | 4
-rw-r--r-- youtube_dl/extractor/yahoo.py | 12
-rw-r--r-- youtube_dl/extractor/youporn.py | 38
-rw-r--r-- youtube_dl/extractor/youtube.py | 322
-rw-r--r-- youtube_dl/extractor/zdf.py | 36
40 files changed, 1218 insertions, 837 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index a39a1e2f4..f1167989e 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -28,6 +28,7 @@ from .channel9 import Channel9IE
from .cinemassacre import CinemassacreIE
from .clipfish import ClipfishIE
from .clipsyndicate import ClipsyndicateIE
+from .cmt import CMTIE
from .cnn import CNNIE
from .collegehumor import CollegeHumorIE
from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
@@ -79,7 +80,10 @@ from .hotnewhiphop import HotNewHipHopIE
from .howcast import HowcastIE
from .hypem import HypemIE
from .ign import IGNIE, OneUPIE
-from .imdb import ImdbIE
+from .imdb import (
+ ImdbIE,
+ ImdbListIE
+)
from .ina import InaIE
from .infoq import InfoQIE
from .instagram import InstagramIE
@@ -91,12 +95,18 @@ from .ivi import (
from .jeuxvideo import JeuxVideoIE
from .jukebox import JukeboxIE
from .justintv import JustinTVIE
+from .jpopsukitv import JpopsukiIE
from .kankan import KankanIE
from .keezmovies import KeezMoviesIE
from .kickstarter import KickStarterIE
from .keek import KeekIE
from .liveleak import LiveLeakIE
from .livestream import LivestreamIE, LivestreamOriginalIE
+from .lynda import (
+ LyndaIE,
+ LyndaCourseIE
+)
+from .macgamestore import MacGameStoreIE
from .mdr import MDRIE
from .metacafe import MetacafeIE
from .metacritic import MetacriticIE
@@ -189,6 +199,7 @@ from .vimeo import (
VimeoUserIE,
VimeoAlbumIE,
VimeoGroupsIE,
+ VimeoReviewIE,
)
from .vine import VineIE
from .viki import VikiIE
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
index ef5644aa5..e7361ae06 100644
--- a/youtube_dl/extractor/appletrailers.py
+++ b/youtube_dl/extractor/appletrailers.py
@@ -110,7 +110,8 @@ class AppleTrailersIE(InfoExtractor):
'width': format['width'],
'height': int(format['height']),
})
- formats = sorted(formats, key=lambda f: (f['height'], f['width']))
+
+ self._sort_formats(formats)
playlist.append({
'_type': 'video',
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index 3a32c14c5..15aee2786 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -10,14 +10,14 @@ from ..utils import (
class BandcampIE(InfoExtractor):
- IE_NAME = u'Bandcamp'
_VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
_TESTS = [{
u'url': u'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
u'file': u'1812978515.mp3',
- u'md5': u'cdeb30cdae1921719a3cbcab696ef53c',
+ u'md5': u'c557841d5e50261777a6585648adf439',
u'info_dict': {
- u"title": u"youtube-dl test song \"'/\\\u00e4\u21ad"
+ u"title": u"youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad",
+ u"duration": 10,
},
u'skip': u'There is a limit of 200 free downloads / month for the test song'
}]
@@ -30,29 +30,42 @@ class BandcampIE(InfoExtractor):
m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
if m_download is None:
m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)
- if m_trackinfo:
- json_code = m_trackinfo.group(1)
- data = json.loads(json_code)
+ if m_trackinfo:
+ json_code = m_trackinfo.group(1)
+ data = json.loads(json_code)
+ d = data[0]
+
+ duration = int(round(d['duration']))
+ formats = []
+ for format_id, format_url in d['file'].items():
+ ext, _, abr_str = format_id.partition('-')
+
+ formats.append({
+ 'format_id': format_id,
+ 'url': format_url,
+ 'ext': format_id.partition('-')[0],
+ 'vcodec': 'none',
+ 'acodec': format_id.partition('-')[0],
+ 'abr': int(format_id.partition('-')[2]),
+ })
+
+ self._sort_formats(formats)
- for d in data:
- formats = [{
- 'format_id': 'format_id',
- 'url': format_url,
- 'ext': format_id.partition('-')[0]
- } for format_id, format_url in sorted(d['file'].items())]
return {
'id': compat_str(d['id']),
'title': d['title'],
'formats': formats,
+ 'duration': duration,
}
- else:
- raise ExtractorError(u'No free songs found')
+ else:
+ raise ExtractorError(u'No free songs found')
download_link = m_download.group(1)
- id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
- webpage, re.MULTILINE|re.DOTALL).group('id')
+ video_id = re.search(
+ r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
+ webpage, re.MULTILINE | re.DOTALL).group('id')
- download_webpage = self._download_webpage(download_link, id,
+ download_webpage = self._download_webpage(download_link, video_id,
'Downloading free downloads page')
# We get the dictionary of the track from some javascrip code
info = re.search(r'items: (.*?),$',
@@ -66,21 +79,21 @@ class BandcampIE(InfoExtractor):
m_url = re.match(re_url, initial_url)
#We build the url we will use to get the final track url
# This url is build in Bandcamp in the script download_bunde_*.js
- request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
+ request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
# If we could correctly generate the .rand field the url would be
#in the "download_url" key
final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
- track_info = {'id':id,
- 'title' : info[u'title'],
- 'ext' : 'mp3',
- 'url' : final_url,
- 'thumbnail' : info[u'thumb_url'],
- 'uploader' : info[u'artist']
- }
-
- return [track_info]
+ return {
+ 'id': video_id,
+ 'title': info[u'title'],
+ 'ext': 'mp3',
+ 'vcodec': 'none',
+ 'url': final_url,
+ 'thumbnail': info[u'thumb_url'],
+ 'uploader': info[u'artist'],
+ }
class BandcampAlbumIE(InfoExtractor):
@@ -117,7 +130,7 @@ class BandcampAlbumIE(InfoExtractor):
webpage = self._download_webpage(url, title)
tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage)
if not tracks_paths:
- raise ExtractorError(u'The page doesn\'t contain any track')
+ raise ExtractorError(u'The page doesn\'t contain any tracks')
entries = [
self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key())
for t_path in tracks_paths]
diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py
index 144ce64cc..0229840a3 100644
--- a/youtube_dl/extractor/blinkx.py
+++ b/youtube_dl/extractor/blinkx.py
@@ -61,9 +61,10 @@ class BlinkxIE(InfoExtractor):
elif m['type'] in ('flv', 'mp4'):
vcodec = remove_start(m['vcodec'], 'ff')
acodec = remove_start(m['acodec'], 'ff')
+ tbr = (int(m['vbr']) + int(m['abr'])) // 1000
format_id = (u'%s-%sk-%s' %
(vcodec,
- (int(m['vbr']) + int(m['abr'])) // 1000,
+ tbr,
m['w']))
formats.append({
'format_id': format_id,
@@ -72,10 +73,12 @@ class BlinkxIE(InfoExtractor):
'acodec': acodec,
'abr': int(m['abr']) // 1000,
'vbr': int(m['vbr']) // 1000,
+ 'tbr': tbr,
'width': int(m['w']),
'height': int(m['h']),
})
- formats.sort(key=lambda f: (f['width'], f['vbr'], f['abr']))
+
+ self._sort_formats(formats)
return {
'id': display_id,
diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py
index 5e33a69df..3ce9b5324 100644
--- a/youtube_dl/extractor/bliptv.py
+++ b/youtube_dl/extractor/bliptv.py
@@ -1,16 +1,15 @@
+from __future__ import unicode_literals
+
import datetime
import json
-import os
import re
import socket
from .common import InfoExtractor
from ..utils import (
compat_http_client,
- compat_parse_qs,
compat_str,
compat_urllib_error,
- compat_urllib_parse_urlparse,
compat_urllib_request,
ExtractorError,
@@ -22,42 +21,35 @@ class BlipTVIE(InfoExtractor):
"""Information extractor for blip.tv"""
_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
- _URL_EXT = r'^.*\.([a-z0-9]+)$'
- IE_NAME = u'blip.tv'
+
_TEST = {
- u'url': u'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352',
- u'file': u'5779306.m4v',
- u'md5': u'80baf1ec5c3d2019037c1c707d676b9f',
- u'info_dict': {
- u"upload_date": u"20111205",
- u"description": u"md5:9bc31f227219cde65e47eeec8d2dc596",
- u"uploader": u"Comic Book Resources - CBR TV",
- u"title": u"CBR EXCLUSIVE: \"Gotham City Imposters\" Bats VS Jokerz Short 3"
+ 'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352',
+ 'file': '5779306.mov',
+ 'md5': 'c6934ad0b6acf2bd920720ec888eb812',
+ 'info_dict': {
+ 'upload_date': '20111205',
+ 'description': 'md5:9bc31f227219cde65e47eeec8d2dc596',
+ 'uploader': 'Comic Book Resources - CBR TV',
+ 'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3',
}
}
def report_direct_download(self, title):
"""Report information extraction."""
- self.to_screen(u'%s: Direct download detected' % title)
+ self.to_screen('%s: Direct download detected' % title)
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
+ raise ExtractorError('Invalid URL: %s' % url)
# See https://github.com/rg3/youtube-dl/issues/857
- api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
- if api_mobj is not None:
- url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
- urlp = compat_urllib_parse_urlparse(url)
- if urlp.path.startswith('/play/'):
- response = self._request_webpage(url, None, False)
- redirecturl = response.geturl()
- rurlp = compat_urllib_parse_urlparse(redirecturl)
- file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
- url = 'http://blip.tv/a/a-' + file_id
- return self._real_extract(url)
-
+ embed_mobj = re.search(r'^(?:https?://)?(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)([a-zA-Z0-9]+)', url)
+ if embed_mobj:
+ info_url = 'http://blip.tv/play/%s.x?p=1' % embed_mobj.group(1)
+ info_page = self._download_webpage(info_url, embed_mobj.group(1))
+ video_id = self._search_regex(r'data-episode-id="(\d+)', info_page, 'video_id')
+ return self.url_result('http://blip.tv/a/a-' + video_id, 'BlipTV')
if '?' in url:
cchar = '&'
@@ -67,67 +59,55 @@ class BlipTVIE(InfoExtractor):
request = compat_urllib_request.Request(json_url)
request.add_header('User-Agent', 'iTunes/10.6.1')
self.report_extraction(mobj.group(1))
- info = None
urlh = self._request_webpage(request, None, False,
- u'unable to download video info webpage')
- if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
- basename = url.split('/')[-1]
- title,ext = os.path.splitext(basename)
- title = title.decode('UTF-8')
- ext = ext.replace('.', '')
- self.report_direct_download(title)
- info = {
- 'id': title,
- 'url': url,
- 'uploader': None,
- 'upload_date': None,
- 'title': title,
- 'ext': ext,
- 'urlhandle': urlh
+ 'unable to download video info webpage')
+
+ try:
+ json_code_bytes = urlh.read()
+ json_code = json_code_bytes.decode('utf-8')
+ except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+ raise ExtractorError('Unable to read video info webpage: %s' % compat_str(err))
+
+ try:
+ json_data = json.loads(json_code)
+ if 'Post' in json_data:
+ data = json_data['Post']
+ else:
+ data = json_data
+
+ upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
+ formats = []
+ if 'additionalMedia' in data:
+ for f in sorted(data['additionalMedia'], key=lambda f: int(f['media_height'])):
+ if not int(f['media_width']): # filter m3u8
+ continue
+ formats.append({
+ 'url': f['url'],
+ 'format_id': f['role'],
+ 'width': int(f['media_width']),
+ 'height': int(f['media_height']),
+ })
+ else:
+ formats.append({
+ 'url': data['media']['url'],
+ 'width': int(data['media']['width']),
+ 'height': int(data['media']['height']),
+ })
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': compat_str(data['item_id']),
+ 'uploader': data['display_name'],
+ 'upload_date': upload_date,
+ 'title': data['title'],
+ 'thumbnail': data['thumbnailUrl'],
+ 'description': data['description'],
+ 'user_agent': 'iTunes/10.6.1',
+ 'formats': formats,
}
- if info is None: # Regular URL
- try:
- json_code_bytes = urlh.read()
- json_code = json_code_bytes.decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
-
- try:
- json_data = json.loads(json_code)
- if 'Post' in json_data:
- data = json_data['Post']
- else:
- data = json_data
-
- upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
- if 'additionalMedia' in data:
- formats = sorted(data['additionalMedia'], key=lambda f: int(f['media_height']))
- best_format = formats[-1]
- video_url = best_format['url']
- else:
- video_url = data['media']['url']
- umobj = re.match(self._URL_EXT, video_url)
- if umobj is None:
- raise ValueError('Can not determine filename extension')
- ext = umobj.group(1)
-
- info = {
- 'id': compat_str(data['item_id']),
- 'url': video_url,
- 'uploader': data['display_name'],
- 'upload_date': upload_date,
- 'title': data['title'],
- 'ext': ext,
- 'format': data['media']['mimeType'],
- 'thumbnail': data['thumbnailUrl'],
- 'description': data['description'],
- 'player_url': data['embedUrl'],
- 'user_agent': 'iTunes/10.6.1',
- }
- except (ValueError,KeyError) as err:
- raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
-
- return [info]
+ except (ValueError, KeyError) as err:
+ raise ExtractorError('Unable to parse video information: %s' % repr(err))
class BlipTVUserIE(InfoExtractor):
@@ -135,19 +115,19 @@ class BlipTVUserIE(InfoExtractor):
_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
_PAGE_SIZE = 12
- IE_NAME = u'blip.tv:user'
+ IE_NAME = 'blip.tv:user'
def _real_extract(self, url):
# Extract username
mobj = re.match(self._VALID_URL, url)
if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
+ raise ExtractorError('Invalid URL: %s' % url)
username = mobj.group(1)
page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
- page = self._download_webpage(url, username, u'Downloading user page')
+ page = self._download_webpage(url, username, 'Downloading user page')
mobj = re.search(r'data-users-id="([^"]+)"', page)
page_base = page_base % mobj.group(1)
@@ -163,7 +143,7 @@ class BlipTVUserIE(InfoExtractor):
while True:
url = page_base + "&page=" + str(pagenum)
page = self._download_webpage(url, username,
- u'Downloading video ids from page %d' % pagenum)
+ 'Downloading video ids from page %d' % pagenum)
# Extract video identifiers
ids_in_page = []
@@ -185,6 +165,6 @@ class BlipTVUserIE(InfoExtractor):
pagenum += 1
- urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
+ urls = ['http://blip.tv/%s' % video_id for video_id in video_ids]
url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls]
return [self.playlist_result(url_entries, playlist_title = username)]
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index f7f0041c0..4ba3f7c42 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -1,4 +1,5 @@
# encoding: utf-8
+from __future__ import unicode_literals
import re
import json
@@ -13,6 +14,7 @@ from ..utils import (
compat_urllib_request,
ExtractorError,
+ unsmuggle_url,
)
@@ -24,47 +26,47 @@ class BrightcoveIE(InfoExtractor):
_TESTS = [
{
# From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/
- u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001',
- u'file': u'2371591881001.mp4',
- u'md5': u'5423e113865d26e40624dce2e4b45d95',
- u'note': u'Test Brightcove downloads and detection in GenericIE',
- u'info_dict': {
- u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
- u'uploader': u'8TV',
- u'description': u'md5:a950cc4285c43e44d763d036710cd9cd',
+ 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001',
+ 'file': '2371591881001.mp4',
+ 'md5': '5423e113865d26e40624dce2e4b45d95',
+ 'note': 'Test Brightcove downloads and detection in GenericIE',
+ 'info_dict': {
+ 'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
+ 'uploader': '8TV',
+ 'description': 'md5:a950cc4285c43e44d763d036710cd9cd',
}
},
{
# From http://medianetwork.oracle.com/video/player/1785452137001
- u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001',
- u'file': u'1785452137001.flv',
- u'info_dict': {
- u'title': u'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges',
- u'description': u'John Rose speaks at the JVM Language Summit, August 1, 2012.',
- u'uploader': u'Oracle',
+ 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001',
+ 'file': '1785452137001.flv',
+ 'info_dict': {
+ 'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges',
+ 'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.',
+ 'uploader': 'Oracle',
},
},
{
# From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/
- u'url': u'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001',
- u'info_dict': {
- u'id': u'2750934548001',
- u'ext': u'mp4',
- u'title': u'This Bracelet Acts as a Personal Thermostat',
- u'description': u'md5:547b78c64f4112766ccf4e151c20b6a0',
- u'uploader': u'Mashable',
+ 'url': 'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001',
+ 'info_dict': {
+ 'id': '2750934548001',
+ 'ext': 'mp4',
+ 'title': 'This Bracelet Acts as a Personal Thermostat',
+ 'description': 'md5:547b78c64f4112766ccf4e151c20b6a0',
+ 'uploader': 'Mashable',
},
},
{
# test that the default referer works
# from http://national.ballet.ca/interact/video/Lost_in_Motion_II/
- u'url': u'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001',
- u'info_dict': {
- u'id': u'2878862109001',
- u'ext': u'mp4',
- u'title': u'Lost in Motion II',
- u'description': u'md5:363109c02998fee92ec02211bd8000df',
- u'uploader': u'National Ballet of Canada',
+ 'url': 'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001',
+ 'info_dict': {
+ 'id': '2878862109001',
+ 'ext': 'mp4',
+ 'title': 'Lost in Motion II',
+ 'description': 'md5:363109c02998fee92ec02211bd8000df',
+ 'uploader': 'National Ballet of Canada',
},
},
]
@@ -80,10 +82,10 @@ class BrightcoveIE(InfoExtractor):
object_str = re.sub(r'(<param name="[^"]+" value="[^"]+")>',
lambda m: m.group(1) + '/>', object_str)
# Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608
- object_str = object_str.replace(u'<--', u'<!--')
+ object_str = object_str.replace('<--', '<!--')
object_doc = xml.etree.ElementTree.fromstring(object_str)
- assert u'BrightcoveExperience' in object_doc.attrib['class']
+ assert 'BrightcoveExperience' in object_doc.attrib['class']
params = {'flashID': object_doc.attrib['id'],
'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'],
}
@@ -120,6 +122,8 @@ class BrightcoveIE(InfoExtractor):
return None
def _real_extract(self, url):
+ url, smuggled_data = unsmuggle_url(url, {})
+
# Change the 'videoId' and others field to '@videoPlayer'
url = re.sub(r'(?<=[?&])(videoI(d|D)|bctid)', '%40videoPlayer', url)
# Change bckey (used by bcove.me urls) to playerKey
@@ -130,9 +134,10 @@ class BrightcoveIE(InfoExtractor):
videoPlayer = query.get('@videoPlayer')
if videoPlayer:
- return self._get_video_info(videoPlayer[0], query_str, query,
- # We set the original url as the default 'Referer' header
- referer=url)
+ # We set the original url as the default 'Referer' header
+ referer = smuggled_data.get('Referer', url)
+ return self._get_video_info(
+ videoPlayer[0], query_str, query, referer=referer)
else:
player_key = query['playerKey']
return self._get_playlist_info(player_key[0])
@@ -156,11 +161,11 @@ class BrightcoveIE(InfoExtractor):
def _get_playlist_info(self, player_key):
playlist_info = self._download_webpage(self._PLAYLIST_URL_TEMPLATE % player_key,
- player_key, u'Downloading playlist information')
+ player_key, 'Downloading playlist information')
json_data = json.loads(playlist_info)
if 'videoList' not in json_data:
- raise ExtractorError(u'Empty playlist')
+ raise ExtractorError('Empty playlist')
playlist_info = json_data['videoList']
videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']]
@@ -189,5 +194,5 @@ class BrightcoveIE(InfoExtractor):
'url': video_info['FLVFullLengthURL'],
})
else:
- raise ExtractorError(u'Unable to extract video url for %s' % info['id'])
+ raise ExtractorError('Unable to extract video url for %s' % info['id'])
return info
diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py
index ae70ea229..574881b70 100644
--- a/youtube_dl/extractor/channel9.py
+++ b/youtube_dl/extractor/channel9.py
@@ -76,14 +76,18 @@ class Channel9IE(InfoExtractor):
</div>)? # File size part may be missing
'''
# Extract known formats
- formats = [{'url': x.group('url'),
- 'format_id': x.group('quality'),
- 'format_note': x.group('note'),
- 'format': '%s (%s)' % (x.group('quality'), x.group('note')),
- 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
- } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
- # Sort according to known formats list
- formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id']))
+ formats = [{
+ 'url': x.group('url'),
+ 'format_id': x.group('quality'),
+ 'format_note': x.group('note'),
+ 'format': u'%s (%s)' % (x.group('quality'), x.group('note')),
+ 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
+ 'preference': self._known_formats.index(x.group('quality')),
+ 'vcodec': 'none' if x.group('note') == 'Audio only' else None,
+ } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
+
+ self._sort_formats(formats)
+
return formats
def _extract_title(self, html):
diff --git a/youtube_dl/extractor/cmt.py b/youtube_dl/extractor/cmt.py
new file mode 100644
index 000000000..88e0e9aba
--- /dev/null
+++ b/youtube_dl/extractor/cmt.py
@@ -0,0 +1,19 @@
+from .mtv import MTVIE
+
+class CMTIE(MTVIE):
+ IE_NAME = u'cmt.com'
+ _VALID_URL = r'https?://www\.cmt\.com/videos/.+?/(?P<videoid>[^/]+)\.jhtml'
+ _FEED_URL = 'http://www.cmt.com/sitewide/apps/player/embed/rss/'
+
+ _TESTS = [
+ {
+ u'url': u'http://www.cmt.com/videos/garth-brooks/989124/the-call-featuring-trisha-yearwood.jhtml#artist=30061',
+ u'md5': u'e6b7ef3c4c45bbfae88061799bbba6c2',
+ u'info_dict': {
+ u'id': u'989124',
+ u'ext': u'mp4',
+ u'title': u'Garth Brooks - "The Call (featuring Trisha Yearwood)"',
+ u'description': u'Blame It All On My Roots',
+ },
+ },
+ ]
diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py
index a034bb2fb..ecac5e0e9 100644
--- a/youtube_dl/extractor/cnn.py
+++ b/youtube_dl/extractor/cnn.py
@@ -1,7 +1,10 @@
import re
from .common import InfoExtractor
-from ..utils import determine_ext
+from ..utils import (
+ int_or_none,
+ parse_duration,
+)
class CNNIE(InfoExtractor):
@@ -15,6 +18,8 @@ class CNNIE(InfoExtractor):
u'info_dict': {
u'title': u'Nadal wins 8th French Open title',
u'description': u'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.',
+ u'duration': 135,
+ u'upload_date': u'20130609',
},
},
{
@@ -35,22 +40,58 @@ class CNNIE(InfoExtractor):
info = self._download_xml(info_url, page_title)
formats = []
+ rex = re.compile(r'''(?x)
+ (?P<width>[0-9]+)x(?P<height>[0-9]+)
+ (?:_(?P<bitrate>[0-9]+)k)?
+ ''')
for f in info.findall('files/file'):
- mf = re.match(r'(\d+)x(\d+)(?:_(.*)k)?',f.attrib['bitrate'])
- if mf is not None:
- formats.append((int(mf.group(1)), int(mf.group(2)), int(mf.group(3) or 0), f.text))
- formats = sorted(formats)
- (_,_,_, video_path) = formats[-1]
- video_url = 'http://ht.cdn.turner.com/cnn/big%s' % video_path
+ video_url = 'http://ht.cdn.turner.com/cnn/big%s' % (f.text.strip())
+ fdct = {
+ 'format_id': f.attrib['bitrate'],
+ 'url': video_url,
+ }
+
+ mf = rex.match(f.attrib['bitrate'])
+ if mf:
+ fdct['width'] = int(mf.group('width'))
+ fdct['height'] = int(mf.group('height'))
+ fdct['tbr'] = int_or_none(mf.group('bitrate'))
+ else:
+ mf = rex.search(f.text)
+ if mf:
+ fdct['width'] = int(mf.group('width'))
+ fdct['height'] = int(mf.group('height'))
+ fdct['tbr'] = int_or_none(mf.group('bitrate'))
+ else:
+ mi = re.match(r'ios_(audio|[0-9]+)$', f.attrib['bitrate'])
+ if mi:
+ if mi.group(1) == 'audio':
+ fdct['vcodec'] = 'none'
+ fdct['ext'] = 'm4a'
+ else:
+ fdct['tbr'] = int(mi.group(1))
+
+ formats.append(fdct)
+
+ self._sort_formats(formats)
thumbnails = sorted([((int(t.attrib['height']),int(t.attrib['width'])), t.text) for t in info.findall('images/image')])
thumbs_dict = [{'resolution': res, 'url': t_url} for (res, t_url) in thumbnails]
- return {'id': info.attrib['id'],
- 'title': info.find('headline').text,
- 'url': video_url,
- 'ext': determine_ext(video_url),
- 'thumbnail': thumbnails[-1][1],
- 'thumbnails': thumbs_dict,
- 'description': info.find('description').text,
- }
+ metas_el = info.find('metas')
+ upload_date = (
+ metas_el.attrib.get('version') if metas_el is not None else None)
+
+ duration_el = info.find('length')
+ duration = parse_duration(duration_el.text)
+
+ return {
+ 'id': info.attrib['id'],
+ 'title': info.find('headline').text,
+ 'formats': formats,
+ 'thumbnail': thumbnails[-1][1],
+ 'thumbnails': thumbs_dict,
+ 'description': info.find('description').text,
+ 'duration': duration,
+ 'upload_date': upload_date,
+ }
diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py
index b27c1dfc5..d10b7bd0c 100644
--- a/youtube_dl/extractor/collegehumor.py
+++ b/youtube_dl/extractor/collegehumor.py
@@ -1,82 +1,68 @@
+from __future__ import unicode_literals
+
+import json
import re
from .common import InfoExtractor
-from ..utils import (
- compat_urllib_parse_urlparse,
- determine_ext,
-
- ExtractorError,
-)
class CollegeHumorIE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P<videoid>[0-9]+)/?(?P<shorttitle>.*)$'
_TESTS = [{
- u'url': u'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe',
- u'file': u'6902724.mp4',
- u'md5': u'1264c12ad95dca142a9f0bf7968105a0',
- u'info_dict': {
- u'title': u'Comic-Con Cosplay Catastrophe',
- u'description': u'Fans get creative this year at San Diego. Too creative. And yes, that\'s really Joss Whedon.',
+ 'url': 'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe',
+ 'file': '6902724.mp4',
+ 'md5': 'dcc0f5c1c8be98dc33889a191f4c26bd',
+ 'info_dict': {
+ 'title': 'Comic-Con Cosplay Catastrophe',
+ 'description': 'Fans get creative this year at San Diego. Too',
+ 'age_limit': 13,
},
},
{
- u'url': u'http://www.collegehumor.com/video/3505939/font-conference',
- u'file': u'3505939.mp4',
- u'md5': u'c51ca16b82bb456a4397987791a835f5',
- u'info_dict': {
- u'title': u'Font Conference',
- u'description': u'This video wasn\'t long enough, so we made it double-spaced.',
+ 'url': 'http://www.collegehumor.com/video/3505939/font-conference',
+ 'file': '3505939.mp4',
+ 'md5': '72fa701d8ef38664a4dbb9e2ab721816',
+ 'info_dict': {
+ 'title': 'Font Conference',
+ 'description': 'This video wasn\'t long enough, so we made it double-spaced.',
+ 'age_limit': 10,
},
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group('videoid')
- info = {
- 'id': video_id,
- 'uploader': None,
- 'upload_date': None,
- }
-
- self.report_extraction(video_id)
- xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
- mdoc = self._download_xml(xmlUrl, video_id,
- u'Downloading info XML',
- u'Unable to download video info XML')
+ jsonUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id + '.json'
+ data = json.loads(self._download_webpage(
+ jsonUrl, video_id, 'Downloading info JSON'))
+ vdata = data['video']
- try:
- videoNode = mdoc.findall('./video')[0]
- youtubeIdNode = videoNode.find('./youtubeID')
- if youtubeIdNode is not None:
- return self.url_result(youtubeIdNode.text, 'Youtube')
- info['description'] = videoNode.findall('./description')[0].text
- info['title'] = videoNode.findall('./caption')[0].text
- info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
- next_url = videoNode.findall('./file')[0].text
- except IndexError:
- raise ExtractorError(u'Invalid metadata XML file')
-
- if next_url.endswith(u'manifest.f4m'):
- manifest_url = next_url + '?hdcore=2.10.3'
- adoc = self._download_xml(manifest_url, video_id,
- u'Downloading XML manifest',
- u'Unable to download video info XML')
-
- try:
- video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
- except IndexError:
- raise ExtractorError(u'Invalid manifest file')
- url_pr = compat_urllib_parse_urlparse(info['thumbnail'])
- info['url'] = url_pr.scheme + '://' + url_pr.netloc + video_id[:-2].replace('.csmil','').replace(',','')
- info['ext'] = 'mp4'
+ AGE_LIMITS = {'nc17': 18, 'r': 18, 'pg13': 13, 'pg': 10, 'g': 0}
+ rating = vdata.get('rating')
+ if rating:
+ age_limit = AGE_LIMITS.get(rating.lower())
else:
- # Old-style direct links
- info['url'] = next_url
- info['ext'] = determine_ext(info['url'])
+ age_limit = None # None = No idea
+
+ PREFS = {'high_quality': 2, 'low_quality': 0}
+ formats = []
+ for format_key in ('mp4', 'webm'):
+ for qname, qurl in vdata[format_key].items():
+ formats.append({
+ 'format_id': format_key + '_' + qname,
+ 'url': qurl,
+ 'format': format_key,
+ 'preference': PREFS.get(qname),
+ })
+ self._sort_formats(formats)
- return info
+ return {
+ 'id': video_id,
+ 'title': vdata['title'],
+ 'description': vdata.get('description'),
+ 'thumbnail': vdata.get('thumbnail'),
+ 'formats': formats,
+ 'age_limit': age_limit,
+ }
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py
index a54ce3ee7..27bd8256e 100644
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -12,7 +12,9 @@ from ..utils import (
class ComedyCentralIE(MTVServicesInfoExtractor):
- _VALID_URL = r'https?://(?:www.)?comedycentral.com/(video-clips|episodes|cc-studios)/(?P<title>.*)'
+ _VALID_URL = r'''(?x)https?://(?:www.)?comedycentral.com/
+ (video-clips|episodes|cc-studios|video-collections)
+ /(?P<title>.*)'''
_FEED_URL = u'http://comedycentral.com/feeds/mrss/'
_TEST = {
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index ba46a7bc7..2a5e8076c 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -9,6 +9,7 @@ import xml.etree.ElementTree
from ..utils import (
compat_http_client,
compat_urllib_error,
+ compat_urllib_parse_urlparse,
compat_str,
clean_html,
@@ -37,10 +38,12 @@ class InfoExtractor(object):
id: Video identifier.
title: Video title, unescaped.
- Additionally, it must contain either a formats entry or url and ext:
+ Additionally, it must contain either a formats entry or a url one:
- formats: A list of dictionaries for each format available, it must
- be ordered from worst to best quality. Potential fields:
+ formats: A list of dictionaries for each format available, ordered
+ from worst to best quality.
+
+ Potential fields:
* url Mandatory. The URL of the video file
* ext Will be calculated from url if missing
* format A human-readable description of the format
@@ -48,23 +51,36 @@ class InfoExtractor(object):
Calculated from the format_id, width, height.
and format_note fields if missing.
* format_id A short description of the format
- ("mp4_h264_opus" or "19")
+ ("mp4_h264_opus" or "19").
+ Technically optional, but strongly recommended.
* format_note Additional info about the format
("3D" or "DASH video")
* width Width of the video, if known
* height Height of the video, if known
+ * resolution Textual description of width and height
+ * tbr Average bitrate of audio and video in KBit/s
* abr Average audio bitrate in KBit/s
* acodec Name of the audio codec in use
* vbr Average video bitrate in KBit/s
* vcodec Name of the video codec in use
* filesize The number of bytes, if known in advance
* player_url SWF Player URL (used for rtmpdump).
+ * protocol The protocol that will be used for the actual
+ download, lower-case.
+ "http", "https", "rtsp", "rtmp" or so.
+ * preference Order number of this format. If this field is
+ present and not None, the formats get sorted
+ by this field.
+ -1 for default (order by other properties),
+ -2 or smaller for less than default.
+ * quality Order number of the video quality of this
+ format, irrespective of the file format.
+ -1 for default (order by other properties),
+ -2 or smaller for less than default.
url: Final video URL.
ext: Video filename extension.
format: The video format, defaults to ext (used for --get-format)
player_url: SWF Player URL (used for rtmpdump).
- urlhandle: [internal] The urlHandle to be used to download the file,
- like returned by urllib.request.urlopen
The following fields are optional:
@@ -244,6 +260,11 @@ class InfoExtractor(object):
xml_string = transform_source(xml_string)
return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
+ def report_warning(self, msg, video_id=None):
+ idstr = u'' if video_id is None else u'%s: ' % video_id
+ self._downloader.report_warning(
+ u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
+
def to_screen(self, msg):
"""Print msg to screen, prefixing it with '[ie_name]'"""
self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
@@ -361,7 +382,7 @@ class InfoExtractor(object):
@staticmethod
def _og_regexes(prop):
content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
- property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
+ property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
template = r'<meta[^>]+?%s[^>]+?%s'
return [
template % (property_re, content_re),
@@ -426,6 +447,57 @@ class InfoExtractor(object):
}
return RATING_TABLE.get(rating.lower(), None)
+ def _sort_formats(self, formats):
+ def _formats_key(f):
+ # TODO remove the following workaround
+ from ..utils import determine_ext
+ if not f.get('ext') and 'url' in f:
+ f['ext'] = determine_ext(f['url'])
+
+ preference = f.get('preference')
+ if preference is None:
+ proto = f.get('protocol')
+ if proto is None:
+ proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
+
+ preference = 0 if proto in ['http', 'https'] else -0.1
+ if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
+ preference -= 0.5
+
+ if f.get('vcodec') == 'none': # audio only
+ if self._downloader.params.get('prefer_free_formats'):
+ ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
+ else:
+ ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
+ ext_preference = 0
+ try:
+ audio_ext_preference = ORDER.index(f['ext'])
+ except ValueError:
+ audio_ext_preference = -1
+ else:
+ if self._downloader.params.get('prefer_free_formats'):
+ ORDER = [u'flv', u'mp4', u'webm']
+ else:
+ ORDER = [u'webm', u'flv', u'mp4']
+ try:
+ ext_preference = ORDER.index(f['ext'])
+ except ValueError:
+ ext_preference = -1
+ audio_ext_preference = 0
+
+ return (
+ preference,
+ f.get('quality') if f.get('quality') is not None else -1,
+ f.get('height') if f.get('height') is not None else -1,
+ f.get('width') if f.get('width') is not None else -1,
+ ext_preference,
+ f.get('vbr') if f.get('vbr') is not None else -1,
+ f.get('abr') if f.get('abr') is not None else -1,
+ audio_ext_preference,
+ f.get('filesize') if f.get('filesize') is not None else -1,
+ f.get('format_id'),
+ )
+ formats.sort(key=_formats_key)
class SearchInfoExtractor(InfoExtractor):
diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py
index d5730684d..a2cbd4d8d 100644
--- a/youtube_dl/extractor/cspan.py
+++ b/youtube_dl/extractor/cspan.py
@@ -1,20 +1,25 @@
+from __future__ import unicode_literals
+
+import json
import re
from .common import InfoExtractor
from ..utils import (
- compat_urllib_parse,
+ unescapeHTML,
)
+
class CSpanIE(InfoExtractor):
_VALID_URL = r'http://www\.c-spanvideo\.org/program/(.*)'
+ IE_DESC = 'C-SPAN'
_TEST = {
- u'url': u'http://www.c-spanvideo.org/program/HolderonV',
- u'file': u'315139.flv',
- u'md5': u'74a623266956f69e4df0068ab6c80fe4',
- u'info_dict': {
- u"title": u"Attorney General Eric Holder on Voting Rights Act Decision"
+ 'url': 'http://www.c-spanvideo.org/program/HolderonV',
+ 'file': '315139.mp4',
+ 'md5': '8e44ce11f0f725527daccc453f553eb0',
+ 'info_dict': {
+ 'title': 'Attorney General Eric Holder on Voting Rights Act Decision',
+ 'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in [Shelby County v. Holder] in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.',
},
- u'skip': u'Requires rtmpdump'
}
def _real_extract(self, url):
@@ -22,30 +27,22 @@ class CSpanIE(InfoExtractor):
prog_name = mobj.group(1)
webpage = self._download_webpage(url, prog_name)
video_id = self._search_regex(r'programid=(.*?)&', webpage, 'video id')
- data = compat_urllib_parse.urlencode({'programid': video_id,
- 'dynamic':'1'})
- info_url = 'http://www.c-spanvideo.org/common/services/flashXml.php?' + data
- video_info = self._download_webpage(info_url, video_id, u'Downloading video info')
-
- self.report_extraction(video_id)
-
- title = self._html_search_regex(r'<string name="title">(.*?)</string>',
- video_info, 'title')
- description = self._html_search_regex(r'<meta (?:property="og:|name=")description" content="(.*?)"',
- webpage, 'description',
- flags=re.MULTILINE|re.DOTALL)
-
- url = self._search_regex(r'<string name="URL">(.*?)</string>',
- video_info, 'video url')
- url = url.replace('$(protocol)', 'rtmp').replace('$(port)', '443')
- path = self._search_regex(r'<string name="path">(.*?)</string>',
- video_info, 'rtmp play path')
-
- return {'id': video_id,
- 'title': title,
- 'ext': 'flv',
- 'url': url,
- 'play_path': path,
- 'description': description,
- 'thumbnail': self._og_search_thumbnail(webpage),
- }
+
+ title = self._html_search_regex(
+ r'<!-- title -->\n\s*<h1[^>]*>(.*?)</h1>', webpage, 'title')
+ description = self._og_search_description(webpage)
+
+ info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id
+ data_json = self._download_webpage(
+ info_url, video_id, 'Downloading video info')
+ data = json.loads(data_json)
+
+ url = unescapeHTML(data['video']['files'][0]['path']['#text'])
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': url,
+ 'description': description,
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ }
diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py
index cb7226f82..0b11d1f10 100644
--- a/youtube_dl/extractor/dreisat.py
+++ b/youtube_dl/extractor/dreisat.py
@@ -4,18 +4,17 @@ import re
from .common import InfoExtractor
from ..utils import (
- determine_ext,
unified_strdate,
)
class DreiSatIE(InfoExtractor):
IE_NAME = '3sat'
- _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/index\.php\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
+ _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
_TEST = {
u"url": u"http://www.3sat.de/mediathek/index.php?obj=36983",
- u'file': u'36983.webm',
- u'md5': u'57c97d0469d71cf874f6815aa2b7c944',
+ u'file': u'36983.mp4',
+ u'md5': u'9dcfe344732808dbfcc901537973c922',
u'info_dict': {
u"title": u"Kaffeeland Schweiz",
u"description": u"Über 80 Kaffeeröstereien liefern in der Schweiz das Getränk, in das das Land so vernarrt ist: Mehr als 1000 Tassen trinkt ein Schweizer pro Jahr. SCHWEIZWEIT nimmt die Kaffeekultur unter die...",
@@ -52,18 +51,12 @@ class DreiSatIE(InfoExtractor):
'width': int(fe.find('./width').text),
'height': int(fe.find('./height').text),
'url': fe.find('./url').text,
- 'ext': determine_ext(fe.find('./url').text),
'filesize': int(fe.find('./filesize').text),
'video_bitrate': int(fe.find('./videoBitrate').text),
- '3sat_qualityname': fe.find('./quality').text,
} for fe in format_els
if not fe.find('./url').text.startswith('http://www.metafilegenerator.de/')]
- def _sortkey(format):
- qidx = ['low', 'med', 'high', 'veryhigh'].index(format['3sat_qualityname'])
- prefer_http = 1 if 'rtmp' in format['url'] else 0
- return (qidx, prefer_http, format['video_bitrate'])
- formats.sort(key=_sortkey)
+ self._sort_formats(formats)
return {
'_type': 'video',
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 7a14c98f9..7d0e117de 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -1,9 +1,12 @@
# encoding: utf-8
+from __future__ import unicode_literals
+
import os
import re
from .common import InfoExtractor
+from .youtube import YoutubeIE
from ..utils import (
compat_urllib_error,
compat_urllib_parse,
@@ -22,78 +25,78 @@ from .ooyala import OoyalaIE
class GenericIE(InfoExtractor):
- IE_DESC = u'Generic downloader that works on some sites'
+ IE_DESC = 'Generic downloader that works on some sites'
_VALID_URL = r'.*'
- IE_NAME = u'generic'
+ IE_NAME = 'generic'
_TESTS = [
{
- u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
- u'file': u'13601338388002.mp4',
- u'md5': u'6e15c93721d7ec9e9ca3fdbf07982cfd',
- u'info_dict': {
- u"uploader": u"www.hodiho.fr",
- u"title": u"R\u00e9gis plante sa Jeep"
+ 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
+ 'file': '13601338388002.mp4',
+ 'md5': '6e15c93721d7ec9e9ca3fdbf07982cfd',
+ 'info_dict': {
+ 'uploader': 'www.hodiho.fr',
+ 'title': 'R\u00e9gis plante sa Jeep',
}
},
# embedded vimeo video
{
- u'add_ie': ['Vimeo'],
- u'url': u'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references',
- u'file': u'22444065.mp4',
- u'md5': u'2903896e23df39722c33f015af0666e2',
- u'info_dict': {
- u'title': u'ACCU 2011: Move Semantics,Perfect Forwarding, and Rvalue references- Scott Meyers- 13/04/2011',
- u"uploader_id": u"skillsmatter",
- u"uploader": u"Skills Matter",
+ 'add_ie': ['Vimeo'],
+ 'url': 'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references',
+ 'file': '22444065.mp4',
+ 'md5': '2903896e23df39722c33f015af0666e2',
+ 'info_dict': {
+ 'title': 'ACCU 2011: Move Semantics,Perfect Forwarding, and Rvalue references- Scott Meyers- 13/04/2011',
+ 'uploader_id': 'skillsmatter',
+ 'uploader': 'Skills Matter',
}
},
# bandcamp page with custom domain
{
- u'add_ie': ['Bandcamp'],
- u'url': u'http://bronyrock.com/track/the-pony-mash',
- u'file': u'3235767654.mp3',
- u'info_dict': {
- u'title': u'The Pony Mash',
- u'uploader': u'M_Pallante',
+ 'add_ie': ['Bandcamp'],
+ 'url': 'http://bronyrock.com/track/the-pony-mash',
+ 'file': '3235767654.mp3',
+ 'info_dict': {
+ 'title': 'The Pony Mash',
+ 'uploader': 'M_Pallante',
},
- u'skip': u'There is a limit of 200 free downloads / month for the test song',
+ 'skip': 'There is a limit of 200 free downloads / month for the test song',
},
# embedded brightcove video
# it also tests brightcove videos that need to set the 'Referer' in the
# http requests
{
- u'add_ie': ['Brightcove'],
- u'url': u'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
- u'info_dict': {
- u'id': u'2765128793001',
- u'ext': u'mp4',
- u'title': u'Le cours de bourse : l’analyse technique',
- u'description': u'md5:7e9ad046e968cb2d1114004aba466fd9',
- u'uploader': u'BFM BUSINESS',
+ 'add_ie': ['Brightcove'],
+ 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
+ 'info_dict': {
+ 'id': '2765128793001',
+ 'ext': 'mp4',
+ 'title': 'Le cours de bourse : l’analyse technique',
+ 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
+ 'uploader': 'BFM BUSINESS',
},
- u'params': {
- u'skip_download': True,
+ 'params': {
+ 'skip_download': True,
},
},
# Direct link to a video
{
- u'url': u'http://media.w3.org/2010/05/sintel/trailer.mp4',
- u'file': u'trailer.mp4',
- u'md5': u'67d406c2bcb6af27fa886f31aa934bbe',
- u'info_dict': {
- u'id': u'trailer',
- u'title': u'trailer',
- u'upload_date': u'20100513',
+ 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
+ 'file': 'trailer.mp4',
+ 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
+ 'info_dict': {
+ 'id': 'trailer',
+ 'title': 'trailer',
+ 'upload_date': '20100513',
}
},
# ooyala video
{
- u'url': u'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
- u'md5': u'5644c6ca5d5782c1d0d350dad9bd840c',
- u'info_dict': {
- u'id': u'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
- u'ext': u'mp4',
- u'title': u'2cc213299525360.mov', #that's what we get
+ 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
+ 'md5': '5644c6ca5d5782c1d0d350dad9bd840c',
+ 'info_dict': {
+ 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
+ 'ext': 'mp4',
+ 'title': '2cc213299525360.mov', #that's what we get
},
},
]
@@ -101,12 +104,12 @@ class GenericIE(InfoExtractor):
def report_download_webpage(self, video_id):
"""Report webpage download."""
if not self._downloader.params.get('test', False):
- self._downloader.report_warning(u'Falling back on generic information extractor.')
+ self._downloader.report_warning('Falling back on generic information extractor.')
super(GenericIE, self).report_download_webpage(video_id)
def report_following_redirect(self, new_url):
"""Report information extraction."""
- self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
+ self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
def _send_head(self, url):
"""Check if it is a redirect, like url shorteners, in case return the new url."""
@@ -152,7 +155,7 @@ class GenericIE(InfoExtractor):
response = opener.open(HEADRequest(url))
if response is None:
- raise ExtractorError(u'Invalid URL protocol')
+ raise ExtractorError('Invalid URL protocol')
return response
def _real_extract(self, url):
@@ -162,6 +165,8 @@ class GenericIE(InfoExtractor):
return self.url_result('http://' + url)
video_id = os.path.splitext(url.split('/')[-1])[0]
+ self.to_screen('%s: Requesting header' % video_id)
+
try:
response = self._send_head(url)
@@ -184,7 +189,7 @@ class GenericIE(InfoExtractor):
'formats': [{
'format_id': m.group('format_id'),
'url': url,
- 'vcodec': u'none' if m.group('type') == 'audio' else None
+ 'vcodec': 'none' if m.group('type') == 'audio' else None
}],
'upload_date': upload_date,
}
@@ -198,7 +203,7 @@ class GenericIE(InfoExtractor):
except ValueError:
# since this is the last-resort InfoExtractor, if
# this error is thrown, it'll be thrown here
- raise ExtractorError(u'Failed to download URL: %s' % url)
+ raise ExtractorError('Failed to download URL: %s' % url)
self.report_extraction(video_id)
@@ -209,18 +214,19 @@ class GenericIE(InfoExtractor):
# Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical
video_title = self._html_search_regex(
- r'(?s)<title>(.*?)</title>', webpage, u'video title',
- default=u'video')
+ r'(?s)<title>(.*?)</title>', webpage, 'video title',
+ default='video')
# video uploader is domain name
video_uploader = self._search_regex(
- r'^(?:https?://)?([^/]*)/.*', url, u'video uploader')
+ r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
# Look for BrightCove:
bc_url = BrightcoveIE._extract_brightcove_url(webpage)
if bc_url is not None:
- self.to_screen(u'Brightcove video detected.')
- return self.url_result(bc_url, 'Brightcove')
+ self.to_screen('Brightcove video detected.')
+ surl = smuggle_url(bc_url, {'Referer': url})
+ return self.url_result(surl, 'Brightcove')
# Look for embedded (iframe) Vimeo player
mobj = re.search(
@@ -271,16 +277,12 @@ class GenericIE(InfoExtractor):
}
# Look for embedded blip.tv player
- mobj = re.search(r'<meta\s[^>]*https?://api.blip.tv/\w+/redirect/\w+/(\d+)', webpage)
+ mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
if mobj:
- return self.url_result('http://blip.tv/seo/-'+mobj.group(1), 'BlipTV')
- mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*https?://(?:\w+\.)?blip.tv/(?:play/|api\.swf#)([a-zA-Z0-9]+)', webpage)
+ return self.url_result('http://blip.tv/a/a-'+mobj.group(1), 'BlipTV')
+ mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9]+)', webpage)
if mobj:
- player_url = 'http://blip.tv/play/%s.x?p=1' % mobj.group(1)
- player_page = self._download_webpage(player_url, mobj.group(1))
- blip_video_id = self._search_regex(r'data-episode-id="(\d+)', player_page, u'blip_video_id', fatal=False)
- if blip_video_id:
- return self.url_result('http://blip.tv/seo/-'+blip_video_id, 'BlipTV')
+ return self.url_result(mobj.group(1), 'BlipTV')
# Look for Bandcamp pages with custom domain
mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
@@ -308,6 +310,9 @@ class GenericIE(InfoExtractor):
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None:
+ # Look for gorilla-vid style embedding
+ mobj = re.search(r'(?s)jw_plugins.*?file:\s*["\'](.*?)["\']', webpage)
+ if mobj is None:
# Broaden the search a little bit
mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
if mobj is None:
@@ -327,23 +332,27 @@ class GenericIE(InfoExtractor):
# HTML5 video
mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL)
if mobj is None:
- raise ExtractorError(u'Unsupported URL: %s' % url)
+ raise ExtractorError('Unsupported URL: %s' % url)
# It's possible that one of the regexes
# matched, but returned an empty group:
if mobj.group(1) is None:
- raise ExtractorError(u'Did not find a valid video URL at %s' % url)
+ raise ExtractorError('Did not find a valid video URL at %s' % url)
video_url = mobj.group(1)
video_url = compat_urlparse.urljoin(url, video_url)
video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
+ # Sometimes, jwplayer extraction will result in a YouTube URL
+ if YoutubeIE.suitable(video_url):
+ return self.url_result(video_url, 'Youtube')
+
# here's a fun little line of code for you:
video_id = os.path.splitext(video_id)[0]
return {
- 'id': video_id,
- 'url': video_url,
+ 'id': video_id,
+ 'url': video_url,
'uploader': video_uploader,
- 'title': video_title,
+ 'title': video_title,
}
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
index e5332cce8..16926b4d3 100644
--- a/youtube_dl/extractor/imdb.py
+++ b/youtube_dl/extractor/imdb.py
@@ -55,3 +55,32 @@ class ImdbIE(InfoExtractor):
'description': descr,
'thumbnail': format_info['slate'],
}
+
+class ImdbListIE(InfoExtractor):
+ IE_NAME = u'imdb:list'
+ IE_DESC = u'Internet Movie Database lists'
+ _VALID_URL = r'http://www\.imdb\.com/list/(?P<id>[\da-zA-Z_-]{11})'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ list_id = mobj.group('id')
+
+ # RSS XML is sometimes malformed
+ rss = self._download_webpage('http://rss.imdb.com/list/%s' % list_id, list_id, u'Downloading list RSS')
+ list_title = self._html_search_regex(r'<title>(.*?)</title>', rss, u'list title')
+
+ # Export is independent of actual author_id, but returns 404 if no author_id is provided.
+ # However, passing dummy author_id seems to be enough.
+ csv = self._download_webpage('http://www.imdb.com/list/export?list_id=%s&author_id=ur00000000' % list_id,
+ list_id, u'Downloading list CSV')
+
+ entries = []
+ for item in csv.split('\n')[1:]:
+ cols = item.split(',')
+ if len(cols) < 2:
+ continue
+ item_id = cols[1][1:-1]
+ if item_id.startswith('vi'):
+ entries.append(self.url_result('http://www.imdb.com/video/imdb/%s' % item_id, 'Imdb'))
+
+ return self.playlist_result(entries, list_id, list_title) \ No newline at end of file
diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py
index 16a6f73c8..4ddda2f1b 100644
--- a/youtube_dl/extractor/internetvideoarchive.py
+++ b/youtube_dl/extractor/internetvideoarchive.py
@@ -5,7 +5,6 @@ from ..utils import (
compat_urlparse,
compat_urllib_parse,
xpath_with_ns,
- determine_ext,
)
@@ -63,13 +62,17 @@ class InternetVideoArchiveIE(InfoExtractor):
for content in item.findall(_bp('media:group/media:content')):
attr = content.attrib
f_url = attr['url']
+ width = int(attr['width'])
+ bitrate = int(attr['bitrate'])
+ format_id = '%d-%dk' % (width, bitrate)
formats.append({
+ 'format_id': format_id,
'url': f_url,
- 'ext': determine_ext(f_url),
- 'width': int(attr['width']),
- 'bitrate': int(attr['bitrate']),
+ 'width': width,
+ 'tbr': bitrate,
})
- formats = sorted(formats, key=lambda f: f['bitrate'])
+
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py
index 4bdf55f93..98d1d272a 100644
--- a/youtube_dl/extractor/ivi.py
+++ b/youtube_dl/extractor/ivi.py
@@ -84,14 +84,16 @@ class IviIE(InfoExtractor):
result = video_json[u'result']
- formats = [{'url': x[u'url'],
- 'format_id': x[u'content_format']
- } for x in result[u'files'] if x[u'content_format'] in self._known_formats]
- formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id']))
-
- if len(formats) == 0:
- self._downloader.report_warning(u'No media links available for %s' % video_id)
- return
+ formats = [{
+ 'url': x[u'url'],
+ 'format_id': x[u'content_format'],
+ 'preference': self._known_formats.index(x[u'content_format']),
+ } for x in result[u'files'] if x[u'content_format'] in self._known_formats]
+
+ self._sort_formats(formats)
+
+ if not formats:
+ raise ExtractorError(u'No media links available for %s' % video_id)
duration = result[u'duration']
compilation = result[u'compilation']
diff --git a/youtube_dl/extractor/jpopsukitv.py b/youtube_dl/extractor/jpopsukitv.py
new file mode 100644
index 000000000..aad782578
--- /dev/null
+++ b/youtube_dl/extractor/jpopsukitv.py
@@ -0,0 +1,73 @@
+# coding=utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ unified_strdate,
+)
+
+
+class JpopsukiIE(InfoExtractor):
+ IE_NAME = 'jpopsuki.tv'
+ _VALID_URL = r'https?://(?:www\.)?jpopsuki\.tv/video/(.*?)/(?P<id>\S+)'
+
+ _TEST = {
+ 'url': 'http://www.jpopsuki.tv/video/ayumi-hamasaki---evolution/00be659d23b0b40508169cdee4545771',
+ 'md5': '88018c0c1a9b1387940e90ec9e7e198e',
+ 'file': '00be659d23b0b40508169cdee4545771.mp4',
+ 'info_dict': {
+ 'id': '00be659d23b0b40508169cdee4545771',
+ 'title': 'ayumi hamasaki - evolution',
+ 'description': 'Release date: 2001.01.31\r\n浜崎あゆみ - evolution',
+ 'thumbnail': 'http://www.jpopsuki.tv/cache/89722c74d2a2ebe58bcac65321c115b2.jpg',
+ 'uploader': 'plama_chan',
+ 'uploader_id': '404',
+ 'upload_date': '20121101'
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_url = 'http://www.jpopsuki.tv' + self._html_search_regex(
+ r'<source src="(.*?)" type', webpage, 'video url')
+
+ video_title = self._og_search_title(webpage)
+ description = self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+ uploader = self._html_search_regex(
+ r'<li>from: <a href="/user/view/user/(.*?)/uid/',
+ webpage, 'video uploader', fatal=False)
+ uploader_id = self._html_search_regex(
+ r'<li>from: <a href="/user/view/user/\S*?/uid/(\d*)',
+ webpage, 'video uploader_id', fatal=False)
+ upload_date = self._html_search_regex(
+ r'<li>uploaded: (.*?)</li>', webpage, 'video upload_date',
+ fatal=False)
+ if upload_date is not None:
+ upload_date = unified_strdate(upload_date)
+ view_count_str = self._html_search_regex(
+ r'<li>Hits: ([0-9]+?)</li>', webpage, 'video view_count',
+ fatal=False)
+ comment_count_str = self._html_search_regex(
+ r'<h2>([0-9]+?) comments</h2>', webpage, 'video comment_count',
+ fatal=False)
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': video_title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'upload_date': upload_date,
+ 'view_count': int_or_none(view_count_str),
+ 'comment_count': int_or_none(comment_count_str),
+ }
diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py
new file mode 100644
index 000000000..844ba4dcb
--- /dev/null
+++ b/youtube_dl/extractor/lynda.py
@@ -0,0 +1,142 @@
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .subtitles import SubtitlesInfoExtractor
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class LyndaIE(SubtitlesInfoExtractor):
+ IE_NAME = 'lynda'
+ IE_DESC = 'lynda.com videos'
+ _VALID_URL = r'https?://www\.lynda\.com/[^/]+/[^/]+/\d+/(\d+)-\d\.html'
+
+ _TEST = {
+ 'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html',
+ 'file': '114408.mp4',
+ 'md5': 'ecfc6862da89489161fb9cd5f5a6fac1',
+ u"info_dict": {
+ 'title': 'Using the exercise files',
+ 'duration': 68
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group(1)
+
+ page = self._download_webpage('http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id,
+ video_id, 'Downloading video JSON')
+ video_json = json.loads(page)
+
+ if 'Status' in video_json and video_json['Status'] == 'NotFound':
+ raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+
+ if video_json['HasAccess'] is False:
+ raise ExtractorError('Video %s is only available for members' % video_id, expected=True)
+
+ video_id = video_json['ID']
+ duration = video_json['DurationInSeconds']
+ title = video_json['Title']
+
+ formats = [{'url': fmt['Url'],
+ 'ext': fmt['Extension'],
+ 'width': fmt['Width'],
+ 'height': fmt['Height'],
+ 'filesize': fmt['FileSize'],
+ 'format_id': str(fmt['Resolution'])
+ } for fmt in video_json['Formats']]
+
+ self._sort_formats(formats)
+
+ if self._downloader.params.get('listsubtitles', False):
+ self._list_available_subtitles(video_id, page)
+ return
+
+ subtitles = self._fix_subtitles(self.extract_subtitles(video_id, page))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'duration': duration,
+ 'subtitles': subtitles,
+ 'formats': formats
+ }
+
+ _TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]'
+
+ def _fix_subtitles(self, subtitles):
+ fixed_subtitles = {}
+ for k, v in subtitles.items():
+ subs = json.loads(v)
+ if len(subs) == 0:
+ continue
+ srt = ''
+ for pos in range(0, len(subs) - 1):
+ seq_current = subs[pos]
+ m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode'])
+ if m_current is None:
+ continue
+ seq_next = subs[pos+1]
+ m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode'])
+ if m_next is None:
+ continue
+ appear_time = m_current.group('timecode')
+ disappear_time = m_next.group('timecode')
+ text = seq_current['Caption']
+ srt += '%s\r\n%s --> %s\r\n%s' % (str(pos), appear_time, disappear_time, text)
+ if srt:
+ fixed_subtitles[k] = srt
+ return fixed_subtitles
+
+ def _get_available_subtitles(self, video_id, webpage):
+ url = 'http://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id
+ sub = self._download_webpage(url, None, note=False)
+ sub_json = json.loads(sub)
+ return {'en': url} if len(sub_json) > 0 else {}
+
+
+class LyndaCourseIE(InfoExtractor):
+ IE_NAME = 'lynda:course'
+ IE_DESC = 'lynda.com online courses'
+
+ # Course link equals to welcome/introduction video link of same course
+ # We will recognize it as course link
+ _VALID_URL = r'https?://(?:www|m)\.lynda\.com/(?P<coursepath>[^/]+/[^/]+/(?P<courseid>\d+))-\d\.html'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ course_path = mobj.group('coursepath')
+ course_id = mobj.group('courseid')
+
+ page = self._download_webpage('http://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id,
+ course_id, 'Downloading course JSON')
+ course_json = json.loads(page)
+
+ if 'Status' in course_json and course_json['Status'] == 'NotFound':
+ raise ExtractorError('Course %s does not exist' % course_id, expected=True)
+
+ unaccessible_videos = 0
+ videos = []
+
+ for chapter in course_json['Chapters']:
+ for video in chapter['Videos']:
+ if video['HasAccess'] is not True:
+ unaccessible_videos += 1
+ continue
+ videos.append(video['ID'])
+
+ if unaccessible_videos > 0:
+ self._downloader.report_warning('%s videos are only available for members and will not be downloaded' % unaccessible_videos)
+
+ entries = [
+ self.url_result('http://www.lynda.com/%s/%s-4.html' %
+ (course_path, video_id),
+ 'Lynda')
+ for video_id in videos]
+
+ course_title = course_json['Title']
+
+ return self.playlist_result(entries, course_id, course_title)
diff --git a/youtube_dl/extractor/macgamestore.py b/youtube_dl/extractor/macgamestore.py
new file mode 100644
index 000000000..b818cf50c
--- /dev/null
+++ b/youtube_dl/extractor/macgamestore.py
@@ -0,0 +1,43 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class MacGameStoreIE(InfoExtractor):
+ IE_NAME = 'macgamestore'
+ IE_DESC = 'MacGameStore trailers'
+ _VALID_URL = r'https?://www\.macgamestore\.com/mediaviewer\.php\?trailer=(?P<id>\d+)'
+
+ _TEST = {
+ 'url': 'http://www.macgamestore.com/mediaviewer.php?trailer=2450',
+ 'file': '2450.m4v',
+ 'md5': '8649b8ea684b6666b4c5be736ecddc61',
+ 'info_dict': {
+ 'title': 'Crow',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id, 'Downloading trailer page')
+
+ if re.search(r'>Missing Media<', webpage) is not None:
+ raise ExtractorError('Trailer %s does not exist' % video_id, expected=True)
+
+ video_title = self._html_search_regex(
+ r'<title>MacGameStore: (.*?) Trailer</title>', webpage, 'title')
+
+ video_url = self._html_search_regex(
+ r'(?s)<div\s+id="video-player".*?href="([^"]+)"\s*>',
+ webpage, 'video URL')
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': video_title
+ }
diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py
index 08ce0647f..7aa0080d7 100644
--- a/youtube_dl/extractor/mdr.py
+++ b/youtube_dl/extractor/mdr.py
@@ -52,10 +52,11 @@ class MDRIE(InfoExtractor):
'format_id': u'%s-%d' % (media_type, vbr),
})
formats.append(format)
- formats.sort(key=lambda f: (f.get('vbr'), f['abr']))
if not formats:
raise ExtractorError(u'Could not find any valid formats')
+ self._sort_formats(formats)
+
return {
'id': video_id,
'title': title,
diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py
index 52be9232f..76b717fe5 100644
--- a/youtube_dl/extractor/mit.py
+++ b/youtube_dl/extractor/mit.py
@@ -33,8 +33,18 @@ class TechTVMITIE(InfoExtractor):
raw_page, u'base url')
formats_json = self._search_regex(r'bitrates: (\[.+?\])', raw_page,
u'video formats')
- formats = json.loads(formats_json)
- formats = sorted(formats, key=lambda f: f['bitrate'])
+ formats_mit = json.loads(formats_json)
+ formats = [
+ {
+ 'format_id': f['label'],
+ 'url': base_url + f['url'].partition(':')[2],
+ 'ext': f['url'].partition(':')[0],
+ 'format': f['label'],
+ 'width': f['width'],
+ 'vbr': f['bitrate'],
+ }
+ for f in formats_mit
+ ]
title = get_element_by_id('edit-title', clean_page)
description = clean_html(get_element_by_id('edit-description', clean_page))
@@ -43,8 +53,7 @@ class TechTVMITIE(InfoExtractor):
return {'id': video_id,
'title': title,
- 'url': base_url + formats[-1]['url'].replace('mp4:', ''),
- 'ext': 'mp4',
+ 'formats': formats,
'description': description,
'thumbnail': thumbnail,
}
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index 125d81551..7c54ea0f4 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -53,7 +53,7 @@ class MixcloudIE(InfoExtractor):
info = json.loads(json_data)
preview_url = self._search_regex(r'data-preview-url="(.+?)"', webpage, u'preview url')
- song_url = preview_url.replace('/previews/', '/cloudcasts/originals/')
+ song_url = preview_url.replace('/previews/', '/c/originals/')
template_url = re.sub(r'(stream\d*)', 'stream%d', song_url)
final_song_url = self._get_url(template_url)
if final_song_url is None:
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index ed11f521a..f1cf41e2d 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -129,7 +129,7 @@ class MTVIE(MTVServicesInfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
- uri = mobj.group('mgid')
+ uri = mobj.groupdict().get('mgid')
if uri is None:
webpage = self._download_webpage(url, video_id)
diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py
index 0404e6e43..6d35c7861 100644
--- a/youtube_dl/extractor/myvideo.py
+++ b/youtube_dl/extractor/myvideo.py
@@ -143,8 +143,10 @@ class MyVideoIE(InfoExtractor):
if mobj:
video_url = compat_urllib_parse.unquote(mobj.group(1))
if 'myvideo2flash' in video_url:
- self._downloader.report_warning(u'forcing RTMPT ...')
- video_url = video_url.replace('rtmpe://', 'rtmpt://')
+ self.report_warning(
+ u'Rewriting URL to use unencrypted rtmp:// ...',
+ video_id)
+ video_url = video_url.replace('rtmpe://', 'rtmp://')
if not video_url:
# extract non rtmp videos
diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py
index b42eae89a..88f03608b 100644
--- a/youtube_dl/extractor/orf.py
+++ b/youtube_dl/extractor/orf.py
@@ -1,54 +1,98 @@
# coding: utf-8
+from __future__ import unicode_literals
-import re
-import xml.etree.ElementTree
import json
+import re
from .common import InfoExtractor
from ..utils import (
- compat_urlparse,
- ExtractorError,
- find_xpath_attr,
+ HEADRequest,
+ unified_strdate,
)
+
class ORFIE(InfoExtractor):
- _VALID_URL = r'https?://tvthek\.orf\.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)'
+ _VALID_URL = r'https?://tvthek\.orf\.at/(?:programs/.+?/episodes|topics/.+?|program/[^/]+)/(?P<id>\d+)'
+
+ _TEST = {
+ 'url': 'http://tvthek.orf.at/program/matinee-Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7317210/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319746/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319747',
+ 'file': '7319747.mp4',
+ 'md5': 'bd803c5d8c32d3c64a0ea4b4eeddf375',
+ 'info_dict': {
+ 'title': 'Was Sie schon immer über Klassik wissen wollten',
+ 'description': 'md5:0ddf0d5f0060bd53f744edaa5c2e04a4',
+ 'duration': 3508,
+ 'upload_date': '20140105',
+ },
+ 'skip': 'Blocked outside of Austria',
+ }
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
playlist_id = mobj.group('id')
webpage = self._download_webpage(url, playlist_id)
- flash_xml = self._search_regex('ORF.flashXML = \'(.+?)\'', webpage, u'flash xml')
- flash_xml = compat_urlparse.parse_qs('xml='+flash_xml)['xml'][0]
- flash_config = xml.etree.ElementTree.fromstring(flash_xml.encode('utf-8'))
- playlist_json = self._search_regex(r'playlist\': \'(\[.*?\])\'', webpage, u'playlist').replace(r'\"','"')
- playlist = json.loads(playlist_json)
-
- videos = []
- ns = '{http://tempuri.org/XMLSchema.xsd}'
- xpath = '%(ns)sPlaylist/%(ns)sItems/%(ns)sItem' % {'ns': ns}
- webpage_description = self._og_search_description(webpage)
- for (i, (item, info)) in enumerate(zip(flash_config.findall(xpath), playlist), 1):
- # Get best quality url
- rtmp_url = None
- for q in ['Q6A', 'Q4A', 'Q1A']:
- video_url = find_xpath_attr(item, '%sVideoUrl' % ns, 'quality', q)
- if video_url is not None:
- rtmp_url = video_url.text
- break
- if rtmp_url is None:
- raise ExtractorError(u'Couldn\'t get video url: %s' % info['id'])
- description = self._html_search_regex(
- r'id="playlist_entry_%s".*?<p>(.*?)</p>' % i, webpage,
- u'description', default=webpage_description, flags=re.DOTALL)
- videos.append({
+ data_json = self._search_regex(
+ r'initializeAdworx\((.+?)\);\n', webpage, 'video info')
+ all_data = json.loads(data_json)
+ sdata = all_data[0]['values']['segments']
+
+ def quality_to_int(s):
+ m = re.search('([0-9]+)', s)
+ if m is None:
+ return -1
+ return int(m.group(1))
+
+ entries = []
+ for sd in sdata:
+ video_id = sd['id']
+ formats = [{
+ 'preference': -10 if fd['delivery'] == 'hls' else None,
+ 'format_id': '%s-%s-%s' % (
+ fd['delivery'], fd['quality'], fd['quality_string']),
+ 'url': fd['src'],
+ 'protocol': fd['protocol'],
+ 'quality': quality_to_int(fd['quality']),
+ } for fd in sd['playlist_item_array']['sources']]
+
+ # Check for geoblocking.
+ # There is a property is_geoprotection, but that's always false
+ geo_str = sd.get('geoprotection_string')
+ if geo_str:
+ try:
+ http_url = next(
+ f['url']
+ for f in formats
+ if re.match(r'^https?://.*\.mp4$', f['url']))
+ except StopIteration:
+ pass
+ else:
+ req = HEADRequest(http_url)
+ response = self._request_webpage(
+ req, video_id,
+ note='Testing for geoblocking',
+ errnote=((
+ 'This video seems to be blocked outside of %s. '
+ 'You may want to try the streaming-* formats.')
+ % geo_str),
+ fatal=False)
+
+ self._sort_formats(formats)
+
+ upload_date = unified_strdate(sd['created_date'])
+ entries.append({
'_type': 'video',
- 'id': info['id'],
- 'title': info['title'],
- 'url': rtmp_url,
- 'ext': 'flv',
- 'description': description,
- })
-
- return videos
+ 'id': video_id,
+ 'title': sd['header'],
+ 'formats': formats,
+ 'description': sd.get('description'),
+ 'duration': int(sd['duration_in_seconds']),
+ 'upload_date': upload_date,
+ 'thumbnail': sd.get('image_full_url'),
+ })
+
+ return {
+ '_type': 'playlist',
+ 'entries': entries,
+ 'id': playlist_id,
+ }
diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py
index 71abd5013..e9ff8d1af 100644
--- a/youtube_dl/extractor/pornhd.py
+++ b/youtube_dl/extractor/pornhd.py
@@ -5,7 +5,7 @@ from ..utils import compat_urllib_parse
class PornHdIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)'
+ _VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)'
_TEST = {
u'url': u'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
u'file': u'1962.flv',
diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py
index a589a893b..99f5b19d2 100644
--- a/youtube_dl/extractor/smotri.py
+++ b/youtube_dl/extractor/smotri.py
@@ -1,5 +1,6 @@
# encoding: utf-8
+import os.path
import re
import json
import hashlib
@@ -10,6 +11,7 @@ from ..utils import (
compat_urllib_parse,
compat_urllib_request,
ExtractorError,
+ url_basename,
)
@@ -132,7 +134,16 @@ class SmotriIE(InfoExtractor):
# We will extract some from the video web page instead
video_page_url = 'http://' + mobj.group('url')
video_page = self._download_webpage(video_page_url, video_id, u'Downloading video page')
-
+
+ # Warning if video is unavailable
+ warning = self._html_search_regex(
+ r'<div class="videoUnModer">(.*?)</div>', video_page,
+ u'warning message', default=None)
+ if warning is not None:
+ self._downloader.report_warning(
+ u'Video %s may not be available; smotri said: %s ' %
+ (video_id, warning))
+
# Adult content
if re.search(u'EroConfirmText">', video_page) is not None:
self.report_age_confirmation()
@@ -148,38 +159,44 @@ class SmotriIE(InfoExtractor):
# Extract the rest of meta data
video_title = self._search_meta(u'name', video_page, u'title')
if not video_title:
- video_title = video_url.rsplit('/', 1)[-1]
+ video_title = os.path.splitext(url_basename(video_url))[0]
video_description = self._search_meta(u'description', video_page)
END_TEXT = u' на сайте Smotri.com'
- if video_description.endswith(END_TEXT):
+ if video_description and video_description.endswith(END_TEXT):
video_description = video_description[:-len(END_TEXT)]
START_TEXT = u'Смотреть онлайн ролик '
- if video_description.startswith(START_TEXT):
+ if video_description and video_description.startswith(START_TEXT):
video_description = video_description[len(START_TEXT):]
video_thumbnail = self._search_meta(u'thumbnail', video_page)
upload_date_str = self._search_meta(u'uploadDate', video_page, u'upload date')
- upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str)
- video_upload_date = (
- (
- upload_date_m.group('year') +
- upload_date_m.group('month') +
- upload_date_m.group('day')
+ if upload_date_str:
+ upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str)
+ video_upload_date = (
+ (
+ upload_date_m.group('year') +
+ upload_date_m.group('month') +
+ upload_date_m.group('day')
+ )
+ if upload_date_m else None
)
- if upload_date_m else None
- )
+ else:
+ video_upload_date = None
duration_str = self._search_meta(u'duration', video_page)
- duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str)
- video_duration = (
- (
- (int(duration_m.group('hours')) * 60 * 60) +
- (int(duration_m.group('minutes')) * 60) +
- int(duration_m.group('seconds'))
+ if duration_str:
+ duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str)
+ video_duration = (
+ (
+ (int(duration_m.group('hours')) * 60 * 60) +
+ (int(duration_m.group('minutes')) * 60) +
+ int(duration_m.group('seconds'))
+ )
+ if duration_m else None
)
- if duration_m else None
- )
+ else:
+ video_duration = None
video_uploader = self._html_search_regex(
u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>',
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index e22ff9c38..951e977bd 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -29,7 +29,7 @@ class SoundcloudIE(InfoExtractor):
(?!sets/)(?P<title>[\w\d-]+)/?
(?P<token>[^?]+?)?(?:[?].*)?$)
|(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+))
- |(?P<widget>w\.soundcloud\.com/player/?.*?url=.*)
+ |(?P<player>(?:w|player|p.)\.soundcloud\.com/player/?.*?url=.*)
)
'''
IE_NAME = u'soundcloud'
@@ -193,7 +193,7 @@ class SoundcloudIE(InfoExtractor):
if track_id is not None:
info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID
full_title = track_id
- elif mobj.group('widget'):
+ elif mobj.group('player'):
query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
return self.url_result(query['url'][0], ie='Soundcloud')
else:
diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py
index 695520524..051a34d5b 100644
--- a/youtube_dl/extractor/spiegel.py
+++ b/youtube_dl/extractor/spiegel.py
@@ -51,9 +51,10 @@ class SpiegelIE(InfoExtractor):
# Blacklist type 6, it's extremely LQ and not available on the same server
if n.tag.startswith('type') and n.tag != 'type6'
]
- formats.sort(key=lambda f: f['vbr'])
duration = float(idoc[0].findall('./duration')[0].text)
+ self._sort_formats(formats)
+
info = {
'id': video_id,
'title': video_title,
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index cec65261b..23172143e 100644
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -55,15 +55,21 @@ class ThePlatformIE(InfoExtractor):
formats = []
for f in switch.findall(_x('smil:video')):
attr = f.attrib
+ width = int(attr['width'])
+ height = int(attr['height'])
+ vbr = int(attr['system-bitrate']) // 1000
+ format_id = '%dx%d_%dk' % (width, height, vbr)
formats.append({
+ 'format_id': format_id,
'url': base_url,
'play_path': 'mp4:' + attr['src'],
'ext': 'flv',
- 'width': int(attr['width']),
- 'height': int(attr['height']),
- 'vbr': int(attr['system-bitrate']),
+ 'width': width,
+ 'height': height,
+ 'vbr': vbr,
})
- formats.sort(key=lambda f: (f['height'], f['width'], f['vbr']))
+
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/veehd.py b/youtube_dl/extractor/veehd.py
index 3cf8c853d..b1c854a64 100644
--- a/youtube_dl/extractor/veehd.py
+++ b/youtube_dl/extractor/veehd.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import re
import json
@@ -8,16 +10,17 @@ from ..utils import (
clean_html,
)
+
class VeeHDIE(InfoExtractor):
_VALID_URL = r'https?://veehd\.com/video/(?P<id>\d+)'
_TEST = {
- u'url': u'http://veehd.com/video/4686958',
- u'file': u'4686958.mp4',
- u'info_dict': {
- u'title': u'Time Lapse View from Space ( ISS)',
- u'uploader_id': u'spotted',
- u'description': u'md5:f0094c4cf3a72e22bc4e4239ef767ad7',
+ 'url': 'http://veehd.com/video/4686958',
+ 'file': '4686958.mp4',
+ 'info_dict': {
+ 'title': 'Time Lapse View from Space ( ISS)',
+ 'uploader_id': 'spotted',
+ 'description': 'md5:f0094c4cf3a72e22bc4e4239ef767ad7',
},
}
@@ -25,24 +28,30 @@ class VeeHDIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
+ # VeeHD seems to send garbage on the first request.
+ # See https://github.com/rg3/youtube-dl/issues/2102
+ self._download_webpage(url, video_id, 'Requesting webpage')
webpage = self._download_webpage(url, video_id)
- player_path = self._search_regex(r'\$\("#playeriframe"\).attr\({src : "(.+?)"',
- webpage, u'player path')
+ player_path = self._search_regex(
+ r'\$\("#playeriframe"\).attr\({src : "(.+?)"',
+ webpage, 'player path')
player_url = compat_urlparse.urljoin(url, player_path)
- player_page = self._download_webpage(player_url, video_id,
- u'Downloading player page')
- config_json = self._search_regex(r'value=\'config=({.+?})\'',
- player_page, u'config json')
+
+ self._download_webpage(player_url, video_id, 'Requesting player page')
+ player_page = self._download_webpage(
+ player_url, video_id, 'Downloading player page')
+ config_json = self._search_regex(
+ r'value=\'config=({.+?})\'', player_page, 'config json')
config = json.loads(config_json)
video_url = compat_urlparse.unquote(config['clip']['url'])
title = clean_html(get_element_by_id('videoName', webpage).rpartition('|')[0])
uploader_id = self._html_search_regex(r'<a href="/profile/\d+">(.+?)</a>',
- webpage, u'uploader')
+ webpage, 'uploader')
thumbnail = self._search_regex(r'<img id="veehdpreview" src="(.+?)"',
- webpage, u'thumbnail')
+ webpage, 'thumbnail')
description = self._html_search_regex(r'<td class="infodropdown".*?<div>(.*?)<ul',
- webpage, u'description', flags=re.DOTALL)
+ webpage, 'description', flags=re.DOTALL)
return {
'_type': 'video',
diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py
index 00672c9e5..baa57f343 100644
--- a/youtube_dl/extractor/veoh.py
+++ b/youtube_dl/extractor/veoh.py
@@ -1,22 +1,22 @@
+from __future__ import unicode_literals
+
import re
import json
from .common import InfoExtractor
-from ..utils import (
- determine_ext,
-)
+
class VeohIE(InfoExtractor):
- _VALID_URL = r'http://www\.veoh\.com/watch/v(?P<id>\d*)'
+ _VALID_URL = r'http://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/v(?P<id>\d*)'
_TEST = {
- u'url': u'http://www.veoh.com/watch/v56314296nk7Zdmz3',
- u'file': u'56314296.mp4',
- u'md5': u'620e68e6a3cff80086df3348426c9ca3',
- u'info_dict': {
- u'title': u'Straight Backs Are Stronger',
- u'uploader': u'LUMOback',
- u'description': u'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ',
+ 'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3',
+ 'file': '56314296.mp4',
+ 'md5': '620e68e6a3cff80086df3348426c9ca3',
+ 'info_dict': {
+ 'title': 'Straight Backs Are Stronger',
+ 'uploader': 'LUMOback',
+ 'description': 'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ',
}
}
@@ -28,20 +28,20 @@ class VeohIE(InfoExtractor):
m_youtube = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|")', webpage)
if m_youtube is not None:
youtube_id = m_youtube.group(1)
- self.to_screen(u'%s: detected Youtube video.' % video_id)
+ self.to_screen('%s: detected Youtube video.' % video_id)
return self.url_result(youtube_id, 'Youtube')
self.report_extraction(video_id)
info = self._search_regex(r'videoDetailsJSON = \'({.*?})\';', webpage, 'info')
info = json.loads(info)
- video_url = info.get('fullPreviewHashHighPath') or info.get('fullPreviewHashLowPath')
-
- return {'id': info['videoId'],
- 'title': info['title'],
- 'ext': determine_ext(video_url),
- 'url': video_url,
- 'uploader': info['username'],
- 'thumbnail': info.get('highResImage') or info.get('medResImage'),
- 'description': info['description'],
- 'view_count': info['views'],
- }
+ video_url = info.get('fullPreviewHashHighPath') or info.get('fullPreviewHashLowPath')
+
+ return {
+ 'id': info['videoId'],
+ 'title': info['title'],
+ 'url': video_url,
+ 'uploader': info['username'],
+ 'thumbnail': info.get('highResImage') or info.get('medResImage'),
+ 'description': info['description'],
+ 'view_count': info['views'],
+ }
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index c3623fcbe..ad86d033a 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -1,4 +1,6 @@
# encoding: utf-8
+from __future__ import unicode_literals
+
import json
import re
import itertools
@@ -31,54 +33,55 @@ class VimeoIE(InfoExtractor):
(?P<id>[0-9]+)
/?(?:[?&].*)?(?:[#].*)?$'''
_NETRC_MACHINE = 'vimeo'
- IE_NAME = u'vimeo'
+ IE_NAME = 'vimeo'
_TESTS = [
{
- u'url': u'http://vimeo.com/56015672#at=0',
- u'file': u'56015672.mp4',
- u'md5': u'8879b6cc097e987f02484baf890129e5',
- u'info_dict': {
- u"upload_date": u"20121220",
- u"description": u"This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
- u"uploader_id": u"user7108434",
- u"uploader": u"Filippo Valsorda",
- u"title": u"youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
+ 'url': 'http://vimeo.com/56015672#at=0',
+ 'file': '56015672.mp4',
+ 'md5': '8879b6cc097e987f02484baf890129e5',
+ 'info_dict': {
+ "upload_date": "20121220",
+ "description": "This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
+ "uploader_id": "user7108434",
+ "uploader": "Filippo Valsorda",
+ "title": "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
},
},
{
- u'url': u'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876',
- u'file': u'68093876.mp4',
- u'md5': u'3b5ca6aa22b60dfeeadf50b72e44ed82',
- u'note': u'Vimeo Pro video (#1197)',
- u'info_dict': {
- u'uploader_id': u'openstreetmapus',
- u'uploader': u'OpenStreetMap US',
- u'title': u'Andy Allan - Putting the Carto into OpenStreetMap Cartography',
+ 'url': 'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876',
+ 'file': '68093876.mp4',
+ 'md5': '3b5ca6aa22b60dfeeadf50b72e44ed82',
+ 'note': 'Vimeo Pro video (#1197)',
+ 'info_dict': {
+ 'uploader_id': 'openstreetmapus',
+ 'uploader': 'OpenStreetMap US',
+ 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography',
},
},
{
- u'url': u'http://player.vimeo.com/video/54469442',
- u'file': u'54469442.mp4',
- u'md5': u'619b811a4417aa4abe78dc653becf511',
- u'note': u'Videos that embed the url in the player page',
- u'info_dict': {
- u'title': u'Kathy Sierra: Building the minimum Badass User, Business of Software',
- u'uploader': u'The BLN & Business of Software',
+ 'url': 'http://player.vimeo.com/video/54469442',
+ 'file': '54469442.mp4',
+ 'md5': '619b811a4417aa4abe78dc653becf511',
+ 'note': 'Videos that embed the url in the player page',
+ 'info_dict': {
+ 'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software',
+ 'uploader': 'The BLN & Business of Software',
+ 'uploader_id': 'theblnbusinessofsoftware',
},
},
{
- u'url': u'http://vimeo.com/68375962',
- u'file': u'68375962.mp4',
- u'md5': u'aaf896bdb7ddd6476df50007a0ac0ae7',
- u'note': u'Video protected with password',
- u'info_dict': {
- u'title': u'youtube-dl password protected test video',
- u'upload_date': u'20130614',
- u'uploader_id': u'user18948128',
- u'uploader': u'Jaime Marquínez Ferrándiz',
+ 'url': 'http://vimeo.com/68375962',
+ 'file': '68375962.mp4',
+ 'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7',
+ 'note': 'Video protected with password',
+ 'info_dict': {
+ 'title': 'youtube-dl password protected test video',
+ 'upload_date': '20130614',
+ 'uploader_id': 'user18948128',
+ 'uploader': 'Jaime Marquínez Ferrándiz',
},
- u'params': {
- u'videopassword': u'youtube-dl',
+ 'params': {
+ 'videopassword': 'youtube-dl',
},
},
]
@@ -90,7 +93,7 @@ class VimeoIE(InfoExtractor):
self.report_login()
login_url = 'https://vimeo.com/log_in'
webpage = self._download_webpage(login_url, None, False)
- token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1)
+ token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token')
data = compat_urllib_parse.urlencode({'email': username,
'password': password,
'action': 'login',
@@ -100,13 +103,13 @@ class VimeoIE(InfoExtractor):
login_request = compat_urllib_request.Request(login_url, data)
login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
login_request.add_header('Cookie', 'xsrft=%s' % token)
- self._download_webpage(login_request, None, False, u'Wrong login info')
+ self._download_webpage(login_request, None, False, 'Wrong login info')
def _verify_video_password(self, url, video_id, webpage):
password = self._downloader.params.get('videopassword', None)
if password is None:
- raise ExtractorError(u'This video is protected by a password, use the --video-password option')
- token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1)
+ raise ExtractorError('This video is protected by a password, use the --video-password option')
+ token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token')
data = compat_urllib_parse.urlencode({'password': password,
'token': token})
# I didn't manage to use the password with https
@@ -118,8 +121,8 @@ class VimeoIE(InfoExtractor):
password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
password_request.add_header('Cookie', 'xsrft=%s' % token)
self._download_webpage(password_request, video_id,
- u'Verifying the password',
- u'Wrong password')
+ 'Verifying the password',
+ 'Wrong password')
def _real_initialize(self):
self._login()
@@ -134,7 +137,7 @@ class VimeoIE(InfoExtractor):
# Extract ID from URL
mobj = re.match(self._VALID_URL, url)
if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
+ raise ExtractorError('Invalid URL: %s' % url)
video_id = mobj.group('id')
if mobj.group('pro') or mobj.group('player'):
@@ -155,7 +158,7 @@ class VimeoIE(InfoExtractor):
try:
try:
config_url = self._html_search_regex(
- r' data-config-url="(.+?)"', webpage, u'config URL')
+ r' data-config-url="(.+?)"', webpage, 'config URL')
config_json = self._download_webpage(config_url, video_id)
config = json.loads(config_json)
except RegexNotFoundError:
@@ -166,19 +169,23 @@ class VimeoIE(InfoExtractor):
config_re = r'%s=({.+?});' % re.escape(m_variable_name.group(1))
else:
config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});']
- config = self._search_regex(config_re, webpage, u'info section',
+ config = self._search_regex(config_re, webpage, 'info section',
flags=re.DOTALL)
config = json.loads(config)
except Exception as e:
if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
- raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
+ raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option')
if re.search('<form[^>]+?id="pw_form"', webpage) is not None:
self._verify_video_password(url, video_id, webpage)
return self._real_extract(url)
else:
- raise ExtractorError(u'Unable to extract info section',
+ raise ExtractorError('Unable to extract info section',
cause=e)
+ else:
+ if config.get('view') == 4:
+ self._verify_video_password(url, video_id, webpage)
+ return self._real_extract(url)
# Extract title
video_title = config["video"]["title"]
@@ -212,9 +219,9 @@ class VimeoIE(InfoExtractor):
video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
try:
- view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, u'view count'))
- like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, u'like count'))
- comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, u'comment count'))
+ view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count'))
+ like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count'))
+ comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count'))
except RegexNotFoundError:
# This info is only available in vimeo.com/{id} urls
view_count = None
@@ -255,7 +262,7 @@ class VimeoIE(InfoExtractor):
for key in ('other', 'sd', 'hd'):
formats += files[key]
if len(formats) == 0:
- raise ExtractorError(u'No known codec found')
+ raise ExtractorError('No known codec found')
return {
'id': video_id,
@@ -274,7 +281,7 @@ class VimeoIE(InfoExtractor):
class VimeoChannelIE(InfoExtractor):
- IE_NAME = u'vimeo:channel'
+ IE_NAME = 'vimeo:channel'
_VALID_URL = r'(?:https?://)?vimeo.\com/channels/(?P<id>[^/]+)'
_MORE_PAGES_INDICATOR = r'<a.+?rel="next"'
_TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"'
@@ -283,14 +290,14 @@ class VimeoChannelIE(InfoExtractor):
return '%s/videos/page:%d/' % (base_url, pagenum)
def _extract_list_title(self, webpage):
- return self._html_search_regex(self._TITLE_RE, webpage, u'list title')
+ return self._html_search_regex(self._TITLE_RE, webpage, 'list title')
def _extract_videos(self, list_id, base_url):
video_ids = []
for pagenum in itertools.count(1):
webpage = self._download_webpage(
self._page_url(base_url, pagenum) ,list_id,
- u'Downloading page %s' % pagenum)
+ 'Downloading page %s' % pagenum)
video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage))
if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
break
@@ -310,8 +317,8 @@ class VimeoChannelIE(InfoExtractor):
class VimeoUserIE(VimeoChannelIE):
- IE_NAME = u'vimeo:user'
- _VALID_URL = r'(?:https?://)?vimeo.\com/(?P<name>[^/]+)'
+ IE_NAME = 'vimeo:user'
+ _VALID_URL = r'(?:https?://)?vimeo.\com/(?P<name>[^/]+)(?:[#?]|$)'
_TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>'
@classmethod
@@ -327,7 +334,7 @@ class VimeoUserIE(VimeoChannelIE):
class VimeoAlbumIE(VimeoChannelIE):
- IE_NAME = u'vimeo:album'
+ IE_NAME = 'vimeo:album'
_VALID_URL = r'(?:https?://)?vimeo.\com/album/(?P<id>\d+)'
_TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>'
@@ -336,12 +343,12 @@ class VimeoAlbumIE(VimeoChannelIE):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- album_id = mobj.group('id')
+ album_id = mobj.group('id')
return self._extract_videos(album_id, 'http://vimeo.com/album/%s' % album_id)
class VimeoGroupsIE(VimeoAlbumIE):
- IE_NAME = u'vimeo:group'
+ IE_NAME = 'vimeo:group'
_VALID_URL = r'(?:https?://)?vimeo.\com/groups/(?P<name>[^/]+)'
def _extract_list_title(self, webpage):
@@ -351,3 +358,24 @@ class VimeoGroupsIE(VimeoAlbumIE):
mobj = re.match(self._VALID_URL, url)
name = mobj.group('name')
return self._extract_videos(name, 'http://vimeo.com/groups/%s' % name)
+
+
+class VimeoReviewIE(InfoExtractor):
+ IE_NAME = 'vimeo:review'
+ IE_DESC = 'Review pages on vimeo'
+ _VALID_URL = r'(?:https?://)?vimeo.\com/[^/]+/review/(?P<id>[^/]+)'
+ _TEST = {
+ 'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d',
+ 'file': '75524534.mp4',
+ 'md5': 'c507a72f780cacc12b2248bb4006d253',
+ 'info_dict': {
+ 'title': "DICK HARDWICK 'Comedian'",
+ 'uploader': 'Richard Hardwick',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ player_url = 'https://player.vimeo.com/player/' + video_id
+ return self.url_result(player_url, 'Vimeo', video_id)
diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py
index e1748c261..bc31c2e64 100644
--- a/youtube_dl/extractor/wistia.py
+++ b/youtube_dl/extractor/wistia.py
@@ -44,8 +44,10 @@ class WistiaIE(InfoExtractor):
'height': a['height'],
'filesize': a['size'],
'ext': a['ext'],
+ 'preference': 1 if atype == 'original' else None,
})
- formats.sort(key=lambda a: a['filesize'])
+
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index 5c9c361b9..e17a39782 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -6,8 +6,8 @@ from .common import InfoExtractor, SearchInfoExtractor
from ..utils import (
compat_urllib_parse,
compat_urlparse,
- determine_ext,
clean_html,
+ int_or_none,
)
@@ -68,9 +68,9 @@ class YahooIE(InfoExtractor):
formats = []
for s in info['streams']:
format_info = {
- 'width': s.get('width'),
- 'height': s.get('height'),
- 'bitrate': s.get('bitrate'),
+ 'width': int_or_none(s.get('width')),
+ 'height': int_or_none(s.get('height')),
+ 'tbr': int_or_none(s.get('bitrate')),
}
host = s['host']
@@ -84,10 +84,10 @@ class YahooIE(InfoExtractor):
else:
format_url = compat_urlparse.urljoin(host, path)
format_info['url'] = format_url
- format_info['ext'] = determine_ext(format_url)
formats.append(format_info)
- formats = sorted(formats, key=lambda f:(f['height'], f['width']))
+
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py
index bd0f2cae0..77ad423c4 100644
--- a/youtube_dl/extractor/youporn.py
+++ b/youtube_dl/extractor/youporn.py
@@ -1,5 +1,4 @@
import json
-import os
import re
import sys
@@ -16,6 +15,7 @@ from ..aes import (
aes_decrypt_text
)
+
class YouPornIE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+))'
_TEST = {
@@ -23,9 +23,9 @@ class YouPornIE(InfoExtractor):
u'file': u'505835.mp4',
u'md5': u'71ec5fcfddacf80f495efa8b6a8d9a89',
u'info_dict': {
- u"upload_date": u"20101221",
- u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?",
- u"uploader": u"Ask Dan And Jennifer",
+ u"upload_date": u"20101221",
+ u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?",
+ u"uploader": u"Ask Dan And Jennifer",
u"title": u"Sex Ed: Is It Safe To Masturbate Daily?",
u"age_limit": 18,
}
@@ -71,38 +71,36 @@ class YouPornIE(InfoExtractor):
link = aes_decrypt_text(encrypted_link, video_title, 32).decode('utf-8')
links.append(link)
- if not links:
- raise ExtractorError(u'ERROR: no known formats available for video')
-
formats = []
for link in links:
-
# A link looks like this:
# http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
# A path looks like this:
# /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
video_url = unescapeHTML(link)
path = compat_urllib_parse_urlparse(video_url).path
- extension = os.path.splitext(path)[1][1:]
- format = path.split('/')[4].split('_')[:2]
+ format_parts = path.split('/')[4].split('_')[:2]
- # size = format[0]
- # bitrate = format[1]
- format = "-".join(format)
- # title = u'%s-%s-%s' % (video_title, size, bitrate)
+ dn = compat_urllib_parse_urlparse(video_url).netloc.partition('.')[0]
+
+ resolution = format_parts[0]
+ height = int(resolution[:-len('p')])
+ bitrate = int(format_parts[1][:-len('k')])
+ format = u'-'.join(format_parts) + u'-' + dn
formats.append({
'url': video_url,
- 'ext': extension,
'format': format,
'format_id': format,
+ 'height': height,
+ 'tbr': bitrate,
+ 'resolution': resolution,
})
- # Sort and remove doubles
- formats.sort(key=lambda format: list(map(lambda s: s.zfill(6), format['format'].split('-'))))
- for i in range(len(formats)-1,0,-1):
- if formats[i]['format_id'] == formats[i-1]['format_id']:
- del formats[i]
+ self._sort_formats(formats)
+
+ if not formats:
+ raise ExtractorError(u'ERROR: no known formats available for video')
return {
'id': video_id,
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index a68576547..9424d5e26 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -150,168 +150,69 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
(?(1).+)? # if we found the ID, everything can follow
$"""
_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
- # Listed in order of quality
- _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
- # Apple HTTP Live Streaming
- '96', '95', '94', '93', '92', '132', '151',
- # 3D
- '85', '84', '102', '83', '101', '82', '100',
- # Dash video
- '138', '137', '248', '136', '247', '135', '246',
- '245', '244', '134', '243', '133', '242', '160',
- # Dash audio
- '141', '172', '140', '171', '139',
- ]
- _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
- # Apple HTTP Live Streaming
- '96', '95', '94', '93', '92', '132', '151',
- # 3D
- '85', '102', '84', '101', '83', '100', '82',
- # Dash video
- '138', '248', '137', '247', '136', '246', '245',
- '244', '135', '243', '134', '242', '133', '160',
- # Dash audio
- '172', '141', '171', '140', '139',
- ]
- _video_formats_map = {
- 'flv': ['35', '34', '6', '5'],
- '3gp': ['36', '17', '13'],
- 'mp4': ['38', '37', '22', '18'],
- 'webm': ['46', '45', '44', '43'],
- }
- _video_extensions = {
- '13': '3gp',
- '17': '3gp',
- '18': 'mp4',
- '22': 'mp4',
- '36': '3gp',
- '37': 'mp4',
- '38': 'mp4',
- '43': 'webm',
- '44': 'webm',
- '45': 'webm',
- '46': 'webm',
+ _formats = {
+ '5': {'ext': 'flv', 'width': 400, 'height': 240},
+ '6': {'ext': 'flv', 'width': 450, 'height': 270},
+ '13': {'ext': '3gp'},
+ '17': {'ext': '3gp', 'width': 176, 'height': 144},
+ '18': {'ext': 'mp4', 'width': 640, 'height': 360},
+ '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
+ '34': {'ext': 'flv', 'width': 640, 'height': 360},
+ '35': {'ext': 'flv', 'width': 854, 'height': 480},
+ '36': {'ext': '3gp', 'width': 320, 'height': 240},
+ '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
+ '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
+ '43': {'ext': 'webm', 'width': 640, 'height': 360},
+ '44': {'ext': 'webm', 'width': 854, 'height': 480},
+ '45': {'ext': 'webm', 'width': 1280, 'height': 720},
+ '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
+
# 3d videos
- '82': 'mp4',
- '83': 'mp4',
- '84': 'mp4',
- '85': 'mp4',
- '100': 'webm',
- '101': 'webm',
- '102': 'webm',
+ '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
+ '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
+ '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
+ '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
+ '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
+ '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
+ '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
# Apple HTTP Live Streaming
- '92': 'mp4',
- '93': 'mp4',
- '94': 'mp4',
- '95': 'mp4',
- '96': 'mp4',
- '132': 'mp4',
- '151': 'mp4',
-
- # Dash mp4
- '133': 'mp4',
- '134': 'mp4',
- '135': 'mp4',
- '136': 'mp4',
- '137': 'mp4',
- '138': 'mp4',
- '160': 'mp4',
+ '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
+ '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
+ '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
+ '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
+ '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
+ '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
+ '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},
+
+ # DASH mp4 video
+ '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
+ '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
+ '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
+ '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
+ '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
+ '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
+ '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
+ '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
# Dash mp4 audio
- '139': 'm4a',
- '140': 'm4a',
- '141': 'm4a',
+ '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
+ '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
+ '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
# Dash webm
- '171': 'webm',
- '172': 'webm',
- '242': 'webm',
- '243': 'webm',
- '244': 'webm',
- '245': 'webm',
- '246': 'webm',
- '247': 'webm',
- '248': 'webm',
- }
- _video_dimensions = {
- '5': '400x240',
- '6': '???',
- '13': '???',
- '17': '176x144',
- '18': '640x360',
- '22': '1280x720',
- '34': '640x360',
- '35': '854x480',
- '36': '320x240',
- '37': '1920x1080',
- '38': '4096x3072',
- '43': '640x360',
- '44': '854x480',
- '45': '1280x720',
- '46': '1920x1080',
- '82': '360p',
- '83': '480p',
- '84': '720p',
- '85': '1080p',
- '92': '240p',
- '93': '360p',
- '94': '480p',
- '95': '720p',
- '96': '1080p',
- '100': '360p',
- '101': '480p',
- '102': '720p',
- '132': '240p',
- '151': '72p',
- '133': '240p',
- '134': '360p',
- '135': '480p',
- '136': '720p',
- '137': '1080p',
- '138': '>1080p',
- '139': '48k',
- '140': '128k',
- '141': '256k',
- '160': '192p',
- '171': '128k',
- '172': '256k',
- '242': '240p',
- '243': '360p',
- '244': '480p',
- '245': '480p',
- '246': '480p',
- '247': '720p',
- '248': '1080p',
- }
- _special_itags = {
- '82': '3D',
- '83': '3D',
- '84': '3D',
- '85': '3D',
- '100': '3D',
- '101': '3D',
- '102': '3D',
- '133': 'DASH Video',
- '134': 'DASH Video',
- '135': 'DASH Video',
- '136': 'DASH Video',
- '137': 'DASH Video',
- '138': 'DASH Video',
- '139': 'DASH Audio',
- '140': 'DASH Audio',
- '141': 'DASH Audio',
- '160': 'DASH Video',
- '171': 'DASH Audio',
- '172': 'DASH Audio',
- '242': 'DASH Video',
- '243': 'DASH Video',
- '244': 'DASH Video',
- '245': 'DASH Video',
- '246': 'DASH Video',
- '247': 'DASH Video',
- '248': 'DASH Video',
+ '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
+ '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
+ '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
+ '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
+ '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
+ '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
+ '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},
+
+ # Dash webm audio
+ '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
+ '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},
}
IE_NAME = u'youtube'
@@ -1153,13 +1054,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
self._downloader.report_warning(err_msg)
return {}
- def _print_formats(self, formats):
- print('Available formats:')
- for x in formats:
- print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
- self._video_dimensions.get(x, '???'),
- ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
-
def _extract_id(self, url):
mobj = re.match(self._VALID_URL, url, re.VERBOSE)
if mobj is None:
@@ -1172,48 +1066,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
Transform a dictionary in the format {itag:url} to a list of (itag, url)
with the requested formats.
"""
- req_format = self._downloader.params.get('format', None)
- format_limit = self._downloader.params.get('format_limit', None)
- available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
- if format_limit is not None and format_limit in available_formats:
- format_list = available_formats[available_formats.index(format_limit):]
- else:
- format_list = available_formats
- existing_formats = [x for x in format_list if x in url_map]
+ existing_formats = [x for x in self._formats if x in url_map]
if len(existing_formats) == 0:
raise ExtractorError(u'no known formats available for video')
- if self._downloader.params.get('listformats', None):
- self._print_formats(existing_formats)
- return
- if req_format is None or req_format == 'best':
- video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
- elif req_format == 'worst':
- video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
- elif req_format in ('-1', 'all'):
- video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
- else:
- # Specific formats. We pick the first in a slash-delimeted sequence.
- # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
- # available in the specified format. For example,
- # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
- # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
- # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
- req_formats = req_format.split('/')
- video_url_list = None
- for rf in req_formats:
- if rf in url_map:
- video_url_list = [(rf, url_map[rf])]
- break
- if rf in self._video_formats_map:
- for srf in self._video_formats_map[rf]:
- if srf in url_map:
- video_url_list = [(srf, url_map[srf])]
- break
- else:
- continue
- break
- if video_url_list is None:
- raise ExtractorError(u'requested format not available')
+ video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
+ video_url_list.reverse() # order worst to best
return video_url_list
def _extract_from_m3u8(self, manifest_url, video_id):
@@ -1462,50 +1319,43 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
url += '&ratebypass=yes'
url_map[url_data['itag'][0]] = url
video_url_list = self._get_video_url_list(url_map)
- if not video_url_list:
- return
elif video_info.get('hlsvp'):
manifest_url = video_info['hlsvp'][0]
url_map = self._extract_from_m3u8(manifest_url, video_id)
video_url_list = self._get_video_url_list(url_map)
- if not video_url_list:
- return
-
else:
raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
- results = []
+ formats = []
for itag, video_real_url in video_url_list:
- # Extension
- video_extension = self._video_extensions.get(itag, 'flv')
-
- video_format = '{0} - {1}{2}'.format(itag if itag else video_extension,
- self._video_dimensions.get(itag, '???'),
- ' ('+self._special_itags[itag]+')' if itag in self._special_itags else '')
-
- results.append({
- 'id': video_id,
- 'url': video_real_url,
- 'uploader': video_uploader,
- 'uploader_id': video_uploader_id,
- 'upload_date': upload_date,
- 'title': video_title,
- 'ext': video_extension,
- 'format': video_format,
+ dct = {
'format_id': itag,
- 'thumbnail': video_thumbnail,
- 'description': video_description,
- 'player_url': player_url,
- 'subtitles': video_subtitles,
- 'duration': video_duration,
- 'age_limit': 18 if age_gate else 0,
- 'annotations': video_annotations,
- 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
- 'view_count': view_count,
- 'like_count': like_count,
- 'dislike_count': dislike_count,
- })
- return results
+ 'url': video_real_url,
+ 'player_url': player_url,
+ }
+ dct.update(self._formats[itag])
+ formats.append(dct)
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'uploader': video_uploader,
+ 'uploader_id': video_uploader_id,
+ 'upload_date': upload_date,
+ 'title': video_title,
+ 'thumbnail': video_thumbnail,
+ 'description': video_description,
+ 'subtitles': video_subtitles,
+ 'duration': video_duration,
+ 'age_limit': 18 if age_gate else 0,
+ 'annotations': video_annotations,
+ 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
+ 'view_count': view_count,
+ 'like_count': like_count,
+ 'dislike_count': dislike_count,
+ 'formats': formats,
+ }
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
IE_DESC = u'YouTube.com playlists'
diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py
index 35ece354a..829f002cf 100644
--- a/youtube_dl/extractor/zdf.py
+++ b/youtube_dl/extractor/zdf.py
@@ -1,10 +1,10 @@
# coding: utf-8
-import operator
import re
from .common import InfoExtractor
from ..utils import (
+ int_or_none,
unified_strdate,
)
@@ -67,29 +67,13 @@ class ZDFIE(InfoExtractor):
''', format_id)
ext = format_m.group('container')
- is_supported = ext != 'f4f'
-
- PROTO_ORDER = ['http', 'rtmp', 'rtsp']
- try:
- proto_pref = -PROTO_ORDER.index(format_m.group('proto'))
- except ValueError:
- proto_pref = -999
+ proto = format_m.group('proto').lower()
quality = fnode.find('./quality').text
- QUALITY_ORDER = ['veryhigh', '300', 'high', 'med', 'low']
- try:
- quality_pref = -QUALITY_ORDER.index(quality)
- except ValueError:
- quality_pref = -999
-
abr = int(fnode.find('./audioBitrate').text) // 1000
vbr = int(fnode.find('./videoBitrate').text) // 1000
- pref = (is_available, is_supported,
- proto_pref, quality_pref, vbr, abr)
format_note = u''
- if not is_supported:
- format_note += u'(unsupported)'
if not format_note:
format_note = None
@@ -101,18 +85,20 @@ class ZDFIE(InfoExtractor):
'vcodec': format_m.group('vcodec'),
'abr': abr,
'vbr': vbr,
- 'width': int(fnode.find('./width').text),
- 'height': int(fnode.find('./height').text),
- 'filesize': int(fnode.find('./filesize').text),
+ 'width': int_or_none(fnode.find('./width').text),
+ 'height': int_or_none(fnode.find('./height').text),
+ 'filesize': int_or_none(fnode.find('./filesize').text),
'format_note': format_note,
- '_pref': pref,
+ 'protocol': proto,
'_available': is_available,
}
format_nodes = doc.findall('.//formitaeten/formitaet')
- formats = sorted(filter(lambda f: f['_available'],
- map(xml_to_format, format_nodes)),
- key=operator.itemgetter('_pref'))
+ formats = list(filter(
+ lambda f: f['_available'],
+ map(xml_to_format, format_nodes)))
+
+ self._sort_formats(formats)
return {
'id': video_id,