about | summary | refs | log | tree | commit | diff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- youtube_dl/extractor/__init__.py 9
-rw-r--r-- youtube_dl/extractor/aol.py 28
-rw-r--r-- youtube_dl/extractor/arte.py 28
-rw-r--r-- youtube_dl/extractor/comedycentral.py 2
-rw-r--r-- youtube_dl/extractor/cspan.py 19
-rw-r--r-- youtube_dl/extractor/daum.py 28
-rw-r--r-- youtube_dl/extractor/engadget.py 43
-rw-r--r-- youtube_dl/extractor/fivemin.py 56
-rw-r--r-- youtube_dl/extractor/generic.py 71
-rw-r--r-- youtube_dl/extractor/iprima.py 2
-rw-r--r-- youtube_dl/extractor/kontrtube.py 24
-rw-r--r-- youtube_dl/extractor/ninegag.py 4
-rw-r--r-- youtube_dl/extractor/ooyala.py 47
-rw-r--r-- youtube_dl/extractor/parliamentliveuk.py 57
-rw-r--r-- youtube_dl/extractor/pbs.py 11
-rw-r--r-- youtube_dl/extractor/rutv.py (renamed from youtube_dl/extractor/vgtrk.py) 167
-rw-r--r-- youtube_dl/extractor/ted.py 13
-rw-r--r-- youtube_dl/extractor/udemy.py 5
-rw-r--r-- youtube_dl/extractor/vesti.py 121
-rw-r--r-- youtube_dl/extractor/vevo.py 7
-rw-r--r-- youtube_dl/extractor/videolecturesnet.py 67
-rw-r--r-- youtube_dl/extractor/viki.py 45
-rw-r--r-- youtube_dl/extractor/xtube.py 29
-rw-r--r-- youtube_dl/extractor/youporn.py 32
-rw-r--r-- youtube_dl/extractor/youtube.py 14
25 files changed, 668 insertions, 261 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 313414e7d..d828c6932 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -2,6 +2,7 @@ from .academicearth import AcademicEarthCourseIE
from .addanime import AddAnimeIE
from .aftonbladet import AftonbladetIE
from .anitube import AnitubeIE
+from .aol import AolIE
from .aparat import AparatIE
from .appletrailers import AppleTrailersIE
from .archiveorg import ArchiveOrgIE
@@ -10,6 +11,7 @@ from .arte import (
ArteTvIE,
ArteTVPlus7IE,
ArteTVCreativeIE,
+ ArteTVConcertIE,
ArteTVFutureIE,
ArteTVDDCIE,
)
@@ -63,6 +65,7 @@ from .ehow import EHowIE
from .eighttracks import EightTracksIE
from .eitb import EitbIE
from .elpais import ElPaisIE
+from .engadget import EngadgetIE
from .escapist import EscapistIE
from .everyonesmixtape import EveryonesMixtapeIE
from .exfm import ExfmIE
@@ -71,6 +74,7 @@ from .facebook import FacebookIE
from .faz import FazIE
from .firstpost import FirstpostIE
from .firsttv import FirstTVIE
+from .fivemin import FiveMinIE
from .fktv import (
FKTVIE,
FKTVPosteckeIE,
@@ -173,6 +177,7 @@ from .nowness import NownessIE
from .nowvideo import NowVideoIE
from .ooyala import OoyalaIE
from .orf import ORFIE
+from .parliamentliveuk import ParliamentLiveUKIE
from .pbs import PBSIE
from .photobucket import PhotobucketIE
from .playvid import PlayvidIE
@@ -196,6 +201,7 @@ from .rutube import (
RutubeMovieIE,
RutubePersonIE,
)
+from .rutv import RUTVIE
from .savefrom import SaveFromIE
from .servingsys import ServingSysIE
from .sina import SinaIE
@@ -251,12 +257,13 @@ from .ustream import UstreamIE, UstreamChannelIE
from .vbox7 import Vbox7IE
from .veehd import VeeHDIE
from .veoh import VeohIE
+from .vesti import VestiIE
from .vevo import VevoIE
-from .vgtrk import VGTRKIE
from .vice import ViceIE
from .viddler import ViddlerIE
from .videobam import VideoBamIE
from .videodetective import VideoDetectiveIE
+from .videolecturesnet import VideoLecturesNetIE
from .videofyme import VideofyMeIE
from .videopremium import VideoPremiumIE
from .vimeo import (
diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py
new file mode 100644
index 000000000..abc668912
--- /dev/null
+++ b/youtube_dl/extractor/aol.py
@@ -0,0 +1,28 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .fivemin import FiveMinIE
+
+
+class AolIE(InfoExtractor):
+    """Extractor for video pages on on.aol.com.
+
+    The page itself is never downloaded here: the numeric id that ends the
+    URL path is passed straight to FiveMinIE._build_result.
+    """
+
+    IE_NAME = 'on.aol.com'
+    _VALID_URL = r'http://on\.aol\.com/video/.*-(?P<id>\d+)($|\?)'
+
+    _TEST = {
+        'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img',
+        'md5': '18ef68f48740e86ae94b98da815eec42',
+        'info_dict': {
+            'id': '518167793',
+            'ext': 'mp4',
+            'title': 'U.S. Official Warns Of \'Largest Ever\' IRS Phone Scam',
+        },
+        'add_ie': ['FiveMin'],
+    }
+
+    def _real_extract(self, url):
+        """Return what FiveMinIE._build_result yields for the id in *url*."""
+        fivemin_id = re.match(self._VALID_URL, url).group('id')
+        self.to_screen('Downloading 5min.com video %s' % fivemin_id)
+        return FiveMinIE._build_result(fivemin_id)
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index d194f2564..548442166 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -131,7 +131,7 @@ class ArteTvIE(InfoExtractor):
class ArteTVPlus7IE(InfoExtractor):
IE_NAME = 'arte.tv:+7'
- _VALID_URL = r'https?://www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
+ _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
@classmethod
def _extract_url_info(cls, url):
@@ -202,6 +202,8 @@ class ArteTVPlus7IE(InfoExtractor):
re.match(r'VO-ST(F|A)', f.get('versionCode', '')) is None,
# The version with sourds/mal subtitles has also lower relevance
re.match(r'VO?(F|A)-STM\1', f.get('versionCode', '')) is None,
+ # Prefer http downloads over m3u8
+ 0 if f['url'].endswith('m3u8') else 1,
)
formats = sorted(formats, key=sort_key)
def _format(format_info):
@@ -242,8 +244,9 @@ class ArteTVCreativeIE(ArteTVPlus7IE):
_TEST = {
'url': 'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design',
- 'file': '050489-002.mp4',
'info_dict': {
+ 'id': '050489-002',
+ 'ext': 'mp4',
'title': 'Agentur Amateur / Agence Amateur #2 : Corporate Design',
},
}
@@ -255,8 +258,9 @@ class ArteTVFutureIE(ArteTVPlus7IE):
_TEST = {
'url': 'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081',
- 'file': '050940-003.mp4',
'info_dict': {
+ 'id': '050940-003',
+ 'ext': 'mp4',
'title': 'Les champignons au secours de la planète',
},
}
@@ -270,7 +274,7 @@ class ArteTVFutureIE(ArteTVPlus7IE):
class ArteTVDDCIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:ddc'
- _VALID_URL = r'http?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>.+)'
+ _VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>.+)'
def _real_extract(self, url):
video_id, lang = self._extract_url_info(url)
@@ -284,3 +288,19 @@ class ArteTVDDCIE(ArteTVPlus7IE):
javascriptPlayerGenerator = self._download_webpage(script_url, video_id, 'Download javascript player generator')
json_url = self._search_regex(r"json_url=(.*)&rendering_place.*", javascriptPlayerGenerator, 'json url')
return self._extract_from_json_url(json_url, video_id, lang)
+
+
+# Extractor for concert.arte.tv pages. The class only declares its name, URL
+# pattern and test fixture; it defines no methods of its own, so all
+# behaviour comes from ArteTVPlus7IE.
+class ArteTVConcertIE(ArteTVPlus7IE):
+ IE_NAME = 'arte.tv:concert'
+ # Captures the page language (de|fr) and everything after it as the id.
+ _VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>de|fr)/(?P<id>.+)'
+
+ # Download test fixture: expected metadata for one German concert page.
+ _TEST = {
+ 'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde',
+ 'md5': '9ea035b7bd69696b67aa2ccaaa218161',
+ 'info_dict': {
+ 'id': '186',
+ 'ext': 'mp4',
+ 'title': 'The Notwist im Pariser Konzertclub "Divan du Monde"',
+ 'upload_date': '20140128',
+ },
+ }
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py
index ed3986f31..d50fcdbdb 100644
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -14,7 +14,7 @@ from ..utils import (
class ComedyCentralIE(MTVServicesInfoExtractor):
- _VALID_URL = r'''(?x)https?://(?:www\.)?comedycentral\.com/
+ _VALID_URL = r'''(?x)https?://(?:www\.)?(comedycentral|cc)\.com/
(video-clips|episodes|cc-studios|video-collections)
/(?P<title>.*)'''
_FEED_URL = 'http://comedycentral.com/feeds/mrss/'
diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py
index d65046f58..2a8eda9ef 100644
--- a/youtube_dl/extractor/cspan.py
+++ b/youtube_dl/extractor/cspan.py
@@ -10,9 +10,9 @@ from ..utils import (
class CSpanIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P<id>\d+)'
+ _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P<id>[0-9a-f]+)'
IE_DESC = 'C-SPAN'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.c-span.org/video/?313572-1/HolderonV',
'md5': '8e44ce11f0f725527daccc453f553eb0',
'info_dict': {
@@ -22,13 +22,24 @@ class CSpanIE(InfoExtractor):
'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in Shelby County v. Holder in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.',
},
'skip': 'Regularly fails on travis, for unknown reasons',
- }
+ }, {
+ 'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models',
+ # For whatever reason, the served video alternates between
+ # two different ones
+ #'md5': 'dbb0f047376d457f2ab8b3929cbb2d0c',
+ 'info_dict': {
+ 'id': '340723',
+ 'ext': 'mp4',
+ 'title': 'International Health Care Models',
+ 'description': 'md5:7a985a2d595dba00af3d9c9f0783c967',
+ }
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
page_id = mobj.group('id')
webpage = self._download_webpage(url, page_id)
- video_id = self._search_regex(r'data-progid=\'(\d+)\'>', webpage, 'video id')
+ video_id = self._search_regex(r'progid=\'?([0-9]+)\'?>', webpage, 'video id')
description = self._html_search_regex(
[
diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py
index 4876ecb48..6033cd94a 100644
--- a/youtube_dl/extractor/daum.py
+++ b/youtube_dl/extractor/daum.py
@@ -1,25 +1,28 @@
# encoding: utf-8
+
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
- determine_ext,
)
class DaumIE(InfoExtractor):
_VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/.*?clipid=(?P<id>\d+)'
- IE_NAME = u'daum.net'
+ IE_NAME = 'daum.net'
_TEST = {
- u'url': u'http://tvpot.daum.net/clip/ClipView.do?clipid=52554690',
- u'file': u'52554690.mp4',
- u'info_dict': {
- u'title': u'DOTA 2GETHER 시즌2 6회 - 2부',
- u'description': u'DOTA 2GETHER 시즌2 6회 - 2부',
- u'upload_date': u'20130831',
- u'duration': 3868,
+ 'url': 'http://tvpot.daum.net/clip/ClipView.do?clipid=52554690',
+ 'info_dict': {
+ 'id': '52554690',
+ 'ext': 'mp4',
+ 'title': 'DOTA 2GETHER 시즌2 6회 - 2부',
+ 'description': 'DOTA 2GETHER 시즌2 6회 - 2부',
+ 'upload_date': '20130831',
+ 'duration': 3868,
},
}
@@ -30,14 +33,14 @@ class DaumIE(InfoExtractor):
webpage = self._download_webpage(canonical_url, video_id)
full_id = self._search_regex(
r'<iframe src="http://videofarm.daum.net/controller/video/viewer/Video.html\?.*?vid=(.+?)[&"]',
- webpage, u'full id')
+ webpage, 'full id')
query = compat_urllib_parse.urlencode({'vid': full_id})
info = self._download_xml(
'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id,
- u'Downloading video info')
+ 'Downloading video info')
urls = self._download_xml(
'http://videofarm.daum.net/controller/api/open/v1_2/MovieData.apixml?' + query,
- video_id, u'Downloading video formats info')
+ video_id, 'Downloading video formats info')
self.to_screen(u'%s: Getting video urls' % video_id)
formats = []
@@ -53,7 +56,6 @@ class DaumIE(InfoExtractor):
format_url = url_doc.find('result/url').text
formats.append({
'url': format_url,
- 'ext': determine_ext(format_url),
'format_id': profile,
})
diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py
new file mode 100644
index 000000000..92ada81d2
--- /dev/null
+++ b/youtube_dl/extractor/engadget.py
@@ -0,0 +1,43 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .fivemin import FiveMinIE
+from ..utils import (
+ url_basename,
+)
+
+
+class EngadgetIE(InfoExtractor):
+    """Extractor for www.engadget.com video pages and articles.
+
+    A ``/video/5min/<id>`` URL is forwarded directly to FiveMinIE. Any other
+    matching page is downloaded and every ``playList=<id>`` found in an
+    iframe tag becomes one entry of a playlist result.
+    """
+
+    # The dots of the host name are escaped so that they only match a literal
+    # '.', not an arbitrary character.
+    _VALID_URL = r'''(?x)https?://www\.engadget\.com/
+        (?:video/5min/(?P<id>\d+)|
+            [\d/]+/.*?)
+        '''
+
+    _TEST = {
+        'url': 'http://www.engadget.com/video/5min/518153925/',
+        'md5': 'c6820d4828a5064447a4d9fc73f312c9',
+        'info_dict': {
+            'id': '518153925',
+            'ext': 'mp4',
+            'title': 'Samsung Galaxy Tab Pro 8.4 Review',
+        },
+        'add_ie': ['FiveMin'],
+    }
+
+    def _real_extract(self, url):
+        """Return a single 5min result or a playlist of 5min results.
+
+        The ``id`` group is only set for ``/video/5min/<id>`` URLs; for
+        article URLs it is None and the page has to be scanned instead.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        if video_id is not None:
+            return FiveMinIE._build_result(video_id)
+
+        # Article page: the last URL path component doubles as the title and
+        # as the id shown while downloading.
+        title = url_basename(url)
+        webpage = self._download_webpage(url, title)
+        playlist_ids = re.findall(r'<iframe[^>]+?playList=(\d+)', webpage)
+        return {
+            '_type': 'playlist',
+            'title': title,
+            'entries': [
+                FiveMinIE._build_result(entry_id) for entry_id in playlist_ids],
+        }
diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py
new file mode 100644
index 000000000..215cc831e
--- /dev/null
+++ b/youtube_dl/extractor/fivemin.py
@@ -0,0 +1,56 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_str,
+)
+
+
+class FiveMinIE(InfoExtractor):
+    """Extractor for 5min.com videos.
+
+    Accepts PlayerSeed.js URLs carrying a ``playList`` parameter as well as
+    the internal ``5min:<id>`` form produced by _build_result.
+    """
+
+    IE_NAME = '5min'
+    _VALID_URL = r'''(?x)
+        (?:https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(.*?&)?playList=|
+            5min:)
+        (?P<id>\d+)
+        '''
+
+    _TEST = {
+        # From http://www.engadget.com/2013/11/15/ipad-mini-retina-display-review/
+        'url': 'http://pshared.5min.com/Scripts/PlayerSeed.js?sid=281&width=560&height=345&playList=518013791',
+        'md5': '4f7b0b79bf1a470e5004f7112385941d',
+        'info_dict': {
+            'id': '518013791',
+            'ext': 'mp4',
+            'title': 'iPad Mini with Retina Display Review',
+        },
+    }
+
+    @classmethod
+    def _build_result(cls, video_id):
+        """Build a url_result for ``5min:<video_id>`` tagged with this extractor's key."""
+        internal_url = '5min:%s' % video_id
+        return cls.url_result(internal_url, cls.ie_key())
+
+    def _real_extract(self, url):
+        """Fetch the video metadata and list one format per offered rendition."""
+        video_id = re.match(self._VALID_URL, url).group('id')
+        info = self._download_json(
+            'https://syn.5min.com/handlers/SenseHandler.ashx?func=GetResults&'
+            'playlist=%s&url=https' % video_id,
+            video_id)['binding'][0]
+
+        # The media URLs use a second id: the video id without its last two
+        # digits, plus one.
+        second_id = compat_str(int(video_id[:-2]) + 1)
+        known_qualities = ((1, 320), (2, 480), (4, 720), (8, 1080))
+        # Only qualities whose ID appears among the renditions are emitted.
+        formats = [
+            {
+                'format_id': compat_str(quality),
+                'url': 'http://avideos.5min.com/%s/%s/%s_%s.mp4' % (second_id[-3:], second_id, video_id, quality),
+                'height': height,
+            }
+            for quality, height in known_qualities
+            if any(r['ID'] == quality for r in info['Renditions'])
+        ]
+
+        return {
+            'id': video_id,
+            'title': info['Title'],
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 0d02f836e..238913256 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -24,6 +24,7 @@ from ..utils import (
)
from .brightcove import BrightcoveIE
from .ooyala import OoyalaIE
+from .rutv import RUTVIE
class GenericIE(InfoExtractor):
@@ -101,6 +102,20 @@ class GenericIE(InfoExtractor):
'title': '2cc213299525360.mov', # that's what we get
},
},
+ # second style of embedded ooyala videos
+ {
+ 'url': 'http://www.smh.com.au/tv/business/show/financial-review-sunday/behind-the-scenes-financial-review-sunday--4350201.html',
+ 'info_dict': {
+ 'id': '13djJjYjptA1XpPx8r9kuzPyj3UZH0Uk',
+ 'ext': 'mp4',
+ 'title': 'Behind-the-scenes: Financial Review Sunday ',
+ 'description': 'Step inside Channel Nine studios for an exclusive tour of its upcoming financial business show.',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ },
# google redirect
{
'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
@@ -143,6 +158,32 @@ class GenericIE(InfoExtractor):
'ext': 'mp4',
'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
+ },
+ },
+ # RUTV embed
+ {
+ 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
+ 'info_dict': {
+ 'id': '776940',
+ 'ext': 'mp4',
+ 'title': 'Охотское море стало целиком российским',
+ 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ },
+ # Embedded TED video
+ {
+ 'url': 'http://en.support.wordpress.com/videos/ted-talks/',
+ 'md5': 'deeeabcc1085eb2ba205474e7235a3d5',
+ 'info_dict': {
+ 'id': '981',
+ 'ext': 'mp4',
+ 'title': 'My web playroom',
+ 'uploader': 'Ze Frank',
+ 'description': 'md5:ddb2a40ecd6b6a147e400e535874947b',
}
},
# nowvideo embed hidden behind percent encoding
@@ -155,7 +196,7 @@ class GenericIE(InfoExtractor):
'title': 'Macross Episode 001 Watch Macross Episode 001 onl',
'description': 'No description',
},
- }
+ },
]
def report_download_webpage(self, video_id):
@@ -181,9 +222,14 @@ class GenericIE(InfoExtractor):
newurl = newurl.replace(' ', '%20')
newheaders = dict((k,v) for k,v in req.headers.items()
if k.lower() not in ("content-length", "content-type"))
+ try:
+ # This function was deprecated in python 3.3 and removed in 3.4
+ origin_req_host = req.get_origin_req_host()
+ except AttributeError:
+ origin_req_host = req.origin_req_host
return HEADRequest(newurl,
headers=newheaders,
- origin_req_host=req.get_origin_req_host(),
+ origin_req_host=origin_req_host,
unverifiable=True)
else:
raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
@@ -340,9 +386,9 @@ class GenericIE(InfoExtractor):
# Look for embedded (iframe) Vimeo player
mobj = re.search(
- r'<iframe[^>]+?src="((?:https?:)?//player\.vimeo\.com/video/.+?)"', webpage)
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
if mobj:
- player_url = unescapeHTML(mobj.group(1))
+ player_url = unescapeHTML(mobj.group('url'))
surl = smuggle_url(player_url, {'Referer': url})
return self.url_result(surl, 'Vimeo')
@@ -408,9 +454,10 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'))
# Look for Ooyala videos
- mobj = re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=([^"&]+)', webpage)
+ mobj = (re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
+ re.search(r'OO.Player.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage))
if mobj is not None:
- return OoyalaIE._build_url_result(mobj.group(1))
+ return OoyalaIE._build_url_result(mobj.group('ec'))
# Look for Aparat videos
mobj = re.search(r'<iframe src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
@@ -467,6 +514,11 @@ class GenericIE(InfoExtractor):
return self.playlist_result(
urlrs, playlist_id=video_id, playlist_title=video_title)
+ # Look for embedded RUTV player
+ rutv_url = RUTVIE._extract_url(webpage)
+ if rutv_url:
+ return self.url_result(rutv_url, 'RUTV')
+
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None:
@@ -478,6 +530,13 @@ class GenericIE(InfoExtractor):
if mobj is None:
# Broaden the search a little bit: JWPlayer JS loader
mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)
+
+        # Look for embedded TED player. The match is kept in its own variable:
+        # reusing ``mobj`` would reset it to None whenever no TED iframe is
+        # present, discarding a match found by the JW Player searches above
+        # that the ``if mobj is None`` fallbacks below depend on.
+        ted_mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>http://embed\.ted\.com/.+?)\1', webpage)
+        if ted_mobj is not None:
+            return self.url_result(ted_mobj.group('url'), 'TED')
+
if mobj is None:
# Try to find twitter cards info
mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py
index 2a29e6072..d1defd363 100644
--- a/youtube_dl/extractor/iprima.py
+++ b/youtube_dl/extractor/iprima.py
@@ -48,7 +48,7 @@ class IPrimaIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
- if re.search(r'Nemáte oprávnění přistupovat na tuto stránku.\s*</div>', webpage):
+ if re.search(r'Nemáte oprávnění přistupovat na tuto stránku\.\s*</div>', webpage):
raise ExtractorError(
'%s said: You do not have permission to access this page' % self.IE_NAME, expected=True)
diff --git a/youtube_dl/extractor/kontrtube.py b/youtube_dl/extractor/kontrtube.py
index 1b45b67b0..5341ac773 100644
--- a/youtube_dl/extractor/kontrtube.py
+++ b/youtube_dl/extractor/kontrtube.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import int_or_none
class KontrTubeIE(InfoExtractor):
@@ -32,27 +33,26 @@ class KontrTubeIE(InfoExtractor):
video_url = self._html_search_regex(r"video_url: '(.+?)/?',", webpage, 'video URL')
thumbnail = self._html_search_regex(r"preview_url: '(.+?)/?',", webpage, 'video thumbnail', fatal=False)
- title = self._html_search_regex(r'<title>(.+?) - Труба зовёт - Интересный видеохостинг</title>', webpage,
- 'video title')
+ title = self._html_search_regex(
+ r'<title>(.+?) - Труба зовёт - Интересный видеохостинг</title>', webpage, 'video title')
description = self._html_search_meta('description', webpage, 'video description')
- mobj = re.search(r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>',
- webpage)
+ mobj = re.search(
+ r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>', webpage)
duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None
- view_count = self._html_search_regex(r'<div class="col_2">Просмотров: <span>(\d+)</span></div>', webpage,
- 'view count', fatal=False)
- view_count = int(view_count) if view_count is not None else None
+ view_count = self._html_search_regex(
+ r'<div class="col_2">Просмотров: <span>(\d+)</span></div>', webpage, 'view count', fatal=False)
comment_count = None
- comment_str = self._html_search_regex(r'Комментарии: <span>([^<]+)</span>', webpage, 'comment count',
- fatal=False)
+ comment_str = self._html_search_regex(
+ r'Комментарии: <span>([^<]+)</span>', webpage, 'comment count', fatal=False)
if comment_str.startswith('комментариев нет'):
comment_count = 0
else:
mobj = re.search(r'\d+ из (?P<total>\d+) комментариев', comment_str)
if mobj:
- comment_count = int(mobj.group('total'))
+ comment_count = mobj.group('total')
return {
'id': video_id,
@@ -61,6 +61,6 @@ class KontrTubeIE(InfoExtractor):
'title': title,
'description': description,
'duration': duration,
- 'view_count': view_count,
- 'comment_count': comment_count,
+ 'view_count': int_or_none(view_count),
+ 'comment_count': int_or_none(comment_count),
} \ No newline at end of file
diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py
index 1d7aa40ed..b8c892cce 100644
--- a/youtube_dl/extractor/ninegag.py
+++ b/youtube_dl/extractor/ninegag.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import json
import re
from .common import InfoExtractor
@@ -12,8 +11,9 @@ class NineGagIE(InfoExtractor):
_TEST = {
"url": "http://9gag.tv/v/1912",
- "file": "1912.mp4",
"info_dict": {
+ "id": "1912",
+ "ext": "mp4",
"description": "This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)",
"title": "\"People Are Awesome 2013\" Is Absolutely Awesome",
"view_count": int,
diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py
index 44312ba4e..e20327791 100644
--- a/youtube_dl/extractor/ooyala.py
+++ b/youtube_dl/extractor/ooyala.py
@@ -1,20 +1,23 @@
+from __future__ import unicode_literals
import re
import json
from .common import InfoExtractor
from ..utils import unescapeHTML
+
class OoyalaIE(InfoExtractor):
_VALID_URL = r'https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=(?P<id>.+?)(&|$)'
_TEST = {
# From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video
- u'url': u'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
- u'file': u'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8.mp4',
- u'md5': u'3f5cceb3a7bf461d6c29dc466cf8033c',
- u'info_dict': {
- u'title': u'Explaining Data Recovery from Hard Drives and SSDs',
- u'description': u'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.',
+ 'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
+ 'md5': '3f5cceb3a7bf461d6c29dc466cf8033c',
+ 'info_dict': {
+ 'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
+ 'ext': 'mp4',
+ 'title': 'Explaining Data Recovery from Hard Drives and SSDs',
+ 'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.',
},
}
@@ -28,13 +31,14 @@ class OoyalaIE(InfoExtractor):
ie=cls.ie_key())
def _extract_result(self, info, more_info):
- return {'id': info['embedCode'],
- 'ext': 'mp4',
- 'title': unescapeHTML(info['title']),
- 'url': info.get('ipad_url') or info['url'],
- 'description': unescapeHTML(more_info['description']),
- 'thumbnail': more_info['promo'],
- }
+ return {
+ 'id': info['embedCode'],
+ 'ext': 'mp4',
+ 'title': unescapeHTML(info['title']),
+ 'url': info.get('ipad_url') or info['url'],
+ 'description': unescapeHTML(more_info['description']),
+ 'thumbnail': more_info['promo'],
+ }
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -42,22 +46,23 @@ class OoyalaIE(InfoExtractor):
player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embedCode
player = self._download_webpage(player_url, embedCode)
mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="',
- player, u'mobile player url')
+ player, 'mobile player url')
mobile_player = self._download_webpage(mobile_url, embedCode)
videos_info = self._search_regex(
r'var streams=window.oo_testEnv\?\[\]:eval\("\((\[{.*?}\])\)"\);',
- mobile_player, u'info').replace('\\"','"')
- videos_more_info = self._search_regex(r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, u'more info').replace('\\"','"')
+ mobile_player, 'info').replace('\\"','"')
+ videos_more_info = self._search_regex(r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, 'more info').replace('\\"','"')
videos_info = json.loads(videos_info)
videos_more_info =json.loads(videos_more_info)
if videos_more_info.get('lineup'):
videos = [self._extract_result(info, more_info) for (info, more_info) in zip(videos_info, videos_more_info['lineup'])]
- return {'_type': 'playlist',
- 'id': embedCode,
- 'title': unescapeHTML(videos_more_info['title']),
- 'entries': videos,
- }
+ return {
+ '_type': 'playlist',
+ 'id': embedCode,
+ 'title': unescapeHTML(videos_more_info['title']),
+ 'entries': videos,
+ }
else:
return self._extract_result(videos_info[0], videos_more_info)
diff --git a/youtube_dl/extractor/parliamentliveuk.py b/youtube_dl/extractor/parliamentliveuk.py
new file mode 100644
index 000000000..02dca14c0
--- /dev/null
+++ b/youtube_dl/extractor/parliamentliveuk.py
@@ -0,0 +1,57 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ unified_strdate,
+)
+
+
+class ParliamentLiveUKIE(InfoExtractor):
+    """Extractor for meeting recordings on www.parliamentlive.tv.
+
+    The player page embeds an ASX document; the HREF attribute of its first
+    REF element is used as the media URL.
+    """
+
+    IE_NAME = 'parliamentlive.tv'
+    IE_DESC = 'UK parliament videos'
+    _VALID_URL = r'https?://www\.parliamentlive\.tv/Main/Player\.aspx\?(?:[^&]+&)*?meetingId=(?P<id>[0-9]+)'
+
+    _TEST = {
+        'url': 'http://www.parliamentlive.tv/Main/Player.aspx?meetingId=15121&player=windowsmedia',
+        'info_dict': {
+            'id': '15121',
+            'ext': 'asf',
+            'title': 'hoc home affairs committee, 18 mar 2014.pm',
+            'description': 'md5:033b3acdf83304cd43946b2d5e5798d1',
+        },
+        'params': {
+            'skip_download': True,  # Requires mplayer (mms)
+        }
+    }
+
+    def _real_extract(self, url):
+        """Extract media URL, title and description for one meeting page.
+
+        Raises ExtractorError when the ASX metadata contains no REF element
+        with an HREF attribute, instead of failing with a bare
+        AttributeError or KeyError.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+
+        asx_url = self._html_search_regex(
+            r'embed.*?src="([^"]+)" name="MediaPlayer"', webpage,
+            'metadata URL')
+        asx = self._download_xml(asx_url, video_id, 'Downloading ASX metadata')
+        # find() returns None when nothing matches, so check before
+        # dereferencing.
+        ref = asx.find('.//REF')
+        if ref is None or 'HREF' not in ref.attrib:
+            raise ExtractorError('Unable to find media URL in ASX metadata')
+        video_url = ref.attrib['HREF']
+
+        # The capture spans two adjacent quoted arguments of setClipDetails,
+        # including the '", "' between them; that separator is then replaced
+        # with ', ' to form the title.
+        title = self._search_regex(
+            r'''(?x)player\.setClipDetails\(
+                (?:(?:[0-9]+|"[^"]+"),\s*){2}
+                "([^"]+",\s*"[^"]+)"
+                ''',
+            webpage, 'title').replace('", "', ', ')
+        description = self._html_search_regex(
+            r'(?s)<span id="MainContentPlaceHolder_CaptionsBlock_WitnessInfo">(.*?)</span>',
+            webpage, 'description')
+
+        return {
+            'id': video_id,
+            'ext': 'asf',
+            'url': video_url,
+            'title': title,
+            'description': description,
+        }
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index e7e0042fb..64cded707 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -3,6 +3,9 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import (
+ US_RATINGS,
+)
class PBSIE(InfoExtractor):
@@ -13,7 +16,7 @@ class PBSIE(InfoExtractor):
# Article with embedded player
(?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+)/?(?:$|[?\#]) |
# Player
- video\.pbs\.org/partnerplayer/(?P<player_id>[^/]+)/
+ video\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/
)
'''
@@ -57,6 +60,11 @@ class PBSIE(InfoExtractor):
info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id
info = self._download_json(info_url, display_id)
+ rating_str = info.get('rating')
+ if rating_str is not None:
+ rating_str = rating_str.rpartition('-')[2]
+ age_limit = US_RATINGS.get(rating_str)
+
return {
'id': video_id,
'title': info['title'],
@@ -65,4 +73,5 @@ class PBSIE(InfoExtractor):
'description': info['program'].get('description'),
'thumbnail': info.get('image_url'),
'duration': info.get('duration'),
+ 'age_limit': age_limit,
}
diff --git a/youtube_dl/extractor/vgtrk.py b/youtube_dl/extractor/rutv.py
index 429b8bc72..5c38cbc02 100644
--- a/youtube_dl/extractor/vgtrk.py
+++ b/youtube_dl/extractor/rutv.py
@@ -10,33 +10,19 @@ from ..utils import (
)
-class VGTRKIE(InfoExtractor):
- IE_DESC = 'ВГТРК'
- _VALID_URL = r'http://(?:.+?\.)?(?:vesti\.ru|russia2?\.tv|tvkultura\.ru|rutv\.ru)/(?P<id>.+)'
+class RUTVIE(InfoExtractor):
+ IE_DESC = 'RUTV.RU'
+ _VALID_URL = r'https?://player\.(?:rutv\.ru|vgtrk\.com)/(?:flash2v/container\.swf\?id=|iframe/(?P<type>swf|video|live)/id/)(?P<id>\d+)'
_TESTS = [
{
- 'url': 'http://www.vesti.ru/videos?vid=575582&cid=1',
+ 'url': 'http://player.rutv.ru/flash2v/container.swf?id=774471&sid=kultura&fbv=true&isPlay=true&ssl=false&i=560&acc_video_id=episode_id/972347/video_id/978186/brand_id/31724',
'info_dict': {
- 'id': '765035',
- 'ext': 'mp4',
- 'title': 'Вести.net: биткоины в России не являются законными',
- 'description': 'md5:d4bb3859dc1177b28a94c5014c35a36b',
- 'duration': 302,
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- },
- {
- 'url': 'http://www.vesti.ru/doc.html?id=1349233',
- 'info_dict': {
- 'id': '773865',
+ 'id': '774471',
'ext': 'mp4',
- 'title': 'Участники митинга штурмуют Донецкую областную администрацию',
- 'description': 'md5:1a160e98b3195379b4c849f2f4958009',
- 'duration': 210,
+ 'title': 'Монологи на все времена',
+ 'description': 'md5:18d8b5e6a41fb1faa53819471852d5d5',
+ 'duration': 2906,
},
'params': {
# m3u8 download
@@ -44,13 +30,13 @@ class VGTRKIE(InfoExtractor):
},
},
{
- 'url': 'http://www.vesti.ru/only_video.html?vid=576180',
+ 'url': 'https://player.vgtrk.com/flash2v/container.swf?id=774016&sid=russiatv&fbv=true&isPlay=true&ssl=false&i=560&acc_video_id=episode_id/972098/video_id/977760/brand_id/57638',
'info_dict': {
- 'id': '766048',
+ 'id': '774016',
'ext': 'mp4',
- 'title': 'США заморозило, Британию затопило',
- 'description': 'md5:f0ed0695ec05aed27c56a70a58dc4cc1',
- 'duration': 87,
+ 'title': 'Чужой в семье Сталина',
+ 'description': '',
+ 'duration': 2539,
},
'params': {
# m3u8 download
@@ -58,7 +44,7 @@ class VGTRKIE(InfoExtractor):
},
},
{
- 'url': 'http://hitech.vesti.ru/news/view/id/4000',
+ 'url': 'http://player.rutv.ru/iframe/swf/id/766888/sid/hitech/?acc_video_id=4000',
'info_dict': {
'id': '766888',
'ext': 'mp4',
@@ -72,22 +58,21 @@ class VGTRKIE(InfoExtractor):
},
},
{
- 'url': 'http://sochi2014.vesti.ru/video/index/video_id/766403',
+ 'url': 'http://player.rutv.ru/iframe/video/id/771852/start_zoom/true/showZoomBtn/false/sid/russiatv/?acc_video_id=episode_id/970443/video_id/975648/brand_id/5169',
'info_dict': {
- 'id': '766403',
+ 'id': '771852',
'ext': 'mp4',
- 'title': 'XXII зимние Олимпийские игры. Российские хоккеисты стартовали на Олимпиаде с победы',
- 'description': 'md5:55805dfd35763a890ff50fa9e35e31b3',
- 'duration': 271,
+ 'title': 'Прямой эфир. Жертвы загадочной болезни: смерть от старости в 17 лет',
+ 'description': 'md5:b81c8c55247a4bd996b43ce17395b2d8',
+ 'duration': 3096,
},
'params': {
# m3u8 download
'skip_download': True,
},
- 'skip': 'Blocked outside Russia',
},
{
- 'url': 'http://sochi2014.vesti.ru/live/play/live_id/301',
+ 'url': 'http://player.rutv.ru/iframe/live/id/51499/showZoomBtn/false/isPlay/true/sid/sochi2014',
'info_dict': {
'id': '51499',
'ext': 'flv',
@@ -98,124 +83,44 @@ class VGTRKIE(InfoExtractor):
# rtmp download
'skip_download': True,
},
- 'skip': 'Translation has finished'
- },
- {
- 'url': 'http://russia.tv/video/show/brand_id/5169/episode_id/970443/video_id/975648',
- 'info_dict': {
- 'id': '771852',
- 'ext': 'mp4',
- 'title': 'Прямой эфир. Жертвы загадочной болезни: смерть от старости в 17 лет',
- 'description': 'md5:b81c8c55247a4bd996b43ce17395b2d8',
- 'duration': 3096,
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- },
- {
- 'url': 'http://russia.tv/brand/show/brand_id/57638',
- 'info_dict': {
- 'id': '774016',
- 'ext': 'mp4',
- 'title': 'Чужой в семье Сталина',
- 'description': '',
- 'duration': 2539,
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- },
- {
- 'url': 'http://2.russia.tv/video/show/brand_id/48863/episode_id/972920/video_id/978667/viewtype/picture',
- 'info_dict': {
- 'id': '775081',
- 'ext': 'mp4',
- 'title': 'XXII зимние Олимпийские игры. Россияне заняли весь пьедестал в лыжных гонках',
- 'description': 'md5:15d3741dd8d04b203fbc031c6a47fb0f',
- 'duration': 101,
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- 'skip': 'Blocked outside Russia',
- },
- {
- 'url': 'http://tvkultura.ru/video/show/brand_id/31724/episode_id/972347/video_id/978186',
- 'info_dict': {
- 'id': '774471',
- 'ext': 'mp4',
- 'title': 'Монологи на все времена',
- 'description': 'md5:18d8b5e6a41fb1faa53819471852d5d5',
- 'duration': 2906,
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- },
- {
- 'url': 'http://rutv.ru/brand/show/id/6792/channel/75',
- 'info_dict': {
- 'id': '125521',
- 'ext': 'mp4',
- 'title': 'Грустная дама червей. Х/ф',
- 'description': '',
- 'duration': 4882,
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
+ 'skip': 'Translation has finished',
},
]
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- page = self._download_webpage(url, video_id, 'Downloading page')
-
+ @classmethod
+ def _extract_url(cls, webpage):
mobj = re.search(
- r'<meta property="og:video" content="http://www\.vesti\.ru/i/flvplayer_videoHost\.swf\?vid=(?P<id>\d+)',
- page)
+ r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.rutv\.ru/iframe/(?:swf|video|live)/id/.+?)\1', webpage)
if mobj:
- video_id = mobj.group('id')
- page = self._download_webpage('http://www.vesti.ru/only_video.html?vid=%s' % video_id, video_id,
- 'Downloading video page')
+ return mobj.group('url')
mobj = re.search(
- r'<meta property="og:video" content="http://player\.rutv\.ru/flash2v/container\.swf\?id=(?P<id>\d+)', page)
+ r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>http://player\.(?:rutv\.ru|vgtrk\.com)/flash2v/container\.swf\?id=.+?\2)',
+ webpage)
if mobj:
- video_type = 'video'
- video_id = mobj.group('id')
- else:
- mobj = re.search(
- r'<iframe.+?src="http://player\.rutv\.ru/iframe/(?P<type>[^/]+)/id/(?P<id>\d+)[^"]*".*?></iframe>',
- page)
+ return mobj.group('url')
- if not mobj:
- raise ExtractorError('No media found', expected=True)
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ video_type = mobj.group('type')
- video_type = mobj.group('type')
- video_id = mobj.group('id')
+ if not video_type or video_type == 'swf':
+ video_type = 'video'
json_data = self._download_json(
'http://player.rutv.ru/iframe/%splay/id/%s' % ('live-' if video_type == 'live' else '', video_id),
video_id, 'Downloading JSON')
if json_data['errors']:
- raise ExtractorError('vesti returned error: %s' % json_data['errors'], expected=True)
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, json_data['errors']), expected=True)
playlist = json_data['data']['playlist']
medialist = playlist['medialist']
media = medialist[0]
if media['errors']:
- raise ExtractorError('vesti returned error: %s' % media['errors'], expected=True)
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, media['errors']), expected=True)
view_count = playlist.get('count_views')
priority_transport = playlist['priority_transport']
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index cf10be2d0..ad1a46c33 100644
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -11,7 +11,9 @@ from ..utils import (
class TEDIE(SubtitlesInfoExtractor):
- _VALID_URL = r'''(?x)http://www\.ted\.com/
+ _VALID_URL = r'''(?x)
+ (?P<proto>https?://)
+ (?P<type>www|embed)(?P<urlmain>\.ted\.com/
(
(?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
|
@@ -19,6 +21,7 @@ class TEDIE(SubtitlesInfoExtractor):
)
(/lang/(.*?))? # The url may contain the language
/(?P<name>\w+) # Here goes the name and then ".html"
+ .*)$
'''
_TEST = {
'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
@@ -48,6 +51,9 @@ class TEDIE(SubtitlesInfoExtractor):
def _real_extract(self, url):
m = re.match(self._VALID_URL, url, re.VERBOSE)
+ if m.group('type') == 'embed':
+ desktop_url = m.group('proto') + 'www' + m.group('urlmain')
+ return self.url_result(desktop_url, 'TED')
name = m.group('name')
if m.group('type_talk'):
return self._talk_info(url, name)
@@ -93,11 +99,14 @@ class TEDIE(SubtitlesInfoExtractor):
self._list_available_subtitles(video_id, talk_info)
return
+ thumbnail = talk_info['thumb']
+ if not thumbnail.startswith('http'):
+ thumbnail = 'http://' + thumbnail
return {
'id': video_id,
'title': talk_info['title'],
'uploader': talk_info['speaker'],
- 'thumbnail': talk_info['thumb'],
+ 'thumbnail': thumbnail,
'description': self._og_search_description(webpage),
'subtitles': video_subtitles,
'formats': formats,
diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py
index 35df918b8..054f42725 100644
--- a/youtube_dl/extractor/udemy.py
+++ b/youtube_dl/extractor/udemy.py
@@ -16,7 +16,7 @@ class UdemyIE(InfoExtractor):
_LOGIN_URL = 'https://www.udemy.com/join/login-submit/'
_NETRC_MACHINE = 'udemy'
- _TEST = {
+ _TESTS = [{
'url': 'https://www.udemy.com/java-tutorial/#/lecture/172757',
'md5': '98eda5b657e752cf945d8445e261b5c5',
'info_dict': {
@@ -27,7 +27,7 @@ class UdemyIE(InfoExtractor):
'duration': 579.29,
},
'skip': 'Requires udemy account credentials',
- }
+ }]
def _handle_error(self, response):
if not isinstance(response, dict):
@@ -129,6 +129,7 @@ class UdemyCourseIE(UdemyIE):
_VALID_URL = r'https?://www\.udemy\.com/(?P<coursepath>[\da-z-]+)'
_SUCCESSFULLY_ENROLLED = '>You have enrolled in this course!<'
_ALREADY_ENROLLED = '>You are already taking this course.<'
+ _TESTS = []
@classmethod
def suitable(cls, url):
diff --git a/youtube_dl/extractor/vesti.py b/youtube_dl/extractor/vesti.py
new file mode 100644
index 000000000..27f9acb67
--- /dev/null
+++ b/youtube_dl/extractor/vesti.py
@@ -0,0 +1,121 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+from .rutv import RUTVIE
+
+
+class VestiIE(InfoExtractor):
+ IE_DESC = 'Вести.Ru'
+ _VALID_URL = r'http://(?:.+?\.)?vesti\.ru/(?P<id>.+)'
+
+ _TESTS = [
+ {
+ 'url': 'http://www.vesti.ru/videos?vid=575582&cid=1',
+ 'info_dict': {
+ 'id': '765035',
+ 'ext': 'mp4',
+ 'title': 'Вести.net: биткоины в России не являются законными',
+ 'description': 'md5:d4bb3859dc1177b28a94c5014c35a36b',
+ 'duration': 302,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.vesti.ru/doc.html?id=1349233',
+ 'info_dict': {
+ 'id': '773865',
+ 'ext': 'mp4',
+ 'title': 'Участники митинга штурмуют Донецкую областную администрацию',
+ 'description': 'md5:1a160e98b3195379b4c849f2f4958009',
+ 'duration': 210,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.vesti.ru/only_video.html?vid=576180',
+ 'info_dict': {
+ 'id': '766048',
+ 'ext': 'mp4',
+ 'title': 'США заморозило, Британию затопило',
+ 'description': 'md5:f0ed0695ec05aed27c56a70a58dc4cc1',
+ 'duration': 87,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://hitech.vesti.ru/news/view/id/4000',
+ 'info_dict': {
+ 'id': '766888',
+ 'ext': 'mp4',
+ 'title': 'Вести.net: интернет-гиганты начали перетягивание программных "одеял"',
+ 'description': 'md5:65ddd47f9830c4f42ed6475f8730c995',
+ 'duration': 279,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://sochi2014.vesti.ru/video/index/video_id/766403',
+ 'info_dict': {
+ 'id': '766403',
+ 'ext': 'mp4',
+ 'title': 'XXII зимние Олимпийские игры. Российские хоккеисты стартовали на Олимпиаде с победы',
+ 'description': 'md5:55805dfd35763a890ff50fa9e35e31b3',
+ 'duration': 271,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ 'skip': 'Blocked outside Russia',
+ },
+ {
+ 'url': 'http://sochi2014.vesti.ru/live/play/live_id/301',
+ 'info_dict': {
+ 'id': '51499',
+ 'ext': 'flv',
+ 'title': 'Сочи-2014. Биатлон. Индивидуальная гонка. Мужчины ',
+ 'description': 'md5:9e0ed5c9d2fa1efbfdfed90c9a6d179c',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ 'skip': 'Translation has finished'
+ },
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ page = self._download_webpage(url, video_id, 'Downloading page')
+
+ mobj = re.search(
+ r'<meta[^>]+?property="og:video"[^>]+?content="http://www\.vesti\.ru/i/flvplayer_videoHost\.swf\?vid=(?P<id>\d+)',
+ page)
+ if mobj:
+ video_id = mobj.group('id')
+ page = self._download_webpage('http://www.vesti.ru/only_video.html?vid=%s' % video_id, video_id,
+ 'Downloading video page')
+
+ rutv_url = RUTVIE._extract_url(page)
+ if rutv_url:
+ return self.url_result(rutv_url, 'RUTV')
+
+ raise ExtractorError('No video found', expected=True)
\ No newline at end of file
diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py
index c3360f166..ee47c30ba 100644
--- a/youtube_dl/extractor/vevo.py
+++ b/youtube_dl/extractor/vevo.py
@@ -21,6 +21,7 @@ class VevoIE(InfoExtractor):
https?://videoplayer\.vevo\.com/embed/embedded\?videoId=|
vevo:)
(?P<id>[^&?#]+)'''
+
_TESTS = [{
'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
"md5": "06bea460acb744eab74a9d7dcb4bfd61",
@@ -33,7 +34,8 @@ class VevoIE(InfoExtractor):
"duration": 230.12,
"width": 1920,
"height": 1080,
- 'timestamp': 1372057200,
+ # timestamp and upload_date are often incorrect; seem to change randomly
+ 'timestamp': int,
}
}, {
'note': 'v3 SMIL format',
@@ -47,7 +49,7 @@ class VevoIE(InfoExtractor):
'title': 'I Wish I Could Break Your Heart',
'duration': 226.101,
'age_limit': 0,
- 'timestamp': 1392796919,
+ 'timestamp': int,
}
}, {
'note': 'Age-limited video',
@@ -58,7 +60,6 @@ class VevoIE(InfoExtractor):
'age_limit': 18,
'title': 'Tunnel Vision (Explicit)',
'uploader': 'Justin Timberlake',
- # timestamp and upload_date are often incorrect; seem to change randomly
'upload_date': 're:2013070[34]',
'timestamp': int,
},
diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py
new file mode 100644
index 000000000..f8b946a88
--- /dev/null
+++ b/youtube_dl/extractor/videolecturesnet.py
@@ -0,0 +1,67 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ find_xpath_attr,
+ int_or_none,
+ parse_duration,
+ unified_strdate,
+)
+
+
+class VideoLecturesNetIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)/'
+ IE_NAME = 'videolectures.net'
+
+ _TEST = {
+ 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/',
+ 'info_dict': {
+ 'id': 'promogram_igor_mekjavic_eng',
+ 'ext': 'mp4',
+ 'title': 'Automatics, robotics and biocybernetics',
+ 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
+ 'upload_date': '20130627',
+ 'duration': 565,
+ 'thumbnail': 're:http://.*\.jpg',
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ smil_url = 'http://videolectures.net/%s/video/1/smil.xml' % video_id
+ smil = self._download_xml(smil_url, video_id)
+
+ title = find_xpath_attr(smil, './/meta', 'name', 'title').attrib['content']
+ description = find_xpath_attr(smil, './/meta', 'name', 'abstract').attrib['content']
+ upload_date = unified_strdate(
+ find_xpath_attr(smil, './/meta', 'name', 'date').attrib['content'])
+
+ switch = smil.find('.//switch')
+ duration = parse_duration(switch.attrib.get('dur'))
+ thumbnail_el = find_xpath_attr(switch, './image', 'type', 'thumbnail')
+ thumbnail = (
+ None if thumbnail_el is None else thumbnail_el.attrib.get('src'))
+
+ formats = [{
+ 'url': v.attrib['src'],
+ 'width': int_or_none(v.attrib.get('width')),
+ 'height': int_or_none(v.attrib.get('height')),
+ 'filesize': int_or_none(v.attrib.get('size')),
+ 'tbr': int_or_none(v.attrib.get('systemBitrate')) / 1000.0,
+ 'ext': v.attrib.get('ext'),
+ } for v in switch.findall('./video')
+ if v.attrib.get('proto') == 'http']
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'upload_date': upload_date,
+ 'duration': duration,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py
index 2206a06d5..15f315298 100644
--- a/youtube_dl/extractor/viki.py
+++ b/youtube_dl/extractor/viki.py
@@ -1,29 +1,33 @@
+from __future__ import unicode_literals
+
import re
from ..utils import (
ExtractorError,
unescapeHTML,
unified_strdate,
+ US_RATINGS,
)
from .subtitles import SubtitlesInfoExtractor
class VikiIE(SubtitlesInfoExtractor):
- IE_NAME = u'viki'
+ IE_NAME = 'viki'
_VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)'
_TEST = {
- u'url': u'http://www.viki.com/videos/1023585v-heirs-episode-14',
- u'file': u'1023585v.mp4',
- u'md5': u'a21454021c2646f5433514177e2caa5f',
- u'info_dict': {
- u'title': u'Heirs Episode 14',
- u'uploader': u'SBS',
- u'description': u'md5:c4b17b9626dd4b143dcc4d855ba3474e',
- u'upload_date': u'20131121',
- u'age_limit': 13,
+ 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',
+ 'md5': 'a21454021c2646f5433514177e2caa5f',
+ 'info_dict': {
+ 'id': '1023585v',
+ 'ext': 'mp4',
+ 'title': 'Heirs Episode 14',
+ 'uploader': 'SBS',
+ 'description': 'md5:c4b17b9626dd4b143dcc4d855ba3474e',
+ 'upload_date': '20131121',
+ 'age_limit': 13,
},
- u'skip': u'Blocked in the US',
+ 'skip': 'Blocked in the US',
}
def _real_extract(self, url):
@@ -44,28 +48,21 @@ class VikiIE(SubtitlesInfoExtractor):
rating_str = self._html_search_regex(
r'<strong>Rating: </strong>\s*([^<]*)<', webpage,
- u'rating information', default='').strip()
- RATINGS = {
- 'G': 0,
- 'PG': 10,
- 'PG-13': 13,
- 'R': 16,
- 'NC': 18,
- }
- age_limit = RATINGS.get(rating_str)
+ 'rating information', default='').strip()
+ age_limit = US_RATINGS.get(rating_str)
info_url = 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id
info_webpage = self._download_webpage(
- info_url, video_id, note=u'Downloading info page')
+ info_url, video_id, note='Downloading info page')
if re.match(r'\s*<div\s+class="video-error', info_webpage):
raise ExtractorError(
- u'Video %s is blocked from your location.' % video_id,
+ 'Video %s is blocked from your location.' % video_id,
expected=True)
video_url = self._html_search_regex(
- r'<source[^>]+src="([^"]+)"', info_webpage, u'video URL')
+ r'<source[^>]+src="([^"]+)"', info_webpage, 'video URL')
upload_date_str = self._html_search_regex(
- r'"created_at":"([^"]+)"', info_webpage, u'upload date')
+ r'"created_at":"([^"]+)"', info_webpage, 'upload date')
upload_date = (
unified_strdate(upload_date_str)
if upload_date_str is not None
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py
index d3eefd086..cdc0059f6 100644
--- a/youtube_dl/extractor/xtube.py
+++ b/youtube_dl/extractor/xtube.py
@@ -1,11 +1,10 @@
from __future__ import unicode_literals
-import os
import re
+import json
from .common import InfoExtractor
from ..utils import (
- compat_urllib_parse_urlparse,
compat_urllib_request,
parse_duration,
str_to_int,
@@ -42,7 +41,6 @@ class XTubeIE(InfoExtractor):
r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, 'uploader', fatal=False)
video_description = self._html_search_regex(
r'<p class="fieldsDesc">([^<]+)', webpage, 'description', fatal=False)
- video_url = self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, 'video_url').replace('\\/', '/')
duration = parse_duration(self._html_search_regex(
r'<span class="bold">Runtime:</span> ([^<]+)</p>', webpage, 'duration', fatal=False))
view_count = self._html_search_regex(
@@ -54,12 +52,18 @@ class XTubeIE(InfoExtractor):
if comment_count:
comment_count = str_to_int(comment_count)
- path = compat_urllib_parse_urlparse(video_url).path
- extension = os.path.splitext(path)[1][1:]
- format = path.split('/')[5].split('_')[:2]
- format[0] += 'p'
- format[1] += 'k'
- format = "-".join(format)
+ player_quality_option = json.loads(self._html_search_regex(
+ r'playerQualityOption = ({.+?});', webpage, 'player quality option'))
+
+ QUALITIES = ['3gp', 'mp4_normal', 'mp4_high', 'flv', 'mp4_ultra', 'mp4_720', 'mp4_1080']
+ formats = [
+ {
+ 'url': url,
+ 'format_id': format_id,
+ 'preference': QUALITIES.index(format_id) if format_id in QUALITIES else -1,
+ } for format_id, url in player_quality_option.items()
+ ]
+ self._sort_formats(formats)
return {
'id': video_id,
@@ -69,9 +73,6 @@ class XTubeIE(InfoExtractor):
'duration': duration,
'view_count': view_count,
'comment_count': comment_count,
- 'url': video_url,
- 'ext': extension,
- 'format': format,
- 'format_id': format,
+ 'formats': formats,
'age_limit': 18,
- }
+ }
\ No newline at end of file
diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py
index 77ad423c4..d456c4da5 100644
--- a/youtube_dl/extractor/youporn.py
+++ b/youtube_dl/extractor/youporn.py
@@ -1,3 +1,6 @@
+from __future__ import unicode_literals
+
+
import json
import re
import sys
@@ -17,24 +20,25 @@ from ..aes import (
class YouPornIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+))'
+ _VALID_URL = r'^(?P<proto>https?://)(?:www\.)?(?P<url>youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+))'
_TEST = {
- u'url': u'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
- u'file': u'505835.mp4',
- u'md5': u'71ec5fcfddacf80f495efa8b6a8d9a89',
- u'info_dict': {
- u"upload_date": u"20101221",
- u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?",
- u"uploader": u"Ask Dan And Jennifer",
- u"title": u"Sex Ed: Is It Safe To Masturbate Daily?",
- u"age_limit": 18,
+ 'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
+ 'md5': '71ec5fcfddacf80f495efa8b6a8d9a89',
+ 'info_dict': {
+ 'id': '505835',
+ 'ext': 'mp4',
+ 'upload_date': '20101221',
+ 'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?',
+ 'uploader': 'Ask Dan And Jennifer',
+ 'title': 'Sex Ed: Is It Safe To Masturbate Daily?',
+ 'age_limit': 18,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
- url = 'http://www.' + mobj.group('url')
+ url = mobj.group('proto') + 'www.' + mobj.group('url')
req = compat_urllib_request.Request(url)
req.add_header('Cookie', 'age_verified=1')
@@ -42,7 +46,7 @@ class YouPornIE(InfoExtractor):
age_limit = self._rta_search(webpage)
# Get JSON parameters
- json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
+ json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, 'JSON parameters')
try:
params = json.loads(json_params)
except:
@@ -61,7 +65,7 @@ class YouPornIE(InfoExtractor):
# Get all of the links from the page
DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
- webpage, u'download list').strip()
+ webpage, 'download list').strip()
LINK_RE = r'<a href="([^"]+)">'
links = re.findall(LINK_RE, download_list_html)
@@ -86,7 +90,7 @@ class YouPornIE(InfoExtractor):
resolution = format_parts[0]
height = int(resolution[:-len('p')])
bitrate = int(format_parts[1][:-len('k')])
- format = u'-'.join(format_parts) + u'-' + dn
+ format = '-'.join(format_parts) + '-' + dn
formats.append({
'url': video_url,
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index f7cb497a8..723e7b9e6 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -1130,14 +1130,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
def _real_extract(self, url):
+ proto = (
+ u'http' if self._downloader.params.get('prefer_insecure', False)
+ else u'https')
+
# Extract original video URL from URL with redirection, like age verification, using next_url parameter
mobj = re.search(self._NEXT_URL_RE, url)
if mobj:
- url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
+ url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
video_id = self.extract_id(url)
# Get video webpage
- url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
+ url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
video_webpage = self._download_webpage(url, video_id)
# Attempt to extract SWF player URL
@@ -1162,7 +1166,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'asv': 3,
'sts':'1588',
})
- video_info_url = 'https://www.youtube.com/get_video_info?' + data
+ video_info_url = proto + '://www.youtube.com/get_video_info?' + data
video_info_webpage = self._download_webpage(video_info_url, video_id,
note=False,
errnote='unable to download video info webpage')
@@ -1170,7 +1174,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
else:
age_gate = False
for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
- video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
+ video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
% (video_id, el_type))
video_info_webpage = self._download_webpage(video_info_url, video_id,
note=False,
@@ -1445,7 +1449,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'duration': video_duration,
'age_limit': 18 if age_gate else 0,
'annotations': video_annotations,
- 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
+ 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
'view_count': view_count,
'like_count': like_count,
'dislike_count': dislike_count,