aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r--youtube_dl/extractor/__init__.py74
-rw-r--r--youtube_dl/extractor/academicearth.py31
-rw-r--r--youtube_dl/extractor/addanime.py2
-rw-r--r--youtube_dl/extractor/anitube.py53
-rw-r--r--youtube_dl/extractor/aparat.py56
-rw-r--r--youtube_dl/extractor/appletrailers.py34
-rw-r--r--youtube_dl/extractor/archiveorg.py13
-rw-r--r--youtube_dl/extractor/arte.py69
-rw-r--r--youtube_dl/extractor/auengine.py33
-rw-r--r--youtube_dl/extractor/bambuser.py86
-rw-r--r--youtube_dl/extractor/bandcamp.py70
-rw-r--r--youtube_dl/extractor/blinkx.py90
-rw-r--r--youtube_dl/extractor/bliptv.py39
-rw-r--r--youtube_dl/extractor/bloomberg.py2
-rw-r--r--youtube_dl/extractor/brightcove.py91
-rw-r--r--youtube_dl/extractor/canalc2.py6
-rw-r--r--youtube_dl/extractor/canalplus.py7
-rw-r--r--youtube_dl/extractor/cbs.py30
-rw-r--r--youtube_dl/extractor/channel9.py267
-rw-r--r--youtube_dl/extractor/cinemassacre.py4
-rw-r--r--youtube_dl/extractor/clipfish.py58
-rw-r--r--youtube_dl/extractor/clipsyndicate.py50
-rw-r--r--youtube_dl/extractor/cnn.py6
-rw-r--r--youtube_dl/extractor/collegehumor.py11
-rw-r--r--youtube_dl/extractor/comedycentral.py43
-rw-r--r--youtube_dl/extractor/common.py166
-rw-r--r--youtube_dl/extractor/cspan.py2
-rw-r--r--youtube_dl/extractor/d8.py22
-rw-r--r--youtube_dl/extractor/dailymotion.py25
-rw-r--r--youtube_dl/extractor/daum.py20
-rw-r--r--youtube_dl/extractor/depositfiles.py2
-rw-r--r--youtube_dl/extractor/dreisat.py13
-rw-r--r--youtube_dl/extractor/ebaumsworld.py4
-rw-r--r--youtube_dl/extractor/eighttracks.py3
-rw-r--r--youtube_dl/extractor/eitb.py37
-rw-r--r--youtube_dl/extractor/escapist.py66
-rw-r--r--youtube_dl/extractor/exfm.py4
-rw-r--r--youtube_dl/extractor/extremetube.py50
-rw-r--r--youtube_dl/extractor/facebook.py5
-rw-r--r--youtube_dl/extractor/faz.py11
-rw-r--r--youtube_dl/extractor/fktv.py5
-rw-r--r--youtube_dl/extractor/francetv.py110
-rw-r--r--youtube_dl/extractor/gamekings.py38
-rw-r--r--youtube_dl/extractor/gamespot.py7
-rw-r--r--youtube_dl/extractor/gametrailers.py17
-rw-r--r--youtube_dl/extractor/generic.py202
-rw-r--r--youtube_dl/extractor/hotnewhiphop.py2
-rw-r--r--youtube_dl/extractor/howcast.py2
-rw-r--r--youtube_dl/extractor/hypem.py4
-rw-r--r--youtube_dl/extractor/ign.py4
-rw-r--r--youtube_dl/extractor/imdb.py57
-rw-r--r--youtube_dl/extractor/instagram.py2
-rw-r--r--youtube_dl/extractor/internetvideoarchive.py7
-rw-r--r--youtube_dl/extractor/ivi.py154
-rw-r--r--youtube_dl/extractor/jeuxvideo.py10
-rw-r--r--youtube_dl/extractor/jukebox.py2
-rw-r--r--youtube_dl/extractor/justintv.py4
-rw-r--r--youtube_dl/extractor/kankan.py7
-rw-r--r--youtube_dl/extractor/keezmovies.py8
-rw-r--r--youtube_dl/extractor/liveleak.py2
-rw-r--r--youtube_dl/extractor/livestream.py46
-rw-r--r--youtube_dl/extractor/mdr.py78
-rw-r--r--youtube_dl/extractor/metacafe.py48
-rw-r--r--youtube_dl/extractor/metacritic.py14
-rw-r--r--youtube_dl/extractor/mixcloud.py25
-rw-r--r--youtube_dl/extractor/mofosex.py49
-rw-r--r--youtube_dl/extractor/mtv.py115
-rw-r--r--youtube_dl/extractor/muzu.py2
-rw-r--r--youtube_dl/extractor/myspass.py6
-rw-r--r--youtube_dl/extractor/naver.py14
-rw-r--r--youtube_dl/extractor/nbc.py5
-rw-r--r--youtube_dl/extractor/ndtv.py64
-rw-r--r--youtube_dl/extractor/nhl.py6
-rw-r--r--youtube_dl/extractor/niconico.py127
-rw-r--r--youtube_dl/extractor/ninegag.py43
-rw-r--r--youtube_dl/extractor/ooyala.py5
-rw-r--r--youtube_dl/extractor/orf.py2
-rw-r--r--youtube_dl/extractor/pbs.py2
-rw-r--r--youtube_dl/extractor/podomatic.py49
-rw-r--r--youtube_dl/extractor/pornhd.py38
-rw-r--r--youtube_dl/extractor/pornhub.py9
-rw-r--r--youtube_dl/extractor/pyvideo.py51
-rw-r--r--youtube_dl/extractor/radiofrance.py55
-rw-r--r--youtube_dl/extractor/redtube.py6
-rw-r--r--youtube_dl/extractor/rtlnow.py19
-rw-r--r--youtube_dl/extractor/rutube.py2
-rw-r--r--youtube_dl/extractor/sina.py4
-rw-r--r--youtube_dl/extractor/slashdot.py3
-rw-r--r--youtube_dl/extractor/smotri.py356
-rw-r--r--youtube_dl/extractor/soundcloud.py237
-rw-r--r--youtube_dl/extractor/southparkstudios.py36
-rw-r--r--youtube_dl/extractor/space.py35
-rw-r--r--youtube_dl/extractor/spankwire.py18
-rw-r--r--youtube_dl/extractor/spiegel.py52
-rw-r--r--youtube_dl/extractor/stanfordoc.py22
-rw-r--r--youtube_dl/extractor/streamcloud.py66
-rw-r--r--youtube_dl/extractor/subtitles.py12
-rw-r--r--youtube_dl/extractor/sztvhu.py3
-rw-r--r--youtube_dl/extractor/teamcoco.py39
-rw-r--r--youtube_dl/extractor/ted.py71
-rw-r--r--youtube_dl/extractor/tf1.py2
-rw-r--r--youtube_dl/extractor/theplatform.py80
-rw-r--r--youtube_dl/extractor/toutv.py71
-rw-r--r--youtube_dl/extractor/trilulilu.py11
-rw-r--r--youtube_dl/extractor/tube8.py8
-rw-r--r--youtube_dl/extractor/tvp.py42
-rw-r--r--youtube_dl/extractor/unistra.py2
-rw-r--r--youtube_dl/extractor/veehd.py2
-rw-r--r--youtube_dl/extractor/vevo.py16
-rw-r--r--youtube_dl/extractor/vice.py2
-rw-r--r--youtube_dl/extractor/viddler.py13
-rw-r--r--youtube_dl/extractor/videofyme.py6
-rw-r--r--youtube_dl/extractor/videopremium.py17
-rw-r--r--youtube_dl/extractor/viki.py101
-rw-r--r--youtube_dl/extractor/vimeo.py112
-rw-r--r--youtube_dl/extractor/vine.py2
-rw-r--r--youtube_dl/extractor/wat.py2
-rw-r--r--youtube_dl/extractor/weibo.py1
-rw-r--r--youtube_dl/extractor/wimp.py23
-rw-r--r--youtube_dl/extractor/wistia.py55
-rw-r--r--youtube_dl/extractor/xhamster.py2
-rw-r--r--youtube_dl/extractor/xnxx.py2
-rw-r--r--youtube_dl/extractor/xtube.py54
-rw-r--r--youtube_dl/extractor/yahoo.py54
-rw-r--r--youtube_dl/extractor/youjizz.py2
-rw-r--r--youtube_dl/extractor/youku.py6
-rw-r--r--youtube_dl/extractor/youporn.py8
-rw-r--r--youtube_dl/extractor/youtube.py396
-rw-r--r--youtube_dl/extractor/zdf.py142
129 files changed, 4326 insertions, 1041 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index a61e17ea1..a39a1e2f4 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -1,5 +1,8 @@
-from .appletrailers import AppleTrailersIE
+from .academicearth import AcademicEarthCourseIE
from .addanime import AddAnimeIE
+from .anitube import AnitubeIE
+from .aparat import AparatIE
+from .appletrailers import AppleTrailersIE
from .archiveorg import ArchiveOrgIE
from .ard import ARDIE
from .arte import (
@@ -7,9 +10,12 @@ from .arte import (
ArteTVPlus7IE,
ArteTVCreativeIE,
ArteTVFutureIE,
+ ArteTVDDCIE,
)
from .auengine import AUEngineIE
-from .bandcamp import BandcampIE
+from .bambuser import BambuserIE, BambuserChannelIE
+from .bandcamp import BandcampIE, BandcampAlbumIE
+from .blinkx import BlinkxIE
from .bliptv import BlipTVIE, BlipTVUserIE
from .bloomberg import BloombergIE
from .breakcom import BreakIE
@@ -17,14 +23,19 @@ from .brightcove import BrightcoveIE
from .c56 import C56IE
from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE
+from .cbs import CBSIE
+from .channel9 import Channel9IE
from .cinemassacre import CinemassacreIE
+from .clipfish import ClipfishIE
+from .clipsyndicate import ClipsyndicateIE
from .cnn import CNNIE
from .collegehumor import CollegeHumorIE
-from .comedycentral import ComedyCentralIE
+from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
from .condenast import CondeNastIE
from .criterion import CriterionIE
from .crunchyroll import CrunchyrollIE
from .cspan import CSpanIE
+from .d8 import D8IE
from .dailymotion import (
DailymotionIE,
DailymotionPlaylistIE,
@@ -38,8 +49,10 @@ from .defense import DefenseGouvFrIE
from .ebaumsworld import EbaumsWorldIE
from .ehow import EHowIE
from .eighttracks import EightTracksIE
+from .eitb import EitbIE
from .escapist import EscapistIE
from .exfm import ExfmIE
+from .extremetube import ExtremeTubeIE
from .facebook import FacebookIE
from .faz import FazIE
from .fktv import (
@@ -50,11 +63,12 @@ from .flickr import FlickrIE
from .francetv import (
PluzzIE,
FranceTvInfoIE,
- France2IE,
+ FranceTVIE,
GenerationQuoiIE
)
from .freesound import FreesoundIE
from .funnyordie import FunnyOrDieIE
+from .gamekings import GamekingsIE
from .gamespot import GameSpotIE
from .gametrailers import GametrailersIE
from .generic import GenericIE
@@ -65,10 +79,15 @@ from .hotnewhiphop import HotNewHipHopIE
from .howcast import HowcastIE
from .hypem import HypemIE
from .ign import IGNIE, OneUPIE
+from .imdb import ImdbIE
from .ina import InaIE
from .infoq import InfoQIE
from .instagram import InstagramIE
from .internetvideoarchive import InternetVideoArchiveIE
+from .ivi import (
+ IviIE,
+ IviCompilationIE
+)
from .jeuxvideo import JeuxVideoIE
from .jukebox import JukeboxIE
from .justintv import JustinTVIE
@@ -77,11 +96,13 @@ from .keezmovies import KeezMoviesIE
from .kickstarter import KickStarterIE
from .keek import KeekIE
from .liveleak import LiveLeakIE
-from .livestream import LivestreamIE
+from .livestream import LivestreamIE, LivestreamOriginalIE
+from .mdr import MDRIE
from .metacafe import MetacafeIE
from .metacritic import MetacriticIE
from .mit import TechTVMITIE, MITIE
from .mixcloud import MixcloudIE
+from .mofosex import MofosexIE
from .mtv import MTVIE
from .muzu import MuzuTVIE
from .myspace import MySpaceIE
@@ -90,15 +111,22 @@ from .myvideo import MyVideoIE
from .naver import NaverIE
from .nba import NBAIE
from .nbc import NBCNewsIE
+from .ndtv import NDTVIE
from .newgrounds import NewgroundsIE
from .nhl import NHLIE, NHLVideocenterIE
+from .niconico import NiconicoIE
+from .ninegag import NineGagIE
from .nowvideo import NowVideoIE
from .ooyala import OoyalaIE
from .orf import ORFIE
from .pbs import PBSIE
from .photobucket import PhotobucketIE
+from .podomatic import PodomaticIE
+from .pornhd import PornHdIE
from .pornhub import PornHubIE
from .pornotube import PornotubeIE
+from .pyvideo import PyvideoIE
+from .radiofrance import RadioFranceIE
from .rbmaradio import RBMARadioIE
from .redtube import RedTubeIE
from .ringtv import RingTVIE
@@ -110,26 +138,40 @@ from .rutube import RutubeIE
from .sina import SinaIE
from .slashdot import SlashdotIE
from .slideshare import SlideshareIE
+from .smotri import (
+ SmotriIE,
+ SmotriCommunityIE,
+ SmotriUserIE,
+ SmotriBroadcastIE,
+)
from .sohu import SohuIE
from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE
-from .southparkstudios import SouthParkStudiosIE
+from .southparkstudios import (
+ SouthParkStudiosIE,
+ SouthparkDeIE,
+)
+from .space import SpaceIE
from .spankwire import SpankwireIE
from .spiegel import SpiegelIE
from .stanfordoc import StanfordOpenClassroomIE
from .statigram import StatigramIE
from .steam import SteamIE
+from .streamcloud import StreamcloudIE
from .sztvhu import SztvHuIE
from .teamcoco import TeamcocoIE
from .techtalks import TechTalksIE
from .ted import TEDIE
from .tf1 import TF1IE
+from .theplatform import ThePlatformIE
from .thisav import ThisAVIE
+from .toutv import TouTvIE
from .traileraddict import TrailerAddictIE
from .trilulilu import TriluliluIE
from .tube8 import Tube8IE
from .tudou import TudouIE
from .tumblr import TumblrIE
from .tutv import TutvIE
+from .tvp import TvpIE
from .unistra import UnistraIE
from .ustream import UstreamIE, UstreamChannelIE
from .vbox7 import Vbox7IE
@@ -141,18 +183,31 @@ from .viddler import ViddlerIE
from .videodetective import VideoDetectiveIE
from .videofyme import VideofyMeIE
from .videopremium import VideoPremiumIE
-from .vimeo import VimeoIE, VimeoChannelIE
+from .vimeo import (
+ VimeoIE,
+ VimeoChannelIE,
+ VimeoUserIE,
+ VimeoAlbumIE,
+ VimeoGroupsIE,
+)
from .vine import VineIE
+from .viki import VikiIE
from .vk import VKIE
from .wat import WatIE
from .websurg import WeBSurgIE
from .weibo import WeiboIE
from .wimp import WimpIE
+from .wistia import WistiaIE
from .worldstarhiphop import WorldStarHipHopIE
from .xhamster import XHamsterIE
from .xnxx import XNXXIE
from .xvideos import XVideosIE
-from .yahoo import YahooIE, YahooSearchIE
+from .xtube import XTubeIE
+from .yahoo import (
+ YahooIE,
+ YahooNewsIE,
+ YahooSearchIE,
+)
from .youjizz import YouJizzIE
from .youku import YoukuIE
from .youporn import YouPornIE
@@ -160,6 +215,7 @@ from .youtube import (
YoutubeIE,
YoutubePlaylistIE,
YoutubeSearchIE,
+ YoutubeSearchDateIE,
YoutubeUserIE,
YoutubeChannelIE,
YoutubeShowIE,
@@ -168,6 +224,8 @@ from .youtube import (
YoutubeTruncatedURLIE,
YoutubeWatchLaterIE,
YoutubeFavouritesIE,
+ YoutubeHistoryIE,
+ YoutubeTopListIE,
)
from .zdf import ZDFIE
diff --git a/youtube_dl/extractor/academicearth.py b/youtube_dl/extractor/academicearth.py
new file mode 100644
index 000000000..ac05f8246
--- /dev/null
+++ b/youtube_dl/extractor/academicearth.py
@@ -0,0 +1,31 @@
+import re
+
+from .common import InfoExtractor
+
+
+class AcademicEarthCourseIE(InfoExtractor):
+ _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/(?:courses|playlists)/(?P<id>[^?#/]+)'
+ IE_NAME = u'AcademicEarth:Course'
+
+ def _real_extract(self, url):
+ m = re.match(self._VALID_URL, url)
+ playlist_id = m.group('id')
+
+ webpage = self._download_webpage(url, playlist_id)
+ title = self._html_search_regex(
+ r'<h1 class="playlist-name">(.*?)</h1>', webpage, u'title')
+ description = self._html_search_regex(
+ r'<p class="excerpt">(.*?)</p>',
+ webpage, u'description', fatal=False)
+ urls = re.findall(
+ r'<h3 class="lecture-title"><a target="_blank" href="([^"]+)">',
+ webpage)
+ entries = [self.url_result(u) for u in urls]
+
+ return {
+ '_type': 'playlist',
+ 'id': playlist_id,
+ 'title': title,
+ 'description': description,
+ 'entries': entries,
+ }
diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py
index b99d4b966..a3a1b999d 100644
--- a/youtube_dl/extractor/addanime.py
+++ b/youtube_dl/extractor/addanime.py
@@ -13,7 +13,7 @@ from ..utils import (
class AddAnimeIE(InfoExtractor):
- _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)'
+ _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video\.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)'
IE_NAME = u'AddAnime'
_TEST = {
u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9',
diff --git a/youtube_dl/extractor/anitube.py b/youtube_dl/extractor/anitube.py
new file mode 100644
index 000000000..2b019daa9
--- /dev/null
+++ b/youtube_dl/extractor/anitube.py
@@ -0,0 +1,53 @@
+import re
+
+from .common import InfoExtractor
+
+
+class AnitubeIE(InfoExtractor):
+ IE_NAME = u'anitube.se'
+ _VALID_URL = r'https?://(?:www\.)?anitube\.se/video/(?P<id>\d+)'
+
+ _TEST = {
+ u'url': u'http://www.anitube.se/video/36621',
+ u'md5': u'59d0eeae28ea0bc8c05e7af429998d43',
+ u'file': u'36621.mp4',
+ u'info_dict': {
+ u'id': u'36621',
+ u'ext': u'mp4',
+ u'title': u'Recorder to Randoseru 01',
+ },
+ u'skip': u'Blocked in the US',
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ key = self._html_search_regex(r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)',
+ webpage, u'key')
+
+ config_xml = self._download_xml('http://www.anitube.se/nuevo/econfig.php?key=%s' % key,
+ key)
+
+ video_title = config_xml.find('title').text
+
+ formats = []
+ video_url = config_xml.find('file')
+ if video_url is not None:
+ formats.append({
+ 'format_id': 'sd',
+ 'url': video_url.text,
+ })
+ video_url = config_xml.find('filehd')
+ if video_url is not None:
+ formats.append({
+ 'format_id': 'hd',
+ 'url': video_url.text,
+ })
+
+ return {
+ 'id': video_id,
+ 'title': video_title,
+ 'formats': formats
+ }
diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py
new file mode 100644
index 000000000..7e93bc4df
--- /dev/null
+++ b/youtube_dl/extractor/aparat.py
@@ -0,0 +1,56 @@
+#coding: utf-8
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ HEADRequest,
+)
+
+
+class AparatIE(InfoExtractor):
+ _VALID_URL = r'^https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)'
+
+ _TEST = {
+ u'url': u'http://www.aparat.com/v/wP8On',
+ u'file': u'wP8On.mp4',
+ u'md5': u'6714e0af7e0d875c5a39c4dc4ab46ad1',
+ u'info_dict': {
+ u"title": u"تیم گلکسی 11 - زومیت",
+ },
+ #u'skip': u'Extremely unreliable',
+ }
+
+ def _real_extract(self, url):
+ m = re.match(self._VALID_URL, url)
+ video_id = m.group('id')
+
+ # Note: There is an easier-to-parse configuration at
+ # http://www.aparat.com/video/video/config/videohash/%video_id
+ # but the URL in there does not work
+ embed_url = (u'http://www.aparat.com/video/video/embed/videohash/' +
+ video_id + u'/vt/frame')
+ webpage = self._download_webpage(embed_url, video_id)
+
+ video_urls = re.findall(r'fileList\[[0-9]+\]\s*=\s*"([^"]+)"', webpage)
+ for i, video_url in enumerate(video_urls):
+ req = HEADRequest(video_url)
+ res = self._request_webpage(
+ req, video_id, note=u'Testing video URL %d' % i, errnote=False)
+ if res:
+ break
+ else:
+ raise ExtractorError(u'No working video URLs found')
+
+ title = self._search_regex(r'\s+title:\s*"([^"]+)"', webpage, u'title')
+ thumbnail = self._search_regex(
+ r'\s+image:\s*"([^"]+)"', webpage, u'thumbnail', fatal=False)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': video_url,
+ 'ext': 'mp4',
+ 'thumbnail': thumbnail,
+ }
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
index 6d6237f8a..ef5644aa5 100644
--- a/youtube_dl/extractor/appletrailers.py
+++ b/youtube_dl/extractor/appletrailers.py
@@ -1,5 +1,4 @@
import re
-import xml.etree.ElementTree
import json
from .common import InfoExtractor
@@ -10,7 +9,7 @@ from ..utils import (
class AppleTrailersIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?trailers.apple.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)'
+ _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)'
_TEST = {
u"url": u"http://trailers.apple.com/trailers/wb/manofsteel/",
u"playlist": [
@@ -65,18 +64,18 @@ class AppleTrailersIE(InfoExtractor):
uploader_id = mobj.group('company')
playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc')
- playlist_snippet = self._download_webpage(playlist_url, movie)
- playlist_cleaned = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', playlist_snippet)
- playlist_cleaned = re.sub(r'<img ([^<]*?)>', r'<img \1/>', playlist_cleaned)
- # The ' in the onClick attributes are not escaped, it couldn't be parsed
- # with xml.etree.ElementTree.fromstring
- # like: http://trailers.apple.com/trailers/wb/gravity/
- def _clean_json(m):
- return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
- playlist_cleaned = re.sub(self._JSON_RE, _clean_json, playlist_cleaned)
- playlist_html = u'<html>' + playlist_cleaned + u'</html>'
+ def fix_html(s):
+ s = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', s)
+ s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s)
+ # The ' in the onClick attributes are not escaped, it couldn't be parsed
+ # like: http://trailers.apple.com/trailers/wb/gravity/
+ def _clean_json(m):
+ return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
+ s = re.sub(self._JSON_RE, _clean_json, s)
+ s = u'<html>' + s + u'</html>'
+ return s
+ doc = self._download_xml(playlist_url, movie, transform_source=fix_html)
- doc = xml.etree.ElementTree.fromstring(playlist_html)
playlist = []
for li in doc.findall('./div/ul/li'):
on_click = li.find('.//a').attrib['onClick']
@@ -113,7 +112,7 @@ class AppleTrailersIE(InfoExtractor):
})
formats = sorted(formats, key=lambda f: (f['height'], f['width']))
- info = {
+ playlist.append({
'_type': 'video',
'id': video_id,
'title': title,
@@ -124,12 +123,7 @@ class AppleTrailersIE(InfoExtractor):
'upload_date': upload_date,
'uploader_id': uploader_id,
'user_agent': 'QuickTime compatible (youtube-dl)',
- }
- # TODO: Remove when #980 has been merged
- info['url'] = formats[-1]['url']
- info['ext'] = formats[-1]['ext']
-
- playlist.append(info)
+ })
return {
'_type': 'playlist',
diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py
index 61ce4469a..8bb546410 100644
--- a/youtube_dl/extractor/archiveorg.py
+++ b/youtube_dl/extractor/archiveorg.py
@@ -11,7 +11,7 @@ from ..utils import (
class ArchiveOrgIE(InfoExtractor):
IE_NAME = 'archive.org'
IE_DESC = 'archive.org videos'
- _VALID_URL = r'(?:https?://)?(?:www\.)?archive.org/details/(?P<id>[^?/]+)(?:[?].*)?$'
+ _VALID_URL = r'(?:https?://)?(?:www\.)?archive\.org/details/(?P<id>[^?/]+)(?:[?].*)?$'
_TEST = {
u"url": u"http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect",
u'file': u'XD300-23_68HighlightsAResearchCntAugHumanIntellect.ogv',
@@ -49,7 +49,7 @@ class ArchiveOrgIE(InfoExtractor):
for f in formats:
f['ext'] = determine_ext(f['url'])
- info = {
+ return {
'_type': 'video',
'id': video_id,
'title': title,
@@ -57,12 +57,5 @@ class ArchiveOrgIE(InfoExtractor):
'description': description,
'uploader': uploader,
'upload_date': upload_date,
+ 'thumbnail': data.get('misc', {}).get('image'),
}
- thumbnail = data.get('misc', {}).get('image')
- if thumbnail:
- info['thumbnail'] = thumbnail
-
- # TODO: Remove when #980 has been merged
- info.update(formats[-1])
-
- return info
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index e10c74c11..9254fbfe0 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -1,7 +1,6 @@
# encoding: utf-8
import re
import json
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@@ -10,6 +9,8 @@ from ..utils import (
unified_strdate,
determine_ext,
get_element_by_id,
+ compat_str,
+ get_element_by_attribute,
)
# There are different sources of video in arte.tv, the extraction process
@@ -17,8 +18,8 @@ from ..utils import (
# add tests.
class ArteTvIE(InfoExtractor):
- _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?P<lang>fr|de)/.*-(?P<id>.*?).html'
- _LIVEWEB_URL = r'(?:http://)?liveweb.arte.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)'
+ _VIDEOS_URL = r'(?:http://)?videos\.arte\.tv/(?P<lang>fr|de)/.*-(?P<id>.*?)\.html'
+ _LIVEWEB_URL = r'(?:http://)?liveweb\.arte\.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)'
_LIVE_URL = r'index-[0-9]+\.html$'
IE_NAME = u'arte.tv'
@@ -68,7 +69,7 @@ class ArteTvIE(InfoExtractor):
lang = mobj.group('lang')
return self._extract_liveweb(url, name, lang)
- if re.search(self._LIVE_URL, video_id) is not None:
+ if re.search(self._LIVE_URL, url) is not None:
raise ExtractorError(u'Arte live streams are not yet supported, sorry')
# self.extractLiveStream(url)
# return
@@ -77,8 +78,7 @@ class ArteTvIE(InfoExtractor):
"""Extract from videos.arte.tv"""
ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
- ref_xml = self._download_webpage(ref_xml_url, video_id, note=u'Downloading metadata')
- ref_xml_doc = xml.etree.ElementTree.fromstring(ref_xml)
+ ref_xml_doc = self._download_xml(ref_xml_url, video_id, note=u'Downloading metadata')
config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang)
config_xml_url = config_node.attrib['ref']
config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration')
@@ -108,13 +108,12 @@ class ArteTvIE(InfoExtractor):
"""Extract form http://liveweb.arte.tv/"""
webpage = self._download_webpage(url, name)
video_id = self._search_regex(r'eventId=(\d+?)("|&)', webpage, u'event id')
- config_xml = self._download_webpage('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id,
+ config_doc = self._download_xml('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id,
video_id, u'Downloading information')
- config_doc = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8'))
event_doc = config_doc.find('event')
url_node = event_doc.find('video').find('urlHd')
if url_node is None:
- url_node = video_doc.find('urlSd')
+ url_node = event_doc.find('urlSd')
return {'id': video_id,
'title': event_doc.find('name%s' % lang.capitalize()).text,
@@ -144,7 +143,9 @@ class ArteTVPlus7IE(InfoExtractor):
def _extract_from_webpage(self, webpage, video_id, lang):
json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
+ return self._extract_from_json_url(json_url, video_id, lang)
+ def _extract_from_json_url(self, json_url, video_id, lang):
json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
self.report_extraction(video_id)
info = json.loads(json_info)
@@ -181,20 +182,30 @@ class ArteTVPlus7IE(InfoExtractor):
formats = all_formats
else:
raise ExtractorError(u'The formats list is empty')
- # We order the formats by quality
+
if re.match(r'[A-Z]Q', formats[0]['quality']) is not None:
- sort_key = lambda f: ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality'])
+ def sort_key(f):
+ return ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality'])
else:
- sort_key = lambda f: int(f.get('height',-1))
+ def sort_key(f):
+ return (
+ # Sort first by quality
+ int(f.get('height',-1)),
+ int(f.get('bitrate',-1)),
+ # The original version with subtitles has lower relevance
+ re.match(r'VO-ST(F|A)', f.get('versionCode', '')) is None,
+ # The version with sourds/mal subtitles has also lower relevance
+ re.match(r'VO?(F|A)-STM\1', f.get('versionCode', '')) is None,
+ )
formats = sorted(formats, key=sort_key)
- # Prefer videos without subtitles in the same language
- formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f.get('versionCode', '')) is None)
- # Pick the best quality
def _format(format_info):
- quality = format_info['quality']
- m_quality = re.match(r'\w*? - (\d*)p', quality)
- if m_quality is not None:
- quality = m_quality.group(1)
+ quality = ''
+ height = format_info.get('height')
+ if height is not None:
+ quality = compat_str(height)
+ bitrate = format_info.get('bitrate')
+ if bitrate is not None:
+ quality += '-%d' % bitrate
if format_info.get('versionCode') is not None:
format_id = u'%s-%s' % (quality, format_info['versionCode'])
else:
@@ -203,7 +214,7 @@ class ArteTVPlus7IE(InfoExtractor):
'format_id': format_id,
'format_note': format_info.get('versionLibelle'),
'width': format_info.get('width'),
- 'height': format_info.get('height'),
+ 'height': height,
}
if format_info['mediaType'] == u'rtmp':
info['url'] = format_info['streamer']
@@ -249,3 +260,21 @@ class ArteTVFutureIE(ArteTVPlus7IE):
webpage = self._download_webpage(url, anchor_id)
row = get_element_by_id(anchor_id, webpage)
return self._extract_from_webpage(row, anchor_id, lang)
+
+
+class ArteTVDDCIE(ArteTVPlus7IE):
+ IE_NAME = u'arte.tv:ddc'
+ _VALID_URL = r'http?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>.+)'
+
+ def _real_extract(self, url):
+ video_id, lang = self._extract_url_info(url)
+ if lang == 'folge':
+ lang = 'de'
+ elif lang == 'emission':
+ lang = 'fr'
+ webpage = self._download_webpage(url, video_id)
+ scriptElement = get_element_by_attribute('class', 'visu_video_block', webpage)
+ script_url = self._html_search_regex(r'src="(.*?)"', scriptElement, 'script url')
+ javascriptPlayerGenerator = self._download_webpage(script_url, video_id, 'Download javascript player generator')
+ json_url = self._search_regex(r"json_url=(.*)&rendering_place.*", javascriptPlayerGenerator, 'json url')
+ return self._extract_from_json_url(json_url, video_id, lang)
diff --git a/youtube_dl/extractor/auengine.py b/youtube_dl/extractor/auengine.py
index 0febbff4f..bcccc0b7a 100644
--- a/youtube_dl/extractor/auengine.py
+++ b/youtube_dl/extractor/auengine.py
@@ -1,10 +1,10 @@
-import os.path
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
- compat_urllib_parse_urlparse,
+ determine_ext,
+ ExtractorError,
)
class AUEngineIE(InfoExtractor):
@@ -16,7 +16,7 @@ class AUEngineIE(InfoExtractor):
u"title": u"[Commie]The Legend of the Legendary Heroes - 03 - Replication Eye (Alpha Stigma)[F9410F5A]"
}
}
- _VALID_URL = r'(?:http://)?(?:www\.)?auengine\.com/embed.php\?.*?file=([^&]+).*?'
+ _VALID_URL = r'(?:http://)?(?:www\.)?auengine\.com/embed\.php\?.*?file=([^&]+).*?'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -25,22 +25,25 @@ class AUEngineIE(InfoExtractor):
title = self._html_search_regex(r'<title>(?P<title>.+?)</title>',
webpage, u'title')
title = title.strip()
- links = re.findall(r'[^A-Za-z0-9]?(?:file|url):\s*["\'](http[^\'"&]*)', webpage)
- links = [compat_urllib_parse.unquote(l) for l in links]
+ links = re.findall(r'\s(?:file|url):\s*["\']([^\'"]+)["\']', webpage)
+ links = map(compat_urllib_parse.unquote, links)
+
+ thumbnail = None
+ video_url = None
for link in links:
- root, pathext = os.path.splitext(compat_urllib_parse_urlparse(link).path)
- if pathext == '.png':
+ if link.endswith('.png'):
thumbnail = link
- elif pathext == '.mp4':
- url = link
- ext = pathext
+ elif '/videos/' in link:
+ video_url = link
+ if not video_url:
+ raise ExtractorError(u'Could not find video URL')
+ ext = u'.' + determine_ext(video_url)
if ext == title[-len(ext):]:
title = title[:-len(ext)]
- ext = ext[1:]
- return [{
+
+ return {
'id': video_id,
- 'url': url,
- 'ext': ext,
+ 'url': video_url,
'title': title,
'thumbnail': thumbnail,
- }]
+ }
diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py
new file mode 100644
index 000000000..d48c0c38d
--- /dev/null
+++ b/youtube_dl/extractor/bambuser.py
@@ -0,0 +1,86 @@
+import re
+import json
+import itertools
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_request,
+)
+
+
+class BambuserIE(InfoExtractor):
+ IE_NAME = u'bambuser'
+ _VALID_URL = r'https?://bambuser\.com/v/(?P<id>\d+)'
+ _API_KEY = '005f64509e19a868399060af746a00aa'
+
+ _TEST = {
+ u'url': u'http://bambuser.com/v/4050584',
+ # MD5 seems to be flaky, see https://travis-ci.org/rg3/youtube-dl/jobs/14051016#L388
+ #u'md5': u'fba8f7693e48fd4e8641b3fd5539a641',
+ u'info_dict': {
+ u'id': u'4050584',
+ u'ext': u'flv',
+ u'title': u'Education engineering days - lightning talks',
+ u'duration': 3741,
+ u'uploader': u'pixelversity',
+ u'uploader_id': u'344706',
+ },
+ u'params': {
+ # It doesn't respect the 'Range' header, it would download the whole video
+ # caused the travis builds to fail: https://travis-ci.org/rg3/youtube-dl/jobs/14493845#L59
+ u'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ info_url = ('http://player-c.api.bambuser.com/getVideo.json?'
+ '&api_key=%s&vid=%s' % (self._API_KEY, video_id))
+ info_json = self._download_webpage(info_url, video_id)
+ info = json.loads(info_json)['result']
+
+ return {
+ 'id': video_id,
+ 'title': info['title'],
+ 'url': info['url'],
+ 'thumbnail': info.get('preview'),
+ 'duration': int(info['length']),
+ 'view_count': int(info['views_total']),
+ 'uploader': info['username'],
+ 'uploader_id': info['uid'],
+ }
+
+
+class BambuserChannelIE(InfoExtractor):
+ IE_NAME = u'bambuser:channel'
+ _VALID_URL = r'https?://bambuser\.com/channel/(?P<user>.*?)(?:/|#|\?|$)'
+ # The maximum number we can get with each request
+ _STEP = 50
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ user = mobj.group('user')
+ urls = []
+ last_id = ''
+ for i in itertools.count(1):
+ req_url = ('http://bambuser.com/xhr-api/index.php?username={user}'
+ '&sort=created&access_mode=0%2C1%2C2&limit={count}'
+ '&method=broadcast&format=json&vid_older_than={last}'
+ ).format(user=user, count=self._STEP, last=last_id)
+ req = compat_urllib_request.Request(req_url)
+ # Without setting this header, we wouldn't get any result
+ req.add_header('Referer', 'http://bambuser.com/channel/%s' % user)
+ info_json = self._download_webpage(req, user,
+ u'Downloading page %d' % i)
+ results = json.loads(info_json)['result']
+ if len(results) == 0:
+ break
+ last_id = results[-1]['vid']
+ urls.extend(self.url_result(v['page'], 'Bambuser') for v in results)
+
+ return {
+ '_type': 'playlist',
+ 'title': user,
+ 'entries': urls,
+ }
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index 129a20f44..3a32c14c5 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -3,13 +3,16 @@ import re
from .common import InfoExtractor
from ..utils import (
+ compat_str,
+ compat_urlparse,
ExtractorError,
)
class BandcampIE(InfoExtractor):
+ IE_NAME = u'Bandcamp'
_VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
- _TEST = {
+ _TESTS = [{
u'url': u'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
u'file': u'1812978515.mp3',
u'md5': u'cdeb30cdae1921719a3cbcab696ef53c',
@@ -17,7 +20,7 @@ class BandcampIE(InfoExtractor):
u"title": u"youtube-dl test song \"'/\\\u00e4\u21ad"
},
u'skip': u'There is a limit of 200 free downloads / month for the test song'
- }
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -26,6 +29,23 @@ class BandcampIE(InfoExtractor):
# We get the link to the free download page
m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
if m_download is None:
+ m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)
+ if m_trackinfo:
+ json_code = m_trackinfo.group(1)
+ data = json.loads(json_code)
+
+ for d in data:
+ formats = [{
+ 'format_id': 'format_id',
+ 'url': format_url,
+ 'ext': format_id.partition('-')[0]
+ } for format_id, format_url in sorted(d['file'].items())]
+ return {
+ 'id': compat_str(d['id']),
+ 'title': d['title'],
+ 'formats': formats,
+ }
+ else:
raise ExtractorError(u'No free songs found')
download_link = m_download.group(1)
@@ -61,3 +81,49 @@ class BandcampIE(InfoExtractor):
}
return [track_info]
+
+
+class BandcampAlbumIE(InfoExtractor):
+    """Playlist extractor for bandcamp.com album pages."""
+    IE_NAME = u'Bandcamp:album'
+    _VALID_URL = r'http://.*?\.bandcamp\.com/album/(?P<title>.*)'
+
+    _TEST = {
+        u'url': u'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
+        u'playlist': [
+            {
+                u'file': u'1353101989.mp3',
+                u'md5': u'39bc1eded3476e927c724321ddf116cf',
+                u'info_dict': {
+                    u'title': u'Intro',
+                }
+            },
+            {
+                u'file': u'38097443.mp3',
+                u'md5': u'1a2c32e2691474643e912cc6cd4bffaa',
+                u'info_dict': {
+                    u'title': u'Kero One - Keep It Alive (Blazo remix)',
+                }
+            },
+        ],
+        u'params': {
+            u'playlistend': 2
+        },
+        u'skip': u'Bancamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test'
+    }
+
+    def _real_extract(self, url):
+        """Return a playlist with one entry per track linked on the album page.
+
+        Raises ExtractorError when the page links no track.
+        """
+        # The URL slug is only used to label the download of the page.
+        album_slug = re.match(self._VALID_URL, url).group('title')
+        webpage = self._download_webpage(url, album_slug)
+
+        track_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage)
+        if not track_paths:
+            raise ExtractorError(u'The page doesn\'t contain any track')
+
+        track_ie = BandcampIE.ie_key()
+        entries = []
+        for track_path in track_paths:
+            # Track links are resolved against the album URL.
+            track_url = compat_urlparse.urljoin(url, track_path)
+            entries.append(self.url_result(track_url, ie=track_ie))
+
+        album_title = self._search_regex(
+            r'album_title : "(.*?)"', webpage, u'title')
+        return {
+            '_type': 'playlist',
+            'title': album_title,
+            'entries': entries,
+        }
diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py
new file mode 100644
index 000000000..e118f2e9f
--- /dev/null
+++ b/youtube_dl/extractor/blinkx.py
@@ -0,0 +1,90 @@
+import datetime
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ remove_start,
+)
+
+
+class BlinkxIE(InfoExtractor):
+    """Information extractor for videos hosted on blinkx.com.
+
+    Handles ``http(s)://[www.]blinkx.com/ce/<id>`` URLs as well as the
+    ``blinkx:<id>`` shorthand.
+    """
+    # "www." is optional.
+    _VALID_URL = r'^(?:https?://(?:www\.)?blinkx\.com/ce/|blinkx:)(?P<id>[^?]+)'
+    IE_NAME = u'blinkx'
+
+    _TEST = {
+        u'url': u'http://www.blinkx.com/ce/8aQUy7GVFYgFzpKhT0oqsilwOGFRVXk3R1ZGWWdGenBLaFQwb3FzaWx3OGFRVXk3R1ZGWWdGenB',
+        u'file': u'8aQUy7GV.mp4',
+        u'md5': u'2e9a07364af40163a908edbf10bb2492',
+        u'info_dict': {
+            u"title": u"Police Car Rolls Away",
+            u"uploader": u"stupidvideos.com",
+            u"upload_date": u"20131215",
+            u"description": u"A police car gently rolls away from a fight. Maybe it felt weird being around a confrontation and just had to get out of there!",
+            u"duration": 14.886,
+            u"thumbnails": [{
+                "width": 100,
+                "height": 76,
+                "url": "http://cdn.blinkx.com/stream/b/41/StupidVideos/20131215/1873969261/1873969261_tn_0.jpg",
+            }],
+        },
+    }
+
+    def _real_extract(self, url):
+        """Query the play_video API and build the info dict of one video.
+
+        Returns a ``url_result`` for the Youtube extractor as soon as a media
+        entry of type 'youtube' is found. Otherwise returns a dict holding the
+        flv/mp4 formats (sorted by width, then video and audio bitrate), the
+        jpg thumbnails and the metadata reported by the API.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        # The first 8 characters of the full id are used as the result id.
+        display_id = video_id[:8]
+
+        api_url = (u'https://apib4.blinkx.com/api.php?action=play_video&' +
+                   u'video=%s' % video_id)
+        data_json = self._download_webpage(api_url, display_id)
+        data = json.loads(data_json)['api']['results'][0]
+        # Convert in UTC so that the date does not depend on the timezone of
+        # the machine running the extraction.
+        dt = datetime.datetime.utcfromtimestamp(data['pubdate_epoch'])
+        upload_date = dt.strftime('%Y%m%d')
+
+        duration = None
+        thumbnails = []
+        formats = []
+        for media in data['media']:
+            media_type = media['type']
+            if media_type == 'jpg':
+                thumbnails.append({
+                    'url': media['link'],
+                    'width': int(media['w']),
+                    'height': int(media['h']),
+                })
+            elif media_type == 'original':
+                # Only this entry provides the duration.
+                duration = media['d']
+            elif media_type == 'youtube':
+                yt_id = media['link']
+                self.to_screen(u'Youtube video detected: %s' % yt_id)
+                return self.url_result(yt_id, 'Youtube', video_id=yt_id)
+            elif media_type in ('flv', 'mp4'):
+                vcodec = remove_start(media['vcodec'], 'ff')
+                acodec = remove_start(media['acodec'], 'ff')
+                # The API values are divided by 1000 for the reported bitrates.
+                abr = int(media['abr']) // 1000
+                vbr = int(media['vbr']) // 1000
+                format_id = (u'%s-%sk-%s' %
+                             (vcodec,
+                              (int(media['vbr']) + int(media['abr'])) // 1000,
+                              media['w']))
+                formats.append({
+                    'format_id': format_id,
+                    'url': media['link'],
+                    'vcodec': vcodec,
+                    'acodec': acodec,
+                    'abr': abr,
+                    'vbr': vbr,
+                    'width': int(media['w']),
+                    'height': int(media['h']),
+                })
+        formats.sort(key=lambda f: (f['width'], f['vbr'], f['abr']))
+
+        return {
+            'id': display_id,
+            'fullid': video_id,
+            'title': data['title'],
+            'formats': formats,
+            'uploader': data['channel_name'],
+            'upload_date': upload_date,
+            'description': data.get('description'),
+            'thumbnails': thumbnails,
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py
index 493504f75..5e33a69df 100644
--- a/youtube_dl/extractor/bliptv.py
+++ b/youtube_dl/extractor/bliptv.py
@@ -51,8 +51,7 @@ class BlipTVIE(InfoExtractor):
url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
urlp = compat_urllib_parse_urlparse(url)
if urlp.path.startswith('/play/'):
- request = compat_urllib_request.Request(url)
- response = compat_urllib_request.urlopen(request)
+ response = self._request_webpage(url, None, False)
redirecturl = response.geturl()
rurlp = compat_urllib_parse_urlparse(redirecturl)
file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
@@ -69,25 +68,23 @@ class BlipTVIE(InfoExtractor):
request.add_header('User-Agent', 'iTunes/10.6.1')
self.report_extraction(mobj.group(1))
info = None
- try:
- urlh = compat_urllib_request.urlopen(request)
- if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
- basename = url.split('/')[-1]
- title,ext = os.path.splitext(basename)
- title = title.decode('UTF-8')
- ext = ext.replace('.', '')
- self.report_direct_download(title)
- info = {
- 'id': title,
- 'url': url,
- 'uploader': None,
- 'upload_date': None,
- 'title': title,
- 'ext': ext,
- 'urlhandle': urlh
- }
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
+ urlh = self._request_webpage(request, None, False,
+ u'unable to download video info webpage')
+ if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
+ basename = url.split('/')[-1]
+ title,ext = os.path.splitext(basename)
+ title = title.decode('UTF-8')
+ ext = ext.replace('.', '')
+ self.report_direct_download(title)
+ info = {
+ 'id': title,
+ 'url': url,
+ 'uploader': None,
+ 'upload_date': None,
+ 'title': title,
+ 'ext': ext,
+ 'urlhandle': urlh
+ }
if info is None: # Regular URL
try:
json_code_bytes = urlh.read()
diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py
index 3666a780b..755d9c9ef 100644
--- a/youtube_dl/extractor/bloomberg.py
+++ b/youtube_dl/extractor/bloomberg.py
@@ -4,7 +4,7 @@ from .common import InfoExtractor
class BloombergIE(InfoExtractor):
- _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<name>.+?).html'
+ _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<name>.+?)\.html'
_TEST = {
u'url': u'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html',
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index 1392f382a..b1b7526ca 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -9,10 +9,13 @@ from ..utils import (
compat_urllib_parse,
find_xpath_attr,
compat_urlparse,
+ compat_str,
+ compat_urllib_request,
ExtractorError,
)
+
class BrightcoveIE(InfoExtractor):
_VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)'
_FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
@@ -23,7 +26,7 @@ class BrightcoveIE(InfoExtractor):
# From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/
u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001',
u'file': u'2371591881001.mp4',
- u'md5': u'9e80619e0a94663f0bdc849b4566af19',
+ u'md5': u'8eccab865181d29ec2958f32a6a754f5',
u'note': u'Test Brightcove downloads and detection in GenericIE',
u'info_dict': {
u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
@@ -41,6 +44,29 @@ class BrightcoveIE(InfoExtractor):
u'uploader': u'Oracle',
},
},
+ {
+ # From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/
+ u'url': u'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001',
+ u'info_dict': {
+ u'id': u'2750934548001',
+ u'ext': u'mp4',
+ u'title': u'This Bracelet Acts as a Personal Thermostat',
+ u'description': u'md5:547b78c64f4112766ccf4e151c20b6a0',
+ u'uploader': u'Mashable',
+ },
+ },
+ {
+ # test that the default referer works
+ # from http://national.ballet.ca/interact/video/Lost_in_Motion_II/
+ u'url': u'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001',
+ u'info_dict': {
+ u'id': u'2878862109001',
+ u'ext': u'mp4',
+ u'title': u'Lost in Motion II',
+ u'description': u'md5:363109c02998fee92ec02211bd8000df',
+ u'uploader': u'National Ballet of Canada',
+ },
+ },
]
@classmethod
@@ -61,31 +87,65 @@ class BrightcoveIE(InfoExtractor):
params = {'flashID': object_doc.attrib['id'],
'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'],
}
- playerKey = find_xpath_attr(object_doc, './param', 'name', 'playerKey')
+ def find_param(name):
+ node = find_xpath_attr(object_doc, './param', 'name', name)
+ if node is not None:
+ return node.attrib['value']
+ return None
+ playerKey = find_param('playerKey')
# Not all pages define this value
if playerKey is not None:
- params['playerKey'] = playerKey.attrib['value']
- videoPlayer = find_xpath_attr(object_doc, './param', 'name', '@videoPlayer')
+ params['playerKey'] = playerKey
+ # The three fields hold the id of the video
+ videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID')
if videoPlayer is not None:
- params['@videoPlayer'] = videoPlayer.attrib['value']
+ params['@videoPlayer'] = videoPlayer
+ linkBase = find_param('linkBaseURL')
+ if linkBase is not None:
+ params['linkBaseURL'] = linkBase
data = compat_urllib_parse.urlencode(params)
return cls._FEDERATED_URL_TEMPLATE % data
+ @classmethod
+ def _extract_brightcove_url(cls, webpage):
+ """Try to extract the brightcove url from the webpage, returns None
+ if it can't be found.
+
+ The first <object> element whose class attribute contains
+ "BrightcoveExperience" is handed to _build_brighcove_url.
+ """
+ m_brightcove = re.search(
+ r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>',
+ webpage, re.DOTALL)
+ if m_brightcove is not None:
+ return cls._build_brighcove_url(m_brightcove.group())
+ else:
+ return None
+
def _real_extract(self, url):
+ # Change the 'videoId' and others field to '@videoPlayer'
+ url = re.sub(r'(?<=[?&])(videoI(d|D)|bctid)', '%40videoPlayer', url)
+ # Change bckey (used by bcove.me urls) to playerKey
+ url = re.sub(r'(?<=[?&])bckey', 'playerKey', url)
mobj = re.match(self._VALID_URL, url)
query_str = mobj.group('query')
query = compat_urlparse.parse_qs(query_str)
videoPlayer = query.get('@videoPlayer')
if videoPlayer:
- return self._get_video_info(videoPlayer[0], query_str)
+ return self._get_video_info(videoPlayer[0], query_str, query,
+ # We set the original url as the default 'Referer' header
+ referer=url)
else:
player_key = query['playerKey']
return self._get_playlist_info(player_key[0])
- def _get_video_info(self, video_id, query):
- request_url = self._FEDERATED_URL_TEMPLATE % query
- webpage = self._download_webpage(request_url, video_id)
+ def _get_video_info(self, video_id, query_str, query, referer=None):
+ request_url = self._FEDERATED_URL_TEMPLATE % query_str
+ req = compat_urllib_request.Request(request_url)
+ linkBase = query.get('linkBaseURL')
+ if linkBase is not None:
+ referer = linkBase[0]
+ if referer is not None:
+ req.add_header('Referer', referer)
+ webpage = self._download_webpage(req, video_id)
self.report_extraction(video_id)
info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json')
@@ -109,7 +169,7 @@ class BrightcoveIE(InfoExtractor):
def _extract_video_info(self, video_info):
info = {
- 'id': video_info['id'],
+ 'id': compat_str(video_info['id']),
'title': video_info['displayName'],
'description': video_info.get('shortDescription'),
'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'),
@@ -119,15 +179,14 @@ class BrightcoveIE(InfoExtractor):
renditions = video_info.get('renditions')
if renditions:
renditions = sorted(renditions, key=lambda r: r['size'])
- best_format = renditions[-1]
- info.update({
- 'url': best_format['defaultURL'],
- 'ext': 'mp4',
- })
+ info['formats'] = [{
+ 'url': rend['defaultURL'],
+ 'height': rend.get('frameHeight'),
+ 'width': rend.get('frameWidth'),
+ } for rend in renditions]
elif video_info.get('FLVFullLengthURL') is not None:
info.update({
'url': video_info['FLVFullLengthURL'],
- 'ext': 'flv',
})
else:
raise ExtractorError(u'Unable to extract video url for %s' % info['id'])
diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py
index e7f4fa9fd..3d8d7f9d2 100644
--- a/youtube_dl/extractor/canalc2.py
+++ b/youtube_dl/extractor/canalc2.py
@@ -6,7 +6,7 @@ from .common import InfoExtractor
class Canalc2IE(InfoExtractor):
IE_NAME = 'canalc2.tv'
- _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?idVideo=(\d+)&voir=oui'
+ _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?.*?idVideo=(?P<id>\d+)'
_TEST = {
u'url': u'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui',
@@ -18,7 +18,9 @@ class Canalc2IE(InfoExtractor):
}
def _real_extract(self, url):
- video_id = re.match(self._VALID_URL, url).group(1)
+ video_id = re.match(self._VALID_URL, url).group('id')
+ # We need to set the voir field for getting the file name
+ url = 'http://www.canalc2.tv/video.asp?idVideo=%s&voir=oui' % video_id
webpage = self._download_webpage(url, video_id)
file_name = self._search_regex(
r"so\.addVariable\('file','(.*?)'\);",
diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py
index 1db9b24cf..7cdcd8399 100644
--- a/youtube_dl/extractor/canalplus.py
+++ b/youtube_dl/extractor/canalplus.py
@@ -1,10 +1,10 @@
# encoding: utf-8
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import unified_strdate
+
class CanalplusIE(InfoExtractor):
_VALID_URL = r'https?://(www\.canalplus\.fr/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>\d+))'
_VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s'
@@ -25,16 +25,15 @@ class CanalplusIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = mobj.groupdict().get('id')
if video_id is None:
webpage = self._download_webpage(url, mobj.group('path'))
video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id')
info_url = self._VIDEO_INFO_TEMPLATE % video_id
- info_page = self._download_webpage(info_url,video_id,
+ doc = self._download_xml(info_url,video_id,
u'Downloading video info')
self.report_extraction(video_id)
- doc = xml.etree.ElementTree.fromstring(info_page.encode('utf-8'))
video_info = [video for video in doc if video.find('ID').text == video_id][0]
infos = video_info.find('INFOS')
media = video_info.find('MEDIA')
diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py
new file mode 100644
index 000000000..ac0315853
--- /dev/null
+++ b/youtube_dl/extractor/cbs.py
@@ -0,0 +1,30 @@
+import re
+
+from .common import InfoExtractor
+
+
+class CBSIE(InfoExtractor):
+    """Extractor for cbs.com show videos; delegates to ``theplatform:<pid>``."""
+    _VALID_URL = r'https?://(?:www\.)?cbs\.com/shows/[^/]+/video/(?P<id>[^/]+)/.*'
+
+    _TEST = {
+        u'url': u'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
+        u'file': u'4JUVEwq3wUT7.flv',
+        u'info_dict': {
+            u'title': u'Connect Chat feat. Garth Brooks',
+            u'description': u'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!',
+            u'duration': 1495,
+        },
+        u'params': {
+            # rtmp download
+            u'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        """Read the player pid from the page and return a url_result for it."""
+        page_id = re.match(self._VALID_URL, url).group('id')
+        webpage = self._download_webpage(url, page_id)
+        # The id in the URL is not the one the player uses; the real one is
+        # assigned to video.settings.pid in the page's script.
+        pid = self._search_regex(
+            r"video\.settings\.pid\s*=\s*'([^']+)';",
+            webpage, u'real video ID')
+        target_url = u'theplatform:%s' % pid
+        return self.url_result(target_url)
diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py
new file mode 100644
index 000000000..ae70ea229
--- /dev/null
+++ b/youtube_dl/extractor/channel9.py
@@ -0,0 +1,267 @@
+# encoding: utf-8
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+class Channel9IE(InfoExtractor):
+    '''
+    Common extractor for channel9.msdn.com.
+
+    The type of provided URL (video or playlist) is determined according to
+    meta Search.PageType from web page HTML rather than URL itself, as it is
+    not always possible to do.
+    '''
+    IE_DESC = u'Channel 9'
+    IE_NAME = u'channel9'
+    # The lazy ``.+?`` combined with ``/?$`` keeps an optional trailing slash
+    # out of the content path, which is used as the id and in the RSS URL.
+    _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)/?$'
+
+    _TESTS = [
+        {
+            u'url': u'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
+            u'file': u'Events_TechEd_Australia_2013_KOS002.mp4',
+            u'md5': u'bbd75296ba47916b754e73c3a4bbdf10',
+            u'info_dict': {
+                u'title': u'Developer Kick-Off Session: Stuff We Love',
+                u'description': u'md5:c08d72240b7c87fcecafe2692f80e35f',
+                u'duration': 4576,
+                u'thumbnail': u'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
+                u'session_code': u'KOS002',
+                u'session_day': u'Day 1',
+                u'session_room': u'Arena 1A',
+                u'session_speakers': [ u'Ed Blankenship', u'Andrew Coates', u'Brady Gaster', u'Patrick Klug', u'Mads Kristensen' ],
+            },
+        },
+        {
+            u'url': u'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
+            u'file': u'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
+            u'md5': u'b43ee4529d111bc37ba7ee4f34813e68',
+            u'info_dict': {
+                u'title': u'Self-service BI with Power BI - nuclear testing',
+                u'description': u'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
+                u'duration': 1540,
+                u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
+                u'authors': [ u'Mike Wilmot' ],
+            },
+        }
+    ]
+
+    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
+
+    # Sorted by quality
+    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
+
+    def _restore_bytes(self, formatted_size):
+        """Convert a size such as u'12.5 MB' to a number of bytes.
+
+        Returns 0 when the text is empty, cannot be parsed or uses an unknown
+        unit. Units are powers of 1024 and matched case-insensitively.
+        """
+        if not formatted_size:
+            return 0
+        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
+        if not m:
+            return 0
+        units = m.group('units')
+        try:
+            # The position in the list is the power of 1024 to apply.
+            exponent = [u'B', u'KB', u'MB', u'GB', u'TB', u'PB', u'EB', u'ZB', u'YB'].index(units.upper())
+        except ValueError:
+            return 0
+        size = float(m.group('size'))
+        return int(size * (1024 ** exponent))
+
+    def _formats_from_html(self, html):
+        """Return the download formats found in *html*, lowest quality first.
+
+        Only links whose label is listed in ``_known_formats`` are kept; the
+        result is ordered like that list.
+        """
+        # The inline flag has to be the very first thing in the pattern:
+        # Python >= 3.11 rejects global flags placed anywhere else.
+        FORMAT_REGEX = r'''(?x)
+            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
+            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
+            (?:<div\s+class="popup\s+rounded">\s*
+            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
+            </div>)?                                                # File size part may be missing
+        '''
+        # Extract known formats
+        formats = [{'url': x.group('url'),
+                    'format_id': x.group('quality'),
+                    'format_note': x.group('note'),
+                    'format': '%s (%s)' % (x.group('quality'), x.group('note')),
+                    'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
+                    } for x in re.finditer(FORMAT_REGEX, html) if x.group('quality') in self._known_formats]
+        # Sort according to known formats list
+        formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id']))
+        return formats
+
+    def _extract_title(self, html):
+        """Return the page title with the u' (Channel 9)' suffix removed.
+
+        The 'title' meta tag is tried first, then ``_og_search_title``.
+        """
+        title = self._html_search_meta(u'title', html, u'title')
+        if title is None:
+            title = self._og_search_title(html)
+        TITLE_SUFFIX = u' (Channel 9)'
+        if title is not None and title.endswith(TITLE_SUFFIX):
+            title = title[:-len(TITLE_SUFFIX)]
+        return title
+
+    def _extract_description(self, html):
+        """Return the HTML of the entry body, or the 'description' meta tag."""
+        DESCRIPTION_REGEX = r'''(?sx)
+            <div\s+class="entry-content">\s*
+            <div\s+id="entry-body">\s*
+            (?P<description>.+?)\s*
+            </div>\s*
+            </div>
+        '''
+        m = re.search(DESCRIPTION_REGEX, html)
+        if m is not None:
+            return m.group('description')
+        return self._html_search_meta(u'description', html, u'description')
+
+    def _extract_duration(self, html):
+        """Return the duration in seconds from data-video_duration, or None."""
+        m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
+        if not m:
+            return None
+        return (int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))
+
+    def _extract_slides(self, html):
+        """Return the URL of the 'Slides' link, or None."""
+        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
+        return m.group('slidesurl') if m is not None else None
+
+    def _extract_zip(self, html):
+        """Return the URL of the 'Zip' link, or None."""
+        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
+        return m.group('zipurl') if m is not None else None
+
+    def _extract_avg_rating(self, html):
+        """Return the average rating as a float, 0 when it is not shown."""
+        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
+        return float(m.group('avgrating')) if m is not None else 0
+
+    def _extract_rating_count(self, html):
+        """Return the number of ratings, 0 when it is not shown."""
+        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
+        return self._fix_count(m.group('ratingcount')) if m is not None else 0
+
+    def _extract_view_count(self, html):
+        """Return the number of views, 0 when it is not shown."""
+        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
+        return self._fix_count(m.group('viewcount')) if m is not None else 0
+
+    def _extract_comment_count(self, html):
+        """Return the number of comments, 0 when it is not shown."""
+        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
+        return self._fix_count(m.group('commentcount')) if m is not None else 0
+
+    def _fix_count(self, count):
+        """Convert a count such as '1,234' to an int; None stays None."""
+        return int(str(count).replace(',', '')) if count is not None else None
+
+    def _extract_authors(self, html):
+        """Return the names linked in the 'author' list item, or None without it."""
+        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
+        if m is None:
+            return None
+        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
+
+    def _extract_session_code(self, html):
+        """Return the text of the 'code' list item, or None."""
+        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
+        return m.group('code') if m is not None else None
+
+    def _extract_session_day(self, html):
+        """Return the label of the link in the 'day' list item, or None."""
+        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
+        return m.group('day') if m is not None else None
+
+    def _extract_session_room(self, html):
+        """Return the text of the 'room' list item, or None."""
+        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
+        return m.group('room') if m is not None else None
+
+    def _extract_session_speakers(self, html):
+        """Return the labels of all the /Events/Speakers/ links of the page."""
+        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
+
+    def _extract_content(self, html, content_path):
+        """Collect everything downloadable from a content page.
+
+        Returns a list with one info dict for each of the slides, the zip and
+        the recording (in that order) that is present. All of them share the
+        same metadata and use *content_path* as id. When none is available a
+        warning is reported and None is returned.
+        """
+        # Look for downloadable content
+        formats = self._formats_from_html(html)
+        slides = self._extract_slides(html)
+        zip_ = self._extract_zip(html)
+
+        # Nothing to download
+        if len(formats) == 0 and slides is None and zip_ is None:
+            self._downloader.report_warning(u'None of recording, slides or zip are available for %s' % content_path)
+            return
+
+        # Extract meta
+        title = self._extract_title(html)
+        description = self._extract_description(html)
+        thumbnail = self._og_search_thumbnail(html)
+        duration = self._extract_duration(html)
+        avg_rating = self._extract_avg_rating(html)
+        rating_count = self._extract_rating_count(html)
+        view_count = self._extract_view_count(html)
+        comment_count = self._extract_comment_count(html)
+
+        common = {'_type': 'video',
+                  'id': content_path,
+                  'description': description,
+                  'thumbnail': thumbnail,
+                  'duration': duration,
+                  'avg_rating': avg_rating,
+                  'rating_count': rating_count,
+                  'view_count': view_count,
+                  'comment_count': comment_count,
+                  }
+
+        result = []
+
+        if slides is not None:
+            d = common.copy()
+            d.update({ 'title': title + '-Slides', 'url': slides })
+            result.append(d)
+
+        if zip_ is not None:
+            d = common.copy()
+            d.update({ 'title': title + '-Zip', 'url': zip_ })
+            result.append(d)
+
+        if len(formats) > 0:
+            d = common.copy()
+            d.update({ 'title': title, 'formats': formats })
+            result.append(d)
+
+        return result
+
+    def _extract_entry_item(self, html, content_path):
+        """Return the contents of an 'Entry.Item' page, each with its authors."""
+        contents = self._extract_content(html, content_path)
+        if contents is None:
+            return contents
+
+        authors = self._extract_authors(html)
+
+        for content in contents:
+            content['authors'] = authors
+
+        return contents
+
+    def _extract_session(self, html, content_path):
+        """Return the contents of a 'Session' page, each with the session data."""
+        contents = self._extract_content(html, content_path)
+        if contents is None:
+            return contents
+
+        session_meta = {'session_code': self._extract_session_code(html),
+                        'session_day': self._extract_session_day(html),
+                        'session_room': self._extract_session_room(html),
+                        'session_speakers': self._extract_session_speakers(html),
+                        }
+
+        for content in contents:
+            content.update(session_meta)
+
+        return contents
+
+    def _extract_list(self, content_path):
+        """Return a playlist built from the item links of the path's RSS feed."""
+        rss = self._download_xml(self._RSS_URL % content_path, content_path, u'Downloading RSS')
+        entries = [self.url_result(session_url.text, 'Channel9')
+                   for session_url in rss.findall('./channel/item/link')]
+        title_text = rss.find('./channel/title').text
+        return self.playlist_result(entries, content_path, title_text)
+
+    def _real_extract(self, url):
+        """Download the page and dispatch on its Search.PageType meta tag.
+
+        Raises ExtractorError when the tag is missing or holds a value other
+        than 'List', 'Entry.Item' or 'Session'.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        content_path = mobj.group('contentpath')
+
+        webpage = self._download_webpage(url, content_path, u'Downloading web page')
+
+        page_type_m = re.search(r'<meta name="Search\.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
+        if page_type_m is None:
+            raise ExtractorError(u'Search.PageType not found, don\'t know how to process this page', expected=True)
+
+        page_type = page_type_m.group('pagetype')
+        if page_type == 'List':         # List page, may contain list of 'item'-like objects
+            return self._extract_list(content_path)
+        elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content
+            return self._extract_entry_item(webpage, content_path)
+        elif page_type == 'Session':    # Event session page, may contain downloadable content
+            return self._extract_session(webpage, content_path)
+        else:
+            raise ExtractorError(u'Unexpected Search.PageType %s' % page_type, expected=True)
\ No newline at end of file
diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py
index 2fe1033f0..f0d08cebf 100644
--- a/youtube_dl/extractor/cinemassacre.py
+++ b/youtube_dl/extractor/cinemassacre.py
@@ -41,7 +41,7 @@ class CinemassacreIE(InfoExtractor):
webpage_url = u'http://' + mobj.group('url')
webpage = self._download_webpage(webpage_url, None) # Don't know video id yet
video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d')
- mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/(?:embed|player)\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)
+ mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)
if not mobj:
raise ExtractorError(u'Can\'t extract embed url and video id')
playerdata_url = mobj.group(u'embed_url')
@@ -65,6 +65,7 @@ class CinemassacreIE(InfoExtractor):
{
'url': url,
'play_path': 'mp4:' + sd_file,
+ 'rtmp_live': True, # workaround
'ext': 'flv',
'format': 'sd',
'format_id': 'sd',
@@ -72,6 +73,7 @@ class CinemassacreIE(InfoExtractor):
{
'url': url,
'play_path': 'mp4:' + hd_file,
+ 'rtmp_live': True, # workaround
'ext': 'flv',
'format': 'hd',
'format_id': 'hd',
diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py
new file mode 100644
index 000000000..43efb08bf
--- /dev/null
+++ b/youtube_dl/extractor/clipfish.py
@@ -0,0 +1,58 @@
+import re
+import time
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class ClipfishIE(InfoExtractor):
+    """Extractor for clipfish.de videos, based on the devxml videoinfo document."""
+    IE_NAME = u'clipfish'
+
+    _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/'
+    _TEST = {
+        u'url': u'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/',
+        u'file': u'3966754.mp4',
+        u'md5': u'2521cd644e862936cf2e698206e47385',
+        u'info_dict': {
+            u'title': u'FIFA 14 - E3 2013 Trailer',
+            u'duration': 82,
+        },
+        u'skip': 'Blocked in the US'
+    }
+
+    def _real_extract(self, url):
+        """Download the videoinfo XML and map its elements to an info dict.
+
+        Raises ExtractorError when the <filename> element has no text. The
+        duration is None when it is not in ``H:MM:SS:ms`` form.
+        """
+        video_id = re.match(self._VALID_URL, url).group(1)
+
+        # The current time is sent in the ``ts`` query parameter.
+        timestamp = int(time.time())
+        info_url = 'http://www.clipfish.de/devxml/videoinfo/%s?ts=%d' % (
+            video_id, timestamp)
+        doc = self._download_xml(
+            info_url, video_id, note=u'Downloading info page')
+
+        title = doc.find('title').text
+        video_url = doc.find('filename').text
+        if video_url is None:
+            raise ExtractorError(
+                u'Cannot find video URL in document %r' %
+                xml.etree.ElementTree.tostring(doc))
+        thumbnail = doc.find('imageurl').text
+
+        duration_mobj = re.match(
+            r'^(?P<hours>[0-9]+):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9]{2}):(?P<ms>[0-9]*)$',
+            doc.find('duration').text)
+        duration = None
+        if duration_mobj:
+            # The millisecond part is ignored.
+            hours = int(duration_mobj.group('hours'))
+            minutes = int(duration_mobj.group('minutes'))
+            seconds = int(duration_mobj.group('seconds'))
+            duration = (hours * 60 * 60) + (minutes * 60) + seconds
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'thumbnail': thumbnail,
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py
new file mode 100644
index 000000000..c60089ad3
--- /dev/null
+++ b/youtube_dl/extractor/clipsyndicate.py
@@ -0,0 +1,50 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ find_xpath_attr,
+ fix_xml_all_ampersand,
+)
+
+
+class ClipsyndicateIE(InfoExtractor):
+    """Extractor for clipsyndicate.com videos (single videos and playlist items)."""
+    _VALID_URL = r'http://www\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)'
+
+    _TEST = {
+        u'url': u'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe',
+        u'md5': u'4d7d549451bad625e0ff3d7bd56d776c',
+        u'info_dict': {
+            u'id': u'4629301',
+            u'ext': u'mp4',
+            u'title': u'Brick Briscoe',
+            u'duration': 612,
+        },
+    }
+
+    def _real_extract(self, url):
+        """Fetch the embed player and the OSMF playlist of one video.
+
+        The player script provides the ``flvars`` query string that the
+        playlist request needs; the first track of the playlist supplies the
+        media URL and the metadata. ``thumbnail`` and ``duration`` are None
+        when the track does not define them.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        js_player = self._download_webpage(
+            'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id,
+            video_id, u'Downloading player')
+        # it includes a required token
+        flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars')
+
+        pdoc = self._download_xml(
+            'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
+            video_id, u'Downloading video info',
+            transform_source=fix_xml_all_ampersand)
+
+        track_doc = pdoc.find('trackList/track')
+
+        def find_param(name):
+            """Return the value of the track's <param> called *name*, or None."""
+            node = find_xpath_attr(track_doc, './/param', 'name', name)
+            if node is not None:
+                return node.attrib['value']
+            return None
+
+        # A track without a duration param must not abort the extraction.
+        duration = find_param('duration')
+        if duration is not None:
+            duration = int(duration)
+
+        return {
+            'id': video_id,
+            'title': find_param('title'),
+            'url': track_doc.find('location').text,
+            'thumbnail': find_param('thumbnail'),
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py
index a79f881cd..a034bb2fb 100644
--- a/youtube_dl/extractor/cnn.py
+++ b/youtube_dl/extractor/cnn.py
@@ -1,12 +1,11 @@
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import determine_ext
class CNNIE(InfoExtractor):
- _VALID_URL = r'''(?x)https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/
+ _VALID_URL = r'''(?x)https?://((edition|www)\.)?cnn\.com/video/(data/.+?|\?)/
(?P<path>.+?/(?P<title>[^/]+?)(?:\.cnn|(?=&)))'''
_TESTS = [{
@@ -33,8 +32,7 @@ class CNNIE(InfoExtractor):
path = mobj.group('path')
page_title = mobj.group('title')
info_url = u'http://cnn.com/video/data/3.0/%s/index.xml' % path
- info_xml = self._download_webpage(info_url, page_title)
- info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
+ info = self._download_xml(info_url, page_title)
formats = []
for f in info.findall('files/file'):
diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py
index 8d4c93d6d..b27c1dfc5 100644
--- a/youtube_dl/extractor/collegehumor.py
+++ b/youtube_dl/extractor/collegehumor.py
@@ -1,5 +1,4 @@
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@@ -46,11 +45,10 @@ class CollegeHumorIE(InfoExtractor):
self.report_extraction(video_id)
xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
- metaXml = self._download_webpage(xmlUrl, video_id,
+ mdoc = self._download_xml(xmlUrl, video_id,
u'Downloading info XML',
u'Unable to download video info XML')
- mdoc = xml.etree.ElementTree.fromstring(metaXml)
try:
videoNode = mdoc.findall('./video')[0]
youtubeIdNode = videoNode.find('./youtubeID')
@@ -65,16 +63,13 @@ class CollegeHumorIE(InfoExtractor):
if next_url.endswith(u'manifest.f4m'):
manifest_url = next_url + '?hdcore=2.10.3'
- manifestXml = self._download_webpage(manifest_url, video_id,
+ adoc = self._download_xml(manifest_url, video_id,
u'Downloading XML manifest',
u'Unable to download video info XML')
- adoc = xml.etree.ElementTree.fromstring(manifestXml)
try:
- media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
- node_id = media_node.attrib['url']
video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
- except IndexError as err:
+ except IndexError:
raise ExtractorError(u'Invalid manifest file')
url_pr = compat_urllib_parse_urlparse(info['thumbnail'])
info['url'] = url_pr.scheme + '://' + url_pr.netloc + video_id[:-2].replace('.csmil','').replace(',','')
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py
index 69b2beece..a54ce3ee7 100644
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -1,7 +1,7 @@
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
+from .mtv import MTVServicesInfoExtractor
from ..utils import (
compat_str,
compat_urllib_parse,
@@ -11,7 +11,31 @@ from ..utils import (
)
-class ComedyCentralIE(InfoExtractor):
+class ComedyCentralIE(MTVServicesInfoExtractor):
+ """Extractor for comedycentral.com video-clips, episodes and cc-studios pages."""
+ _VALID_URL = r'https?://(?:www\.)?comedycentral\.com/(video-clips|episodes|cc-studios)/(?P<title>.*)'
+ _FEED_URL = u'http://comedycentral.com/feeds/mrss/'
+
+ _TEST = {
+ u'url': u'http://www.comedycentral.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother',
+ u'md5': u'4167875aae411f903b751a21f357f1ee',
+ u'info_dict': {
+ u'id': u'cef0cbb3-e776-4bc9-b62e-8016deccb354',
+ u'ext': u'mp4',
+ u'title': u'Uncensored - Greg Fitzsimmons - Too Good of a Mother',
+ u'description': u'After a certain point, breastfeeding becomes c**kblocking.',
+ },
+ }
+
+ def _real_extract(self, url):
+ title = re.match(self._VALID_URL, url).group('title')
+ webpage = self._download_webpage(url, title)
+ # the page carries the mgid consumed by _get_videos_info in a data-mgid attribute
+ mgid = self._search_regex(r'data-mgid="(?P<mgid>mgid:.*?)"', webpage, u'mgid')
+ return self._get_videos_info(mgid)
+
+
+class ComedyCentralShowsIE(InfoExtractor):
IE_DESC = u'The Daily Show / Colbert Report'
# urls can be abbreviations like :thedailyshow or :colbert
# urls for episodes like:
@@ -127,13 +151,12 @@ class ComedyCentralIE(InfoExtractor):
uri = mMovieParams[0][1]
indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
- indexXml = self._download_webpage(indexUrl, epTitle,
+ idoc = self._download_xml(indexUrl, epTitle,
u'Downloading show index',
u'unable to download episode index')
results = []
- idoc = xml.etree.ElementTree.fromstring(indexXml)
itemEls = idoc.findall('.//item')
for partNum,itemEl in enumerate(itemEls):
mediaId = itemEl.findall('./guid')[0].text
@@ -144,10 +167,9 @@ class ComedyCentralIE(InfoExtractor):
configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
compat_urllib_parse.urlencode({'uri': mediaId}))
- configXml = self._download_webpage(configUrl, epTitle,
+ cdoc = self._download_xml(configUrl, epTitle,
u'Downloading configuration for %s' % shortMediaId)
- cdoc = xml.etree.ElementTree.fromstring(configXml)
turls = []
for rendition in cdoc.findall('.//rendition'):
finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
@@ -169,7 +191,7 @@ class ComedyCentralIE(InfoExtractor):
})
effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
- info = {
+ results.append({
'id': shortMediaId,
'formats': formats,
'uploader': showId,
@@ -177,11 +199,6 @@ class ComedyCentralIE(InfoExtractor):
'title': effTitle,
'thumbnail': None,
'description': compat_str(officialTitle),
- }
-
- # TODO: Remove when #980 has been merged
- info.update(info['formats'][-1])
-
- results.append(info)
+ })
return results
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index cef4dce85..ba46a7bc7 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -4,11 +4,11 @@ import re
import socket
import sys
import netrc
+import xml.etree.ElementTree
from ..utils import (
compat_http_client,
compat_urllib_error,
- compat_urllib_request,
compat_str,
clean_html,
@@ -18,6 +18,8 @@ from ..utils import (
sanitize_filename,
unescapeHTML,
)
+_NO_DEFAULT = object()
+
class InfoExtractor(object):
"""Information Extractor class.
@@ -33,15 +35,39 @@ class InfoExtractor(object):
The dictionaries must include the following fields:
id: Video identifier.
- url: Final video URL.
title: Video title, unescaped.
- ext: Video filename extension.
- Instead of url and ext, formats can also specified.
+ Additionally, it must contain either a formats entry or url and ext:
+
+ formats: A list of dictionaries for each format available, it must
+ be ordered from worst to best quality. Potential fields:
+ * url Mandatory. The URL of the video file
+ * ext Will be calculated from url if missing
+ * format A human-readable description of the format
+ ("mp4 container with h264/opus").
+ Calculated from the format_id, width, height.
+ and format_note fields if missing.
+ * format_id A short description of the format
+ ("mp4_h264_opus" or "19")
+ * format_note Additional info about the format
+ ("3D" or "DASH video")
+ * width Width of the video, if known
+ * height Height of the video, if known
+ * abr Average audio bitrate in KBit/s
+ * acodec Name of the audio codec in use
+ * vbr Average video bitrate in KBit/s
+ * vcodec Name of the video codec in use
+ * filesize The number of bytes, if known in advance
+ * player_url SWF Player URL (used for rtmpdump).
+ url: Final video URL.
+ ext: Video filename extension.
+ format: The video format, defaults to ext (used for --get-format)
+ player_url: SWF Player URL (used for rtmpdump).
+ urlhandle: [internal] The urlHandle to be used to download the file,
+ like returned by urllib.request.urlopen
The following fields are optional:
- format: The video format, defaults to ext (used for --get-format)
thumbnails: A list of dictionaries (with the entries "resolution" and
"url") for the varying thumbnails
thumbnail: Full URL to a video thumbnail image.
@@ -50,27 +76,17 @@ class InfoExtractor(object):
upload_date: Video upload date (YYYYMMDD).
uploader_id: Nickname or id of the video uploader.
location: Physical location of the video.
- player_url: SWF Player URL (used for rtmpdump).
subtitles: The subtitle file contents as a dictionary in the format
{language: subtitles}.
+ duration: Length of the video in seconds, as an integer.
view_count: How many users have watched the video on the platform.
- urlhandle: [internal] The urlHandle to be used to download the file,
- like returned by urllib.request.urlopen
+ like_count: Number of positive ratings of the video
+ dislike_count: Number of negative ratings of the video
+ comment_count: Number of comments on the video
age_limit: Age restriction for the video, as an integer (years)
- formats: A list of dictionaries for each format available, it must
- be ordered from worst to best quality. Potential fields:
- * url Mandatory. The URL of the video file
- * ext Will be calculated from url if missing
- * format A human-readable description of the format
- ("mp4 container with h264/opus").
- Calculated from the format_id, width, height.
- and format_note fields if missing.
- * format_id A short description of the format
- ("mp4_h264_opus" or "19")
- * format_note Additional info about the format
- ("3D" or "DASH video")
- * width Width of the video, if known
- * height Height of the video, if known
+ webpage_url: The url to the video webpage, if given to youtube-dl it
+ should allow to get the same result again. (It will be set
+ by YoutubeDL if it's missing)
Unless mentioned otherwise, the fields should be Unicode strings.
@@ -142,27 +158,40 @@ class InfoExtractor(object):
def IE_NAME(self):
return type(self).__name__[:-2]
- def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
+ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
""" Returns the response handle """
if note is None:
self.report_download_webpage(video_id)
elif note is not False:
- self.to_screen(u'%s: %s' % (video_id, note))
+ if video_id is None:
+ self.to_screen(u'%s' % (note,))
+ else:
+ self.to_screen(u'%s: %s' % (video_id, note))
try:
- return compat_urllib_request.urlopen(url_or_request)
+ return self._downloader.urlopen(url_or_request)
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+ if errnote is False:
+ return False
if errnote is None:
errnote = u'Unable to download webpage'
- raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)
+ errmsg = u'%s: %s' % (errnote, compat_str(err))
+ if fatal:
+ raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
+ else:
+ self._downloader.report_warning(errmsg)
+ return False
- def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
+ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
""" Returns a tuple (page content as string, URL handle) """
# Strip hashes from the URL (#1038)
if isinstance(url_or_request, (compat_str, str)):
url_or_request = url_or_request.partition('#')[0]
- urlh = self._request_webpage(url_or_request, video_id, note, errnote)
+ urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
+ if urlh is False:
+ assert not fatal
+ return False
content_type = urlh.headers.get('Content-Type', '')
webpage_bytes = urlh.read()
m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
@@ -197,9 +226,23 @@ class InfoExtractor(object):
content = webpage_bytes.decode(encoding, 'replace')
return (content, urlh)
- def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
+ def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
""" Returns the data of the page as a string """
- return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
+ res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
+ if res is False:
+ return res
+ else:
+ content, _ = res
+ return content
+
+ def _download_xml(self, url_or_request, video_id,
+ note=u'Downloading XML', errnote=u'Unable to download XML',
+ transform_source=None):
+ """Return the xml as an xml.etree.ElementTree.Element"""
+ xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
+ if transform_source:
+ xml_string = transform_source(xml_string)
+ return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
def to_screen(self, msg):
"""Print msg to screen, prefixing it with '[ie_name]'"""
@@ -222,14 +265,18 @@ class InfoExtractor(object):
self.to_screen(u'Logging in')
#Methods for following #608
- def url_result(self, url, ie=None):
+ @staticmethod
+ def url_result(url, ie=None, video_id=None):
"""Returns a url that points to a page that should be processed"""
#TODO: ie should be the class used for getting the info
video_info = {'_type': 'url',
'url': url,
'ie_key': ie}
+ if video_id is not None:
+ video_info['id'] = video_id
return video_info
- def playlist_result(self, entries, playlist_id=None, playlist_title=None):
+ @staticmethod
+ def playlist_result(entries, playlist_id=None, playlist_title=None):
"""Returns a playlist"""
video_info = {'_type': 'playlist',
'entries': entries}
@@ -239,7 +286,7 @@ class InfoExtractor(object):
video_info['title'] = playlist_title
return video_info
- def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
+ def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
"""
Perform a regex search on the given string, using a single or a list of
patterns returning the first matching group.
@@ -253,7 +300,7 @@ class InfoExtractor(object):
mobj = re.search(p, string, flags)
if mobj: break
- if sys.stderr.isatty() and os.name != 'nt':
+ if os.name != 'nt' and sys.stderr.isatty():
_name = u'\033[0;34m%s\033[0m' % name
else:
_name = name
@@ -261,7 +308,7 @@ class InfoExtractor(object):
if mobj:
# return the first matching group
return next(g for g in mobj.groups() if g is not None)
- elif default is not None:
+ elif default is not _NO_DEFAULT:
return default
elif fatal:
raise RegexNotFoundError(u'Unable to extract %s' % _name)
@@ -270,7 +317,7 @@ class InfoExtractor(object):
u'please report this issue on http://yt-dl.org/bug' % _name)
return None
- def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
+ def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
"""
Like _search_regex, but strips HTML tags and unescapes entities.
"""
@@ -312,13 +359,21 @@ class InfoExtractor(object):
# Helper functions for extracting OpenGraph info
@staticmethod
- def _og_regex(prop):
- return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)
+ def _og_regexes(prop):
+ content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
+ property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
+ template = r'<meta[^>]+?%s[^>]+?%s'
+ return [
+ template % (property_re, content_re),
+ template % (content_re, property_re),
+ ]
def _og_search_property(self, prop, html, name=None, **kargs):
if name is None:
name = 'OpenGraph %s' % prop
- escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
+ escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
+ if escaped is None:
+ return None
return unescapeHTML(escaped)
def _og_search_thumbnail(self, html, **kargs):
@@ -331,10 +386,22 @@ class InfoExtractor(object):
return self._og_search_property('title', html, **kargs)
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
- regexes = [self._og_regex('video')]
- if secure: regexes.insert(0, self._og_regex('video:secure_url'))
+ regexes = self._og_regexes('video')
+ if secure: regexes = self._og_regexes('video:secure_url') + regexes
return self._html_search_regex(regexes, html, name, **kargs)
+ def _html_search_meta(self, name, html, display_name=None):
+ if display_name is None:
+ display_name = name
+ return self._html_search_regex(
+ r'''(?ix)<meta
+ (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
+ [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
+ html, display_name, fatal=False)
+
+ def _dc_search_uploader(self, html):
+ return self._html_search_meta('dc.creator', html, 'uploader')
+
def _rta_search(self, html):
# See http://www.rtalabel.org/index.php?content=howtofaq#single
if re.search(r'(?ix)<meta\s+name="rating"\s+'
@@ -343,6 +410,23 @@ class InfoExtractor(object):
return 18
return 0
+ def _media_rating_search(self, html):
+ # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
+ rating = self._html_search_meta('rating', html)
+
+ if not rating:
+ return None
+
+ RATING_TABLE = {
+ 'safe for kids': 0,
+ 'general': 8,
+ '14 years': 14,
+ 'mature': 17,
+ 'restricted': 19,
+ }
+ return RATING_TABLE.get(rating.lower(), None)
+
+
class SearchInfoExtractor(InfoExtractor):
"""
diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py
index 7bf03c584..d5730684d 100644
--- a/youtube_dl/extractor/cspan.py
+++ b/youtube_dl/extractor/cspan.py
@@ -6,7 +6,7 @@ from ..utils import (
)
class CSpanIE(InfoExtractor):
- _VALID_URL = r'http://www.c-spanvideo.org/program/(.*)'
+ _VALID_URL = r'http://www\.c-spanvideo\.org/program/(.*)'
_TEST = {
u'url': u'http://www.c-spanvideo.org/program/HolderonV',
u'file': u'315139.flv',
diff --git a/youtube_dl/extractor/d8.py b/youtube_dl/extractor/d8.py
new file mode 100644
index 000000000..a56842b16
--- /dev/null
+++ b/youtube_dl/extractor/d8.py
@@ -0,0 +1,22 @@
+# encoding: utf-8
+from .canalplus import CanalplusIE
+
+
+class D8IE(CanalplusIE):
+ _VALID_URL = r'https?://www\.d8\.tv/.*?/(?P<path>.*)'
+ _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/d8/%s'
+ IE_NAME = u'd8.tv'
+
+ _TEST = {
+ u'url': u'http://www.d8.tv/d8-docs-mags/pid6589-d8-campagne-intime.html',
+ u'file': u'966289.flv',
+ u'info_dict': {
+ u'title': u'Campagne intime - Documentaire exceptionnel',
+ u'description': u'md5:d2643b799fb190846ae09c61e59a859f',
+ u'upload_date': u'20131108',
+ },
+ u'params': {
+ # rtmp
+ u'skip_download': True,
+ },
+ }
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 355b4ed0a..6685c94a3 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -11,6 +11,7 @@ from ..utils import (
get_element_by_attribute,
get_element_by_id,
orderedSet,
+ str_to_int,
ExtractorError,
)
@@ -27,7 +28,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor):
class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
"""Information Extractor for Dailymotion"""
- _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)'
+ _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P<id>[^/?_]+)'
IE_NAME = u'dailymotion'
_FORMATS = [
@@ -80,7 +81,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
# Extract id and simplified title from URL
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1).split('_')[0].split('?')[0]
+ video_id = mobj.group('id')
url = 'http://www.dailymotion.com/video/%s' % video_id
@@ -100,10 +101,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
self.to_screen(u'Vevo video detected: %s' % vevo_id)
return self.url_result(u'vevo:%s' % vevo_id, ie='Vevo')
- video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
- # Looking for official user
- r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
- webpage, 'video uploader', fatal=False)
age_limit = self._rta_search(webpage)
video_upload_date = None
@@ -141,23 +138,29 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
raise ExtractorError(u'Unable to extract video URL')
# subtitles
- video_subtitles = self.extract_subtitles(video_id)
+ video_subtitles = self.extract_subtitles(video_id, webpage)
if self._downloader.params.get('listsubtitles', False):
- self._list_available_subtitles(video_id)
+ self._list_available_subtitles(video_id, webpage)
return
+ view_count = self._search_regex(
+ r'video_views_count[^>]+>\s+([\d\.,]+)', webpage, u'view count', fatal=False)
+ if view_count is not None:
+ view_count = str_to_int(view_count)
+
return {
'id': video_id,
'formats': formats,
- 'uploader': video_uploader,
+ 'uploader': info['owner_screenname'],
'upload_date': video_upload_date,
'title': self._og_search_title(webpage),
'subtitles': video_subtitles,
'thumbnail': info['thumbnail_url'],
'age_limit': age_limit,
+ 'view_count': view_count,
}
- def _get_available_subtitles(self, video_id):
+ def _get_available_subtitles(self, video_id, webpage):
try:
sub_list = self._download_webpage(
'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id,
@@ -186,7 +189,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
webpage = self._download_webpage(request,
id, u'Downloading page %s' % pagenum)
- playlist_el = get_element_by_attribute(u'class', u'video_list', webpage)
+ playlist_el = get_element_by_attribute(u'class', u'row video_list', webpage)
video_ids.extend(re.findall(r'data-id="(.+?)"', playlist_el))
if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py
index a804e83bd..4876ecb48 100644
--- a/youtube_dl/extractor/daum.py
+++ b/youtube_dl/extractor/daum.py
@@ -1,6 +1,5 @@
# encoding: utf-8
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@@ -10,7 +9,7 @@ from ..utils import (
class DaumIE(InfoExtractor):
- _VALID_URL = r'https?://tvpot\.daum\.net/.*?clipid=(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/.*?clipid=(?P<id>\d+)'
IE_NAME = u'daum.net'
_TEST = {
@@ -29,17 +28,16 @@ class DaumIE(InfoExtractor):
video_id = mobj.group(1)
canonical_url = 'http://tvpot.daum.net/v/%s' % video_id
webpage = self._download_webpage(canonical_url, video_id)
- full_id = self._search_regex(r'<link rel="video_src" href=".+?vid=(.+?)"',
+ full_id = self._search_regex(
+ r'<iframe src="http://videofarm.daum.net/controller/video/viewer/Video.html\?.*?vid=(.+?)[&"]',
webpage, u'full id')
query = compat_urllib_parse.urlencode({'vid': full_id})
- info_xml = self._download_webpage(
+ info = self._download_xml(
'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id,
u'Downloading video info')
- urls_xml = self._download_webpage(
+ urls = self._download_xml(
'http://videofarm.daum.net/controller/api/open/v1_2/MovieData.apixml?' + query,
video_id, u'Downloading video formats info')
- info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
- urls = xml.etree.ElementTree.fromstring(urls_xml.encode('utf-8'))
self.to_screen(u'%s: Getting video urls' % video_id)
formats = []
@@ -49,10 +47,9 @@ class DaumIE(InfoExtractor):
'vid': full_id,
'profile': profile,
})
- url_xml = self._download_webpage(
+ url_doc = self._download_xml(
'http://videofarm.daum.net/controller/api/open/v1_2/MovieLocation.apixml?' + format_query,
video_id, note=False)
- url_doc = xml.etree.ElementTree.fromstring(url_xml.encode('utf-8'))
format_url = url_doc.find('result/url').text
formats.append({
'url': format_url,
@@ -60,7 +57,7 @@ class DaumIE(InfoExtractor):
'format_id': profile,
})
- info = {
+ return {
'id': video_id,
'title': info.find('TITLE').text,
'formats': formats,
@@ -69,6 +66,3 @@ class DaumIE(InfoExtractor):
'duration': int(info.find('DURATION').text),
'upload_date': info.find('REGDTTM').text[:8],
}
- # TODO: Remove when #980 has been merged
- info.update(formats[-1])
- return info
diff --git a/youtube_dl/extractor/depositfiles.py b/youtube_dl/extractor/depositfiles.py
index d43348955..2c9fb5f2e 100644
--- a/youtube_dl/extractor/depositfiles.py
+++ b/youtube_dl/extractor/depositfiles.py
@@ -25,7 +25,7 @@ class DepositFilesIE(InfoExtractor):
url = 'http://depositfiles.com/en/files/' + file_id
# Retrieve file webpage with 'Free download' button pressed
- free_download_indication = { 'gateway_result' : '1' }
+ free_download_indication = {'gateway_result' : '1'}
request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
try:
self.report_download_webpage(file_id)
diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py
index 765cb1f37..cb7226f82 100644
--- a/youtube_dl/extractor/dreisat.py
+++ b/youtube_dl/extractor/dreisat.py
@@ -1,7 +1,6 @@
# coding: utf-8
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@@ -12,7 +11,7 @@ from ..utils import (
class DreiSatIE(InfoExtractor):
IE_NAME = '3sat'
- _VALID_URL = r'(?:http://)?(?:www\.)?3sat.de/mediathek/index.php\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
+ _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/index\.php\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
_TEST = {
u"url": u"http://www.3sat.de/mediathek/index.php?obj=36983",
u'file': u'36983.webm',
@@ -30,8 +29,7 @@ class DreiSatIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
- details_xml = self._download_webpage(details_url, video_id, note=u'Downloading video details')
- details_doc = xml.etree.ElementTree.fromstring(details_xml.encode('utf-8'))
+ details_doc = self._download_xml(details_url, video_id, note=u'Downloading video details')
thumbnail_els = details_doc.findall('.//teaserimage')
thumbnails = [{
@@ -67,7 +65,7 @@ class DreiSatIE(InfoExtractor):
return (qidx, prefer_http, format['video_bitrate'])
formats.sort(key=_sortkey)
- info = {
+ return {
'_type': 'video',
'id': video_id,
'title': video_title,
@@ -78,8 +76,3 @@ class DreiSatIE(InfoExtractor):
'uploader': video_uploader,
'upload_date': upload_date,
}
-
- # TODO: Remove when #980 has been merged
- info.update(formats[-1])
-
- return info
diff --git a/youtube_dl/extractor/ebaumsworld.py b/youtube_dl/extractor/ebaumsworld.py
index f02c6998b..877113d63 100644
--- a/youtube_dl/extractor/ebaumsworld.py
+++ b/youtube_dl/extractor/ebaumsworld.py
@@ -1,5 +1,4 @@
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import determine_ext
@@ -21,9 +20,8 @@ class EbaumsWorldIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- config_xml = self._download_webpage(
+ config = self._download_xml(
'http://www.ebaumsworld.com/video/player/%s' % video_id, video_id)
- config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8'))
video_url = config.find('file').text
return {
diff --git a/youtube_dl/extractor/eighttracks.py b/youtube_dl/extractor/eighttracks.py
index 2cfbcd363..88f5526b8 100644
--- a/youtube_dl/extractor/eighttracks.py
+++ b/youtube_dl/extractor/eighttracks.py
@@ -1,4 +1,3 @@
-import itertools
import json
import random
import re
@@ -11,7 +10,7 @@ from ..utils import (
class EightTracksIE(InfoExtractor):
IE_NAME = '8tracks'
- _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
+ _VALID_URL = r'https?://8tracks\.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
_TEST = {
u"name": u"EightTracks",
u"url": u"http://8tracks.com/ytdl/youtube-dl-test-tracks-a",
diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py
new file mode 100644
index 000000000..4ba323148
--- /dev/null
+++ b/youtube_dl/extractor/eitb.py
@@ -0,0 +1,37 @@
+# encoding: utf-8
+import re
+
+from .common import InfoExtractor
+from .brightcove import BrightcoveIE
+from ..utils import ExtractorError
+
+
+class EitbIE(InfoExtractor):
+ IE_NAME = u'eitb.tv'
+ _VALID_URL = r'https?://www\.eitb\.tv/(eu/bideoa|es/video)/[^/]+/(?P<playlist_id>\d+)/(?P<chapter_id>\d+)'
+
+ _TEST = {
+ u'add_ie': ['Brightcove'],
+ u'url': u'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/2677100210001/2743577154001/lasa-y-zabala-30-anos/',
+ u'md5': u'edf4436247185adee3ea18ce64c47998',
+ u'info_dict': {
+ u'id': u'2743577154001',
+ u'ext': u'mp4',
+ u'title': u'60 minutos (Lasa y Zabala, 30 años)',
+ # All videos from eitb has this description in the brightcove info
+ u'description': u'.',
+ u'uploader': u'Euskal Telebista',
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ chapter_id = mobj.group('chapter_id')
+ webpage = self._download_webpage(url, chapter_id)
+ bc_url = BrightcoveIE._extract_brightcove_url(webpage)
+ if bc_url is None:
+ raise ExtractorError(u'Could not extract the Brightcove url')
+ # The BrightcoveExperience object doesn't contain the video id, we set
+ # it manually
+ bc_url += '&%40videoPlayer={0}'.format(chapter_id)
+ return self.url_result(bc_url, BrightcoveIE.ie_key())
diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py
index 3aa2da52c..b1242f6bc 100644
--- a/youtube_dl/extractor/escapist.py
+++ b/youtube_dl/extractor/escapist.py
@@ -11,11 +11,11 @@ from ..utils import (
class EscapistIE(InfoExtractor):
- _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
+ _VALID_URL = r'^https?://(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
_TEST = {
u'url': u'http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate',
u'file': u'6618-Breaking-Down-Baldurs-Gate.mp4',
- u'md5': u'c6793dbda81388f4264c1ba18684a74d',
+ u'md5': u'ab3a706c681efca53f0a35f1415cf0d1',
u'info_dict': {
u"description": u"Baldur's Gate: Original, Modded or Enhanced Edition? I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.",
u"uploader": u"the-escapist-presents",
@@ -25,50 +25,60 @@ class EscapistIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
showName = mobj.group('showname')
videoId = mobj.group('episode')
self.report_extraction(videoId)
webpage = self._download_webpage(url, videoId)
- videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
+ videoDesc = self._html_search_regex(
+ r'<meta name="description" content="([^"]*)"',
webpage, u'description', fatal=False)
- playerUrl = self._og_search_video_url(webpage, name='player url')
+ playerUrl = self._og_search_video_url(webpage, name=u'player URL')
- title = self._html_search_regex('<meta name="title" content="([^"]*)"',
- webpage, u'player url').split(' : ')[-1]
+ title = self._html_search_regex(
+ r'<meta name="title" content="([^"]*)"',
+ webpage, u'title').split(' : ')[-1]
- configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
+ configUrl = self._search_regex('config=(.*)$', playerUrl, u'config URL')
configUrl = compat_urllib_parse.unquote(configUrl)
- configJSON = self._download_webpage(configUrl, videoId,
- u'Downloading configuration',
- u'unable to download configuration')
-
- # Technically, it's JavaScript, not JSON
- configJSON = configJSON.replace("'", '"')
-
+ formats = []
+
+ def _add_format(name, cfgurl):
+ configJSON = self._download_webpage(
+ cfgurl, videoId,
+ u'Downloading ' + name + ' configuration',
+ u'Unable to download ' + name + ' configuration')
+
+ # Technically, it's JavaScript, not JSON
+ configJSON = configJSON.replace("'", '"')
+
+ try:
+ config = json.loads(configJSON)
+ except (ValueError,) as err:
+ raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
+ playlist = config['playlist']
+ formats.append({
+ 'url': playlist[1]['url'],
+ 'format_id': name,
+ })
+
+ _add_format(u'normal', configUrl)
+ hq_url = (configUrl +
+ ('&hq=1' if '?' in configUrl else '?hq=1')) # append the query flag exactly once
try:
- config = json.loads(configJSON)
- except (ValueError,) as err:
- raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
+ _add_format(u'hq', hq_url)
+ except ExtractorError:
+ pass # That's fine, we'll just use normal quality
- playlist = config['playlist']
- videoUrl = playlist[1]['url']
-
- info = {
+ return {
'id': videoId,
- 'url': videoUrl,
+ 'formats': formats,
'uploader': showName,
- 'upload_date': None,
'title': title,
- 'ext': 'mp4',
'thumbnail': self._og_search_thumbnail(webpage),
'description': videoDesc,
'player_url': playerUrl,
}
-
- return [info]
diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py
index c74556579..682901d16 100644
--- a/youtube_dl/extractor/exfm.py
+++ b/youtube_dl/extractor/exfm.py
@@ -8,7 +8,7 @@ class ExfmIE(InfoExtractor):
IE_NAME = u'exfm'
IE_DESC = u'ex.fm'
_VALID_URL = r'(?:http://)?(?:www\.)?ex\.fm/song/([^/]+)'
- _SOUNDCLOUD_URL = r'(?:http://)?(?:www\.)?api\.soundcloud.com/tracks/([^/]+)/stream'
+ _SOUNDCLOUD_URL = r'(?:http://)?(?:www\.)?api\.soundcloud\.com/tracks/([^/]+)/stream'
_TESTS = [
{
u'url': u'http://ex.fm/song/eh359',
@@ -21,6 +21,7 @@ class ExfmIE(InfoExtractor):
u'description': u'Test House \"Love Is Not Enough\" (Extended Mix) DeadJournalist Exclusive',
},
u'note': u'Soundcloud song',
+ u'skip': u'The site is down too often',
},
{
u'url': u'http://ex.fm/song/wddt8',
@@ -30,6 +31,7 @@ class ExfmIE(InfoExtractor):
u'title': u'Safe and Sound',
u'uploader': u'Capital Cities',
},
+ u'skip': u'The site is down too often',
},
]
diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py
new file mode 100644
index 000000000..1c20e4364
--- /dev/null
+++ b/youtube_dl/extractor/extremetube.py
@@ -0,0 +1,50 @@
+import os
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse_urlparse,
+ compat_urllib_request,
+ compat_urllib_parse,
+)
+
+class ExtremeTubeIE(InfoExtractor):
+ _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>extremetube\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)'
+ _TEST = {
+ u'url': u'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431',
+ u'file': u'652431.mp4',
+ u'md5': u'1fb9228f5e3332ec8c057d6ac36f33e0',
+ u'info_dict': {
+ u"title": u"Music Video 14 british euro brit european cumshots swallow",
+ u"uploader": u"unknown",
+ u"age_limit": 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('videoid')
+ url = 'http://www.' + mobj.group('url')
+
+ req = compat_urllib_request.Request(url)
+ req.add_header('Cookie', 'age_verified=1')
+ webpage = self._download_webpage(req, video_id)
+
+ video_title = self._html_search_regex(r'<h1 [^>]*?title="([^"]+)"[^>]*>\1<', webpage, u'title')
+ uploader = self._html_search_regex(r'>Posted by:(?=<)(?:\s|<[^>]*>)*(.+?)\|', webpage, u'uploader', fatal=False)
+ video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&amp;', webpage, u'video_url'))
+ path = compat_urllib_parse_urlparse(video_url).path
+ extension = os.path.splitext(path)[1][1:]
+ format = path.split('/')[5].split('_')[:2]
+ format = "-".join(format)
+
+ return {
+ 'id': video_id,
+ 'title': video_title,
+ 'uploader': uploader,
+ 'url': video_url,
+ 'ext': extension,
+ 'format': format,
+ 'format_id': format,
+ 'age_limit': 18,
+ }
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index f8bdfc2d3..4556079c8 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -1,5 +1,4 @@
import json
-import netrc
import re
import socket
@@ -18,7 +17,7 @@ from ..utils import (
class FacebookIE(InfoExtractor):
"""Information Extractor for Facebook"""
- _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
+ _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:[^#?]*#!/)?(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
_LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
_CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
_NETRC_MACHINE = 'facebook'
@@ -28,7 +27,7 @@ class FacebookIE(InfoExtractor):
u'file': u'120708114770723.mp4',
u'md5': u'48975a41ccc4b7a581abd68651c1a5a8',
u'info_dict': {
- u"duration": 279,
+ u"duration": 279,
u"title": u"PEOPLE ARE AWESOME 2013"
}
}
diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py
index 89ed08db4..c6ab6952e 100644
--- a/youtube_dl/extractor/faz.py
+++ b/youtube_dl/extractor/faz.py
@@ -1,6 +1,5 @@
# encoding: utf-8
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@@ -10,7 +9,7 @@ from ..utils import (
class FazIE(InfoExtractor):
IE_NAME = u'faz.net'
- _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+).html'
+ _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+)\.html'
_TEST = {
u'url': u'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html',
@@ -28,9 +27,8 @@ class FazIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
config_xml_url = self._search_regex(r'writeFLV\(\'(.+?)\',', webpage,
u'config xml url')
- config_xml = self._download_webpage(config_xml_url, video_id,
+ config = self._download_xml(config_xml_url, video_id,
u'Downloading config xml')
- config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8'))
encodings = config.find('ENCODINGS')
formats = []
@@ -46,13 +44,10 @@ class FazIE(InfoExtractor):
})
descr = self._html_search_regex(r'<p class="Content Copy">(.*?)</p>', webpage, u'description')
- info = {
+ return {
'id': video_id,
'title': self._og_search_title(webpage),
'formats': formats,
'description': descr,
'thumbnail': config.find('STILL/STILL_BIG').text,
}
- # TODO: Remove when #980 has been merged
- info.update(formats[-1])
- return info
diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py
index 9c89362ef..d7048c8c1 100644
--- a/youtube_dl/extractor/fktv.py
+++ b/youtube_dl/extractor/fktv.py
@@ -12,7 +12,7 @@ from ..utils import (
class FKTVIE(InfoExtractor):
IE_NAME = u'fernsehkritik.tv'
- _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik.tv/folge-(?P<ep>[0-9]+)(?:/.*)?'
+ _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik\.tv/folge-(?P<ep>[0-9]+)(?:/.*)?'
_TEST = {
u'url': u'http://fernsehkritik.tv/folge-1',
@@ -39,7 +39,6 @@ class FKTVIE(InfoExtractor):
for i, _ in enumerate(files, 1):
video_id = '%04d%d' % (episode, i)
video_url = 'http://dl%d.fernsehkritik.tv/fernsehkritik%d%s.flv' % (server, episode, '' if i == 1 else '-%d' % i)
- video_title = 'Fernsehkritik %d.%d' % (episode, i)
videos.append({
'id': video_id,
'url': video_url,
@@ -53,7 +52,7 @@ class FKTVIE(InfoExtractor):
class FKTVPosteckeIE(InfoExtractor):
IE_NAME = u'fernsehkritik.tv:postecke'
- _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik.tv/inline-video/postecke.php\?(.*&)?ep=(?P<ep>[0-9]+)(&|$)'
+ _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik\.tv/inline-video/postecke\.php\?(.*&)?ep=(?P<ep>[0-9]+)(&|$)'
_TEST = {
u'url': u'http://fernsehkritik.tv/inline-video/postecke.php?iframe=true&width=625&height=440&ep=120',
u'file': u'0120.flv',
diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py
index 086cafca0..ad85bc16d 100644
--- a/youtube_dl/extractor/francetv.py
+++ b/youtube_dl/extractor/francetv.py
@@ -1,6 +1,5 @@
# encoding: utf-8
import re
-import xml.etree.ElementTree
import json
from .common import InfoExtractor
@@ -11,11 +10,10 @@ from ..utils import (
class FranceTVBaseInfoExtractor(InfoExtractor):
def _extract_video(self, video_id):
- xml_desc = self._download_webpage(
+ info = self._download_xml(
'http://www.francetvinfo.fr/appftv/webservices/video/'
'getInfosOeuvre.php?id-diffusion='
+ video_id, video_id, 'Downloading XML config')
- info = xml.etree.ElementTree.fromstring(xml_desc.encode('utf-8'))
manifest_url = info.find('videos/video/url').text
video_url = manifest_url.replace('manifest.f4m', 'index_2_av.m3u8')
@@ -23,7 +21,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
thumbnail_path = info.find('image').text
return {'id': video_id,
- 'ext': 'mp4',
+ 'ext': 'flv' if video_url.startswith('rtmp') else 'mp4',
'url': video_url,
'title': info.find('titre').text,
'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', thumbnail_path),
@@ -47,7 +45,7 @@ class PluzzIE(FranceTVBaseInfoExtractor):
class FranceTvInfoIE(FranceTVBaseInfoExtractor):
IE_NAME = u'francetvinfo.fr'
- _VALID_URL = r'https?://www\.francetvinfo\.fr/replay.*/(?P<title>.+).html'
+ _VALID_URL = r'https?://www\.francetvinfo\.fr/replay.*/(?P<title>.+)\.html'
_TEST = {
u'url': u'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html',
@@ -68,35 +66,101 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
return self._extract_video(video_id)
-class France2IE(FranceTVBaseInfoExtractor):
- IE_NAME = u'france2.fr'
- _VALID_URL = r'''(?x)https?://www\.france2\.fr/
+class FranceTVIE(FranceTVBaseInfoExtractor):
+ IE_NAME = u'francetv'
+ IE_DESC = u'France 2, 3, 4, 5 and Ô'
+ _VALID_URL = r'''(?x)https?://www\.france[2345o]\.fr/
(?:
- emissions/.*?/videos/(?P<id>\d+)
- | emission/(?P<key>[^/?]+)
+ emissions/.*?/(videos|emissions)/(?P<id>[^/?]+)
+ | (emissions?|jt)/(?P<key>[^/?]+)
)'''
- _TEST = {
- u'url': u'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104',
- u'file': u'75540104.mp4',
- u'info_dict': {
- u'title': u'13h15, le samedi...',
- u'description': u'md5:2e5b58ba7a2d3692b35c792be081a03d',
+ _TESTS = [
+ # france2
+ {
+ u'url': u'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104',
+ u'file': u'75540104.mp4',
+ u'info_dict': {
+ u'title': u'13h15, le samedi...',
+ u'description': u'md5:2e5b58ba7a2d3692b35c792be081a03d',
+ },
+ u'params': {
+ # m3u8 download
+ u'skip_download': True,
+ },
},
- u'params': {
- u'skip_download': True,
+ # france3
+ {
+ u'url': u'http://www.france3.fr/emissions/pieces-a-conviction/diffusions/13-11-2013_145575',
+ u'info_dict': {
+ u'id': u'000702326_CAPP_PicesconvictionExtrait313022013_120220131722_Au',
+ u'ext': u'flv',
+ u'title': u'Le scandale du prix des médicaments',
+ u'description': u'md5:1384089fbee2f04fc6c9de025ee2e9ce',
+ },
+ u'params': {
+ # rtmp download
+ u'skip_download': True,
+ },
},
- }
+ # france4
+ {
+ u'url': u'http://www.france4.fr/emissions/hero-corp/videos/rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4',
+ u'info_dict': {
+ u'id': u'rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4',
+ u'ext': u'flv',
+ u'title': u'Hero Corp Making of - Extrait 1',
+ u'description': u'md5:c87d54871b1790679aec1197e73d650a',
+ },
+ u'params': {
+ # rtmp download
+ u'skip_download': True,
+ },
+ },
+ # france5
+ {
+ u'url': u'http://www.france5.fr/emissions/c-a-dire/videos/92837968',
+ u'info_dict': {
+ u'id': u'92837968',
+ u'ext': u'mp4',
+ u'title': u'C à dire ?!',
+ u'description': u'md5:fb1db1cbad784dcce7c7a7bd177c8e2f',
+ },
+ u'params': {
+ # m3u8 download
+ u'skip_download': True,
+ },
+ },
+ # franceo
+ {
+ u'url': u'http://www.franceo.fr/jt/info-afrique/04-12-2013',
+ u'info_dict': {
+ u'id': u'92327925',
+ u'ext': u'mp4',
+ u'title': u'Infô-Afrique',
+ u'description': u'md5:ebf346da789428841bee0fd2a935ea55',
+ },
+ u'params': {
+ # m3u8 download
+ u'skip_download': True,
+ },
+ u'skip': u'The id changes frequently',
+ },
+ ]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
if mobj.group('key'):
webpage = self._download_webpage(url, mobj.group('key'))
- video_id = self._html_search_regex(
- r'''(?x)<div\s+class="video-player">\s*
+ id_res = [
+ (r'''(?x)<div\s+class="video-player">\s*
<a\s+href="http://videos.francetv.fr/video/([0-9]+)"\s+
- class="francetv-video-player">''',
- webpage, u'video ID')
+ class="francetv-video-player">'''),
+ (r'<a id="player_direct" href="http://info\.francetelevisions'
+ '\.fr/\?id-video=([^"/&]+)'),
+ (r'<a class="video" id="ftv_player_(.+?)"'),
+ ]
+ video_id = self._html_search_regex(id_res, webpage, u'video ID')
else:
video_id = mobj.group('id')
return self._extract_video(video_id)
diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py
new file mode 100644
index 000000000..a3a5251fe
--- /dev/null
+++ b/youtube_dl/extractor/gamekings.py
@@ -0,0 +1,38 @@
+import re
+
+from .common import InfoExtractor
+
+
+class GamekingsIE(InfoExtractor):
+ _VALID_URL = r'http://www\.gamekings\.tv/videos/(?P<name>[0-9a-z\-]+)'
+ _TEST = {
+ u"url": u"http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/",
+ u'file': u'20130811.mp4',
+ # MD5 is flaky, seems to change regularly
+ #u'md5': u'2f32b1f7b80fdc5cb616efb4f387f8a3',
+ u'info_dict': {
+ u"title": u"Phoenix Wright: Ace Attorney \u2013 Dual Destinies Review",
+ u"description": u"Melle en Steven hebben voor de review een week in de rechtbank doorbracht met Phoenix Wright: Ace Attorney - Dual Destinies.",
+ }
+ }
+
+ def _real_extract(self, url):
+
+ mobj = re.match(self._VALID_URL, url)
+ name = mobj.group('name')
+ webpage = self._download_webpage(url, name)
+ video_url = self._og_search_video_url(webpage)
+
+ video = re.search(r'[0-9]+', video_url)
+ video_id = video.group(0)
+
+ # Todo: add medium format
+ video_url = video_url.replace(video_id, 'large/' + video_id)
+
+ return {
+ 'id': video_id,
+ 'ext': 'mp4',
+ 'url': video_url,
+ 'title': self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage),
+ }
diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py
index 098768361..26b7d2ae5 100644
--- a/youtube_dl/extractor/gamespot.py
+++ b/youtube_dl/extractor/gamespot.py
@@ -24,7 +24,7 @@ class GameSpotIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- page_id = video_id = mobj.group('page_id')
+ page_id = mobj.group('page_id')
webpage = self._download_webpage(url, page_id)
data_video_json = self._search_regex(r'data-video=\'(.*?)\'', webpage, u'data video')
data_video = json.loads(unescapeHTML(data_video_json))
@@ -47,13 +47,10 @@ class GameSpotIE(InfoExtractor):
'format_id': q,
})
- info = {
+ return {
'id': data_video['guid'],
'title': compat_urllib_parse.unquote(data_video['title']),
'formats': formats,
'description': get_meta_content('description', webpage),
'thumbnail': self._og_search_thumbnail(webpage),
}
- # TODO: Remove when #980 has been merged
- info.update(formats[-1])
- return info
diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py
index 3cc02d97e..d82a5d4b2 100644
--- a/youtube_dl/extractor/gametrailers.py
+++ b/youtube_dl/extractor/gametrailers.py
@@ -1,13 +1,10 @@
import re
-from .mtv import MTVIE, _media_xml_tag
+from .mtv import MTVServicesInfoExtractor
-class GametrailersIE(MTVIE):
- """
- Gametrailers use the same videos system as MTVIE, it just changes the feed
- url, where the uri is and the method to get the thumbnails.
- """
- _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
+
+class GametrailersIE(MTVServicesInfoExtractor):
+ _VALID_URL = r'http://www\.gametrailers\.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
_TEST = {
u'url': u'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer',
u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4',
@@ -17,15 +14,9 @@ class GametrailersIE(MTVIE):
u'description': u'Faith is back! Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!',
},
}
- # Overwrite MTVIE properties we don't want
- _TESTS = []
_FEED_URL = 'http://www.gametrailers.com/feeds/mrss'
- def _get_thumbnail_url(self, uri, itemdoc):
- search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
- return itemdoc.find(search_path).attrib['url']
-
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 2c8fcf5ae..bdb4f58d6 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -11,10 +11,14 @@ from ..utils import (
compat_urlparse,
ExtractorError,
+ HEADRequest,
smuggle_url,
unescapeHTML,
+ unified_strdate,
+ url_basename,
)
from .brightcove import BrightcoveIE
+from .ooyala import OoyalaIE
class GenericIE(InfoExtractor):
@@ -33,6 +37,7 @@ class GenericIE(InfoExtractor):
},
# embedded vimeo video
{
+ u'add_ie': ['Vimeo'],
u'url': u'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references',
u'file': u'22444065.mp4',
u'md5': u'2903896e23df39722c33f015af0666e2',
@@ -44,6 +49,7 @@ class GenericIE(InfoExtractor):
},
# bandcamp page with custom domain
{
+ u'add_ie': ['Bandcamp'],
u'url': u'http://bronyrock.com/track/the-pony-mash',
u'file': u'3235767654.mp3',
u'info_dict': {
@@ -52,6 +58,44 @@ class GenericIE(InfoExtractor):
},
u'skip': u'There is a limit of 200 free downloads / month for the test song',
},
+ # embedded brightcove video
+ # it also tests brightcove videos that need to set the 'Referer' in the
+ # http requests
+ {
+ u'add_ie': ['Brightcove'],
+ u'url': u'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
+ u'info_dict': {
+ u'id': u'2765128793001',
+ u'ext': u'mp4',
+ u'title': u'Le cours de bourse : l’analyse technique',
+ u'description': u'md5:7e9ad046e968cb2d1114004aba466fd9',
+ u'uploader': u'BFM BUSINESS',
+ },
+ u'params': {
+ u'skip_download': True,
+ },
+ },
+ # Direct link to a video
+ {
+ u'url': u'http://media.w3.org/2010/05/sintel/trailer.mp4',
+ u'file': u'trailer.mp4',
+ u'md5': u'67d406c2bcb6af27fa886f31aa934bbe',
+ u'info_dict': {
+ u'id': u'trailer',
+ u'title': u'trailer',
+ u'upload_date': u'20100513',
+ }
+ },
+ # ooyala video
+ {
+ u'url': u'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
+ u'md5': u'5644c6ca5d5782c1d0d350dad9bd840c',
+ u'info_dict': {
+ u'id': u'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
+ u'ext': u'mp4',
+ u'title': u'2cc213299525360.mov', #that's what we get
+ },
+ },
]
def report_download_webpage(self, video_id):
@@ -64,23 +108,20 @@ class GenericIE(InfoExtractor):
"""Report information extraction."""
self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
- def _test_redirect(self, url):
+ def _send_head(self, url):
"""Check if it is a redirect, like url shorteners, in case return the new url."""
- class HeadRequest(compat_urllib_request.Request):
- def get_method(self):
- return "HEAD"
class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
"""
Subclass the HTTPRedirectHandler to make it use our
- HeadRequest also on the redirected URL
+ HEADRequest also on the redirected URL
"""
def redirect_request(self, req, fp, code, msg, headers, newurl):
if code in (301, 302, 303, 307):
newurl = newurl.replace(' ', '%20')
newheaders = dict((k,v) for k,v in req.headers.items()
if k.lower() not in ("content-length", "content-type"))
- return HeadRequest(newurl,
+ return HEADRequest(newurl,
headers=newheaders,
origin_req_host=req.get_origin_req_host(),
unverifiable=True)
@@ -109,32 +150,49 @@ class GenericIE(InfoExtractor):
compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
opener.add_handler(handler())
- response = opener.open(HeadRequest(url))
+ response = opener.open(HEADRequest(url))
if response is None:
raise ExtractorError(u'Invalid URL protocol')
- new_url = response.geturl()
-
- if url == new_url:
- return False
-
- self.report_following_redirect(new_url)
- return new_url
+ return response
def _real_extract(self, url):
parsed_url = compat_urlparse.urlparse(url)
if not parsed_url.scheme:
self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
return self.url_result('http://' + url)
+ video_id = os.path.splitext(url.split('/')[-1])[0]
try:
- new_url = self._test_redirect(url)
- if new_url:
- return [self.url_result(new_url)]
+ response = self._send_head(url)
+
+ # Check for redirect
+ new_url = response.geturl()
+ if url != new_url:
+ self.report_following_redirect(new_url)
+ return self.url_result(new_url)
+
+ # Check for direct link to a video
+ content_type = response.headers.get('Content-Type', '')
+ m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
+ if m:
+ upload_date = response.headers.get('Last-Modified')
+ if upload_date:
+ upload_date = unified_strdate(upload_date)
+ return {
+ 'id': video_id,
+ 'title': os.path.splitext(url_basename(url))[0],
+ 'formats': [{
+ 'format_id': m.group('format_id'),
+ 'url': url,
+ 'vcodec': u'none' if m.group('type') == 'audio' else None
+ }],
+ 'upload_date': upload_date,
+ }
+
except compat_urllib_error.HTTPError:
# This may be a stupid server that doesn't like HEAD, our UA, or so
pass
- video_id = url.split('/')[-1]
try:
webpage = self._download_webpage(url, video_id)
except ValueError:
@@ -143,11 +201,25 @@ class GenericIE(InfoExtractor):
raise ExtractorError(u'Failed to download URL: %s' % url)
self.report_extraction(video_id)
+
+ # it's tempting to parse this further, but you would
+ # have to take into account all the variations like
+ # Video Title - Site Name
+ # Site Name | Video Title
+ # Video Title - Tagline | Site Name
+ # and so on and so forth; it's just not practical
+ video_title = self._html_search_regex(
+ r'(?s)<title>(.*?)</title>', webpage, u'video title',
+ default=u'video')
+
+ # video uploader is domain name
+ video_uploader = self._search_regex(
+ r'^(?:https?://)?([^/]*)/.*', url, u'video uploader')
+
# Look for BrightCove:
- m_brightcove = re.search(r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL)
- if m_brightcove is not None:
+ bc_url = BrightcoveIE._extract_brightcove_url(webpage)
+ if bc_url is not None:
self.to_screen(u'Brightcove video detected.')
- bc_url = BrightcoveIE._build_brighcove_url(m_brightcove.group())
return self.url_result(bc_url, 'Brightcove')
# Look for embedded Vimeo player
@@ -159,17 +231,73 @@ class GenericIE(InfoExtractor):
return self.url_result(surl, 'Vimeo')
# Look for embedded YouTube player
- mobj = re.search(
- r'<iframe[^>]+?src="(https?://(?:www\.)?youtube.com/embed/.+?)"', webpage)
+ matches = re.findall(r'''(?x)
+ (?:<iframe[^>]+?src=|embedSWF\(\s*)
+ (["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube\.com/
+ (?:embed|v)/.+?)
+ \1''', webpage)
+ if matches:
+ urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube')
+ for tuppl in matches]
+ return self.playlist_result(
+ urlrs, playlist_id=video_id, playlist_title=video_title)
+
+ # Look for embedded Dailymotion player
+ matches = re.findall(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
+ if matches:
+ urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion')
+ for tuppl in matches]
+ return self.playlist_result(
+ urlrs, playlist_id=video_id, playlist_title=video_title)
+
+ # Look for embedded Wistia player
+ match = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
+ if match:
+ return {
+ '_type': 'url_transparent',
+ 'url': unescapeHTML(match.group('url')),
+ 'ie_key': 'Wistia',
+ 'uploader': video_uploader,
+ 'title': video_title,
+ 'id': video_id,
+ }
+
+ # Look for embedded blip.tv player
+ mobj = re.search(r'<meta\s[^>]*https?://api.blip.tv/\w+/redirect/\w+/(\d+)', webpage)
+ if mobj:
+ return self.url_result('http://blip.tv/seo/-'+mobj.group(1), 'BlipTV')
+ mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*https?://(?:\w+\.)?blip.tv/(?:play/|api\.swf#)([a-zA-Z0-9]+)', webpage)
if mobj:
- surl = unescapeHTML(mobj.group(1))
- return self.url_result(surl, 'Youtube')
+ player_url = 'http://blip.tv/play/%s.x?p=1' % mobj.group(1)
+ player_page = self._download_webpage(player_url, mobj.group(1))
+ blip_video_id = self._search_regex(r'data-episode-id="(\d+)', player_page, u'blip_video_id', fatal=False)
+ if blip_video_id:
+ return self.url_result('http://blip.tv/seo/-'+blip_video_id, 'BlipTV')
# Look for Bandcamp pages with custom domain
mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
if mobj is not None:
burl = unescapeHTML(mobj.group(1))
- return self.url_result(burl, 'Bandcamp')
+ # Don't set the extractor because it can be a track url or an album
+ return self.url_result(burl)
+
+ # Look for embedded Vevo player
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'))
+
+ # Look for Ooyala videos
+ mobj = re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=([^"&]+)', webpage)
+ if mobj is not None:
+ return OoyalaIE._build_url_result(mobj.group(1))
+
+ # Look for Aparat videos
+ mobj = re.search(r'<iframe src="(http://www.aparat.com/video/[^"]+)"', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group(1), 'Aparat')
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
@@ -178,7 +306,7 @@ class GenericIE(InfoExtractor):
mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
if mobj is None:
# Broaden the search a little bit: JWPlayer JS loader
- mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http[^\'"&]*)', webpage)
+ mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http[^\'"]*)', webpage)
if mobj is None:
# Try to find twitter cards info
mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
@@ -205,27 +333,11 @@ class GenericIE(InfoExtractor):
video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
# here's a fun little line of code for you:
- video_extension = os.path.splitext(video_id)[1][1:]
video_id = os.path.splitext(video_id)[0]
- # it's tempting to parse this further, but you would
- # have to take into account all the variations like
- # Video Title - Site Name
- # Site Name | Video Title
- # Video Title - Tagline | Site Name
- # and so on and so forth; it's just not practical
- video_title = self._html_search_regex(r'<title>(.*)</title>',
- webpage, u'video title', default=u'video', flags=re.DOTALL)
-
- # video uploader is domain name
- video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
- url, u'video uploader')
-
- return [{
+ return {
'id': video_id,
'url': video_url,
'uploader': video_uploader,
- 'upload_date': None,
'title': video_title,
- 'ext': video_extension,
- }]
+ }
diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py
index 3798118a7..0ee74fb38 100644
--- a/youtube_dl/extractor/hotnewhiphop.py
+++ b/youtube_dl/extractor/hotnewhiphop.py
@@ -11,7 +11,7 @@ class HotNewHipHopIE(InfoExtractor):
u'file': u'1435540.mp3',
u'md5': u'2c2cd2f76ef11a9b3b581e8b232f3d96',
u'info_dict': {
- u"title": u"Freddie Gibbs - Lay It Down"
+ u"title": u'Freddie Gibbs "Lay It Down"'
}
}
diff --git a/youtube_dl/extractor/howcast.py b/youtube_dl/extractor/howcast.py
index 46954337f..bafc5826f 100644
--- a/youtube_dl/extractor/howcast.py
+++ b/youtube_dl/extractor/howcast.py
@@ -8,7 +8,7 @@ class HowcastIE(InfoExtractor):
_TEST = {
u'url': u'http://www.howcast.com/videos/390161-How-to-Tie-a-Square-Knot-Properly',
u'file': u'390161.mp4',
- u'md5': u'1d7ba54e2c9d7dc6935ef39e00529138',
+ u'md5': u'8b743df908c42f60cf6496586c7f12c3',
u'info_dict': {
u"description": u"The square knot, also known as the reef knot, is one of the oldest, most basic knots to tie, and can be used in many different ways. Here's the proper way to tie a square knot.",
u"title": u"How to Tie a Square Knot Properly"
diff --git a/youtube_dl/extractor/hypem.py b/youtube_dl/extractor/hypem.py
index ab2b59103..9bd06e7c7 100644
--- a/youtube_dl/extractor/hypem.py
+++ b/youtube_dl/extractor/hypem.py
@@ -30,7 +30,7 @@ class HypemIE(InfoExtractor):
raise ExtractorError(u'Invalid URL: %s' % url)
track_id = mobj.group(1)
- data = { 'ax': 1, 'ts': time.time() }
+ data = {'ax': 1, 'ts': time.time()}
data_encoded = compat_urllib_parse.urlencode(data)
complete_url = url + "?" + data_encoded
request = compat_urllib_request.Request(complete_url)
@@ -68,4 +68,4 @@ class HypemIE(InfoExtractor):
'ext': "mp3",
'title': title,
'artist': artist,
- }] \ No newline at end of file
+ }]
diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py
index c52146f7d..381af91e4 100644
--- a/youtube_dl/extractor/ign.py
+++ b/youtube_dl/extractor/ign.py
@@ -44,7 +44,7 @@ class IGNIE(InfoExtractor):
{
u'file': u'638672ee848ae4ff108df2a296418ee2.mp4',
u'info_dict': {
- u'title': u'GTA 5\'s Twisted Beauty in Super Slow Motion',
+ u'title': u'26 Twisted Moments from GTA 5 in Slow Motion',
u'description': u'The twisted beauty of GTA 5 in stunning slow motion.',
},
},
@@ -103,7 +103,7 @@ class IGNIE(InfoExtractor):
class OneUPIE(IGNIE):
"""Extractor for 1up.com, it uses the ign videos system."""
- _VALID_URL = r'https?://gamevideos.1up.com/(?P<type>video)/id/(?P<name_or_id>.+)'
+ _VALID_URL = r'https?://gamevideos\.1up\.com/(?P<type>video)/id/(?P<name_or_id>.+)'
IE_NAME = '1up.com'
_DESCRIPTION_RE = r'<div id="vid_summary">(.+?)</div>'
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
new file mode 100644
index 000000000..e5332cce8
--- /dev/null
+++ b/youtube_dl/extractor/imdb.py
@@ -0,0 +1,57 @@
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urlparse,
+ get_element_by_attribute,
+)
+
+
+class ImdbIE(InfoExtractor):
+ IE_NAME = u'imdb'
+ IE_DESC = u'Internet Movie Database trailers'
+ _VALID_URL = r'http://(?:www|m)\.imdb\.com/video/imdb/vi(?P<id>\d+)'
+
+ _TEST = {
+ u'url': u'http://www.imdb.com/video/imdb/vi2524815897',
+ u'md5': u'9f34fa777ade3a6e57a054fdbcb3a068',
+ u'info_dict': {
+ u'id': u'2524815897',
+ u'ext': u'mp4',
+ u'title': u'Ice Age: Continental Drift Trailer (No. 2) - IMDb',
+ u'description': u'md5:9061c2219254e5d14e03c25c98e96a81',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ webpage = self._download_webpage('http://www.imdb.com/video/imdb/vi%s' % video_id, video_id)
+ descr = get_element_by_attribute('itemprop', 'description', webpage)
+ available_formats = re.findall(
+ r'case \'(?P<f_id>.*?)\' :$\s+url = \'(?P<path>.*?)\'', webpage,
+ flags=re.MULTILINE)
+ formats = []
+ for f_id, f_path in available_formats:
+ f_path = f_path.strip()
+ format_page = self._download_webpage(
+ compat_urlparse.urljoin(url, f_path),
+ u'Downloading info for %s format' % f_id)
+ json_data = self._search_regex(
+ r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>',
+ format_page, u'json data', flags=re.DOTALL)
+ info = json.loads(json_data)
+ format_info = info['videoPlayerObject']['video']
+ formats.append({
+ 'format_id': f_id,
+ 'url': format_info['url'],
+ })
+
+ return {
+ 'id': video_id,
+ 'title': self._og_search_title(webpage),
+ 'formats': formats,
+ 'description': descr,
+ 'thumbnail': format_info['slate'],
+ }
diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py
index 213aac428..660573d02 100644
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@@ -3,7 +3,7 @@ import re
from .common import InfoExtractor
class InstagramIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?instagram.com/p/(.*?)/'
+ _VALID_URL = r'(?:http://)?instagram\.com/p/(.*?)/'
_TEST = {
u'url': u'http://instagram.com/p/aye83DjauH/?foo=bar#abc',
u'file': u'aye83DjauH.mp4',
diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py
index be8e05f53..16a6f73c8 100644
--- a/youtube_dl/extractor/internetvideoarchive.py
+++ b/youtube_dl/extractor/internetvideoarchive.py
@@ -1,5 +1,4 @@
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@@ -43,9 +42,8 @@ class InternetVideoArchiveIE(InfoExtractor):
video_id = query_dic['publishedid'][0]
url = self._build_url(query)
- flashconfiguration_xml = self._download_webpage(url, video_id,
+ flashconfiguration = self._download_xml(url, video_id,
u'Downloading flash configuration')
- flashconfiguration = xml.etree.ElementTree.fromstring(flashconfiguration_xml.encode('utf-8'))
file_url = flashconfiguration.find('file').text
file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx')
# Replace some of the parameters in the query to get the best quality
@@ -53,9 +51,8 @@ class InternetVideoArchiveIE(InfoExtractor):
file_url = re.sub(r'(?<=\?)(.+)$',
lambda m: self._clean_query(m.group()),
file_url)
- info_xml = self._download_webpage(file_url, video_id,
+ info = self._download_xml(file_url, video_id,
u'Downloading video info')
- info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
item = info.find('channel/item')
def _bp(p):
diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py
new file mode 100644
index 000000000..4bdf55f93
--- /dev/null
+++ b/youtube_dl/extractor/ivi.py
@@ -0,0 +1,154 @@
+# encoding: utf-8
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_request,
+ ExtractorError,
+)
+
+
+class IviIE(InfoExtractor):
+    """Extractor for single videos (movies and series episodes) on ivi.ru.
+
+    Media links and core metadata are requested from the digitalaccess.ru
+    JSON API; the description and the comment count are scraped from the
+    regular video web page.
+    """
+
+    IE_DESC = u'ivi.ru'
+    IE_NAME = u'ivi'
+    _VALID_URL = r'^https?://(?:www\.)?ivi\.ru/watch(?:/(?P<compilationid>[^/]+))?/(?P<videoid>\d+)'
+
+    _TESTS = [
+        # Single movie
+        {
+            u'url': u'http://www.ivi.ru/watch/53141',
+            u'file': u'53141.mp4',
+            u'md5': u'6ff5be2254e796ed346251d117196cf4',
+            u'info_dict': {
+                u'title': u'Иван Васильевич меняет профессию',
+                u'description': u'md5:14d8eda24e9d93d29b5857012c6d6346',
+                u'duration': 5498,
+                u'thumbnail': u'http://thumbs.ivi.ru/f20.vcp.digitalaccess.ru/contents/d/1/c3c885163a082c29bceeb7b5a267a6.jpg',
+            },
+            u'skip': u'Only works from Russia',
+        },
+        # Episode of a series
+        {
+            u'url': u'http://www.ivi.ru/watch/dezhurnyi_angel/74791',
+            u'file': u'74791.mp4',
+            u'md5': u'3e6cc9a848c1d2ebcc6476444967baa9',
+            u'info_dict': {
+                u'title': u'Дежурный ангел - 1 серия',
+                u'duration': 2490,
+                u'thumbnail': u'http://thumbs.ivi.ru/f7.vcp.digitalaccess.ru/contents/8/e/bc2f6c2b6e5d291152fdd32c059141.jpg',
+            },
+            u'skip': u'Only works from Russia',
+        }
+    ]
+
+    # Sorted by quality (worst first); formats not listed here are ignored
+    _known_formats = ['MP4-low-mobile', 'MP4-mobile', 'FLV-lo', 'MP4-lo', 'FLV-hi', 'MP4-hi', 'MP4-SHQ']
+
+    # Sorted by size (smallest first); previews not listed here are ignored
+    _known_thumbnails = ['Thumb-120x90', 'Thumb-160', 'Thumb-640x480']
+
+    def _extract_description(self, html):
+        """Return the content of the description <meta> tag, or None."""
+        m = re.search(r'<meta name="description" content="(?P<description>[^"]+)"/>', html)
+        return m.group('description') if m is not None else None
+
+    def _extract_comment_count(self, html):
+        """Return the number shown on the "view comments" button, or 0."""
+        # Not a raw literal because ur'' is a syntax error on Python 3, so
+        # the regex backslashes are doubled instead.
+        m = re.search(u'(?s)<a href="#" id="view-comments" class="action-button dim gradient">\\s*Комментарии:\\s*(?P<commentcount>\\d+)\\s*</a>', html)
+        return int(m.group('commentcount')) if m is not None else 0
+
+    def _real_extract(self, url):
+        """Build the info dict for the video addressed by *url*.
+
+        Raises ExtractorError when the API answers with an error object.
+        Emits a warning and returns None when the API lists no media link
+        in one of the known formats.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('videoid')
+
+        api_url = 'http://api.digitalaccess.ru/api/json/'
+
+        data = {
+            u'method': u'da.content.get',
+            u'params': [
+                video_id,
+                {
+                    u'site': u's183',
+                    u'referrer': u'http://www.ivi.ru/watch/%s' % video_id,
+                    u'contentid': video_id,
+                },
+            ],
+        }
+
+        # json.dumps() returns text, but a POST body has to be bytes on
+        # Python 3 (on Python 2 the encode is a no-op for this ASCII output).
+        request = compat_urllib_request.Request(
+            api_url, json.dumps(data).encode('utf-8'))
+
+        video_json_page = self._download_webpage(request, video_id, u'Downloading video JSON')
+        video_json = json.loads(video_json_page)
+
+        if u'error' in video_json:
+            error = video_json[u'error']
+            if error[u'origin'] == u'NoRedisValidData':
+                raise ExtractorError(u'Video %s does not exist' % video_id, expected=True)
+            raise ExtractorError(u'Unable to download video %s: %s' % (video_id, error[u'message']), expected=True)
+
+        result = video_json[u'result']
+
+        formats = [{
+            'url': x[u'url'],
+            'format_id': x[u'content_format'],
+        } for x in result[u'files'] if x[u'content_format'] in self._known_formats]
+        formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id']))
+
+        if not formats:
+            self._downloader.report_warning(u'No media links available for %s' % video_id)
+            return
+
+        duration = result[u'duration']
+        compilation = result[u'compilation']
+        title = result[u'title']
+
+        # Episodes carry the name of the series they belong to
+        title = '%s - %s' % (compilation, title) if compilation is not None else title
+
+        # Drop unknown preview sizes first: list.index() would raise
+        # ValueError for them while sorting.
+        previews = [p for p in result[u'preview']
+                    if p[u'content_format'] in self._known_thumbnails]
+        previews.sort(key=lambda p: self._known_thumbnails.index(p[u'content_format']))
+        thumbnail = previews[-1][u'url'] if previews else None
+
+        video_page = self._download_webpage(url, video_id, u'Downloading video page')
+        description = self._extract_description(video_page)
+        comment_count = self._extract_comment_count(video_page)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'description': description,
+            'duration': duration,
+            'comment_count': comment_count,
+            'formats': formats,
+        }
+
+
+class IviCompilationIE(InfoExtractor):
+    """Playlist extractor for ivi.ru compilations and single seasons.
+
+    Each episode link found on a page becomes a URL result that is handed
+    to the 'Ivi' extractor.
+    """
+
+    IE_DESC = u'ivi.ru compilations'
+    IE_NAME = u'ivi:compilation'
+    _VALID_URL = r'^https?://(?:www\.)?ivi\.ru/watch/(?!\d+)(?P<compilationid>[a-z\d_-]+)(?:/season(?P<seasonid>\d+))?$'
+
+    def _extract_entries(self, html, compilation_id):
+        """Return one URL result per episode link of *compilation_id* in *html*."""
+        episode_ids = re.findall(
+            r'<strong><a href="/watch/%s/(\d+)">(?:[^<]+)</a></strong>' % compilation_id,
+            html)
+        entries = []
+        for episode_id in episode_ids:
+            episode_url = 'http://www.ivi.ru/watch/%s/%s' % (compilation_id, episode_id)
+            entries.append(self.url_result(episode_url, 'Ivi'))
+        return entries
+
+    def _real_extract(self, url):
+        """Return a playlist for a season link or for a whole compilation."""
+        match = re.match(self._VALID_URL, url)
+        compilation_id = match.group('compilationid')
+        season_id = match.group('seasonid')
+
+        if season_id is not None:
+            # Season link: every episode is listed on this one page
+            page = self._download_webpage(
+                url, compilation_id, u'Downloading season %s web page' % season_id)
+            title = self._html_search_meta(u'title', page, u'title')
+            return self.playlist_result(
+                self._extract_entries(page, compilation_id),
+                '%s/season%s' % (compilation_id, season_id),
+                title)
+
+        # Compilation link
+        page = self._download_webpage(
+            url, compilation_id, u'Downloading compilation web page')
+        title = self._html_search_meta(u'title', page, u'title')
+        season_ids = re.findall(
+            r'<a href="/watch/%s/season(\d+)">[^<]+</a>' % compilation_id, page)
+
+        if not season_ids:
+            # No seasons in this compilation: episodes are on the page itself
+            entries = self._extract_entries(page, compilation_id)
+        else:
+            entries = []
+            for sid in season_ids:
+                season_url = 'http://www.ivi.ru/watch/%s/season%s' % (compilation_id, sid)
+                season_page = self._download_webpage(
+                    season_url, compilation_id,
+                    u'Downloading season %s web page' % sid)
+                entries.extend(self._extract_entries(season_page, compilation_id))
+
+        return self.playlist_result(entries, compilation_id, title)
\ No newline at end of file
diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py
index 6bb54b932..caf9d8c85 100644
--- a/youtube_dl/extractor/jeuxvideo.py
+++ b/youtube_dl/extractor/jeuxvideo.py
@@ -2,7 +2,6 @@
import json
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
@@ -22,7 +21,7 @@ class JeuxVideoIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- title = re.match(self._VALID_URL, url).group(1)
+ title = mobj.group(1)
webpage = self._download_webpage(url, title)
xml_link = self._html_search_regex(
r'<param name="flashvars" value="config=(.*?)" />',
@@ -32,12 +31,9 @@ class JeuxVideoIE(InfoExtractor):
r'http://www\.jeuxvideo\.com/config/\w+/\d+/(.*?)/\d+_player\.xml',
xml_link, u'video ID')
- xml_config = self._download_webpage(
+ config = self._download_xml(
xml_link, title, u'Downloading XML config')
- config = xml.etree.ElementTree.fromstring(xml_config.encode('utf-8'))
- info_json = self._search_regex(
- r'(?sm)<format\.json>(.*?)</format\.json>',
- xml_config, u'JSON information')
+ info_json = config.find('format.json').text
info = json.loads(info_json)['versions'][0]
video_url = 'http://video720.jeuxvideo.com/' + info['file']
diff --git a/youtube_dl/extractor/jukebox.py b/youtube_dl/extractor/jukebox.py
index c7bb234fe..592c64e1d 100644
--- a/youtube_dl/extractor/jukebox.py
+++ b/youtube_dl/extractor/jukebox.py
@@ -8,7 +8,7 @@ from ..utils import (
)
class JukeboxIE(InfoExtractor):
- _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<video_id>[a-z0-9\-]+).html'
+ _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<video_id>[a-z0-9\-]+)\.html'
_IFRAME = r'<iframe .*src="(?P<iframe>[^"]*)".*>'
_VIDEO_URL = r'"config":{"file":"(?P<video_url>http:[^"]+[.](?P<video_ext>[^.?]+)[?]mdtk=[0-9]+)"'
_TITLE = r'<h1 class="inline">(?P<title>[^<]+)</h1>.*<span id="infos_article_artist">(?P<artist>[^<]+)</span>'
diff --git a/youtube_dl/extractor/justintv.py b/youtube_dl/extractor/justintv.py
index f60017992..e9bde0c18 100644
--- a/youtube_dl/extractor/justintv.py
+++ b/youtube_dl/extractor/justintv.py
@@ -1,7 +1,6 @@
import json
import os
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@@ -94,10 +93,9 @@ class JustinTVIE(InfoExtractor):
archive_id = m.group(1)
api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
- chapter_info_xml = self._download_webpage(api, chapter_id,
+ doc = self._download_xml(api, chapter_id,
note=u'Downloading chapter information',
errnote=u'Chapter information download failed')
- doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
for a in doc.findall('.//archive'):
if archive_id == a.find('./id').text:
break
diff --git a/youtube_dl/extractor/kankan.py b/youtube_dl/extractor/kankan.py
index 445d46501..50916f4a6 100644
--- a/youtube_dl/extractor/kankan.py
+++ b/youtube_dl/extractor/kankan.py
@@ -1,8 +1,10 @@
import re
+import hashlib
from .common import InfoExtractor
from ..utils import determine_ext
+_md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()
class KankanIE(InfoExtractor):
_VALID_URL = r'https?://(?:.*?\.)?kankan\.com/.+?/(?P<id>\d+)\.shtml'
@@ -30,7 +32,10 @@ class KankanIE(InfoExtractor):
video_id, u'Downloading video url info')
ip = self._search_regex(r'ip:"(.+?)"', video_info_page, u'video url ip')
path = self._search_regex(r'path:"(.+?)"', video_info_page, u'video url path')
- video_url = 'http://%s%s' % (ip, path)
+ param1 = self._search_regex(r'param1:(\d+)', video_info_page, u'param1')
+ param2 = self._search_regex(r'param2:(\d+)', video_info_page, u'param2')
+ key = _md5('xl_mp43651' + param1 + param2)
+ video_url = 'http://%s%s?key=%s&key1=%s' % (ip, path, key, param2)
return {'id': video_id,
'title': title,
diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py
index 5e05900da..29658a7d6 100644
--- a/youtube_dl/extractor/keezmovies.py
+++ b/youtube_dl/extractor/keezmovies.py
@@ -12,7 +12,7 @@ from ..aes import (
)
class KeezMoviesIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>keezmovies\.com/video/.+?(?P<videoid>[0-9]+))'
+ _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>keezmovies\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)'
_TEST = {
u'url': u'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711',
u'file': u'1214711.mp4',
@@ -43,10 +43,10 @@ class KeezMoviesIE(InfoExtractor):
if webpage.find('encrypted=true')!=-1:
password = self._html_search_regex(r'video_title=(.+?)&amp;', webpage, u'password')
video_url = aes_decrypt_text(video_url, password, 32).decode('utf-8')
- path = compat_urllib_parse_urlparse( video_url ).path
- extension = os.path.splitext( path )[1][1:]
+ path = compat_urllib_parse_urlparse(video_url).path
+ extension = os.path.splitext(path)[1][1:]
format = path.split('/')[4].split('_')[:2]
- format = "-".join( format )
+ format = "-".join(format)
age_limit = self._rta_search(webpage)
diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py
index dd062a14e..5ae57a77c 100644
--- a/youtube_dl/extractor/liveleak.py
+++ b/youtube_dl/extractor/liveleak.py
@@ -8,7 +8,7 @@ from ..utils import (
class LiveLeakIE(InfoExtractor):
- _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
+ _VALID_URL = r'^(?:http://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
IE_NAME = u'liveleak'
_TEST = {
u'url': u'http://www.liveleak.com/view?i=757_1364311680',
diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py
index 4531fd6ab..1dcd1fb2d 100644
--- a/youtube_dl/extractor/livestream.py
+++ b/youtube_dl/extractor/livestream.py
@@ -5,13 +5,13 @@ from .common import InfoExtractor
from ..utils import (
compat_urllib_parse_urlparse,
compat_urlparse,
- get_meta_content,
- ExtractorError,
+ xpath_with_ns,
)
class LivestreamIE(InfoExtractor):
- _VALID_URL = r'http://new.livestream.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>\d+))?/?$'
+ IE_NAME = u'livestream'
+ _VALID_URL = r'http://new\.livestream\.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>\d+))?/?$'
_TEST = {
u'url': u'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370',
u'file': u'4719370.mp4',
@@ -54,3 +54,43 @@ class LivestreamIE(InfoExtractor):
info = json.loads(self._download_webpage(api_url, video_id,
u'Downloading video info'))
return self._extract_video_info(info)
+
+
+# The original version of Livestream uses a different system
+class LivestreamOriginalIE(InfoExtractor):
+    """Extractor for clips on www.livestream.com (addressed by clipId).
+
+    Clip details are read from the channel's XML API; the RTMP play path
+    is derived from the path of the clip's thumbnail.
+    """
+
+    IE_NAME = u'livestream:original'
+    _VALID_URL = r'https?://www\.livestream\.com/(?P<user>[^/]+)/video\?.*?clipId=(?P<id>.*?)(&|$)'
+    _TEST = {
+        u'url': u'http://www.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb',
+        u'info_dict': {
+            u'id': u'pla_8aa4a3f1-ba15-46a4-893b-902210e138fb',
+            u'ext': u'flv',
+            u'title': u'Spark 1 (BitCoin) with Cameron Winklevoss & Tyler Winklevoss of Winklevoss Capital',
+        },
+        u'params': {
+            # rtmp
+            u'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict (RTMP url and play path) for the clip at *url*."""
+        match = re.match(self._VALID_URL, url)
+        user = match.group('user')
+        video_id = match.group('id')
+        api_url = 'http://x{0}x.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id={1}'.format(user, video_id)
+
+        doc = self._download_xml(api_url, video_id)
+        channel = doc.find('channel')
+        item = channel.find('item')
+
+        media_ns = {'media': 'http://search.yahoo.com/mrss'}
+        thumbnail_el = item.find(xpath_with_ns('media:thumbnail', media_ns))
+        thumbnail_url = thumbnail_el.attrib['url']
+        # Strip the trailing "_<number>.jpg" (like 1.jpg) from the thumbnail
+        # path; the remainder is reused to build the play path.
+        path = self._search_regex(r'(user-files/.+)_.*?\.jpg$', thumbnail_url, u'path')
+        play_path = 'mp4:trans/dv15/mogulus-{0}.mp4'.format(path)
+
+        return {
+            'id': video_id,
+            'title': item.find('title').text,
+            'url': 'rtmp://extondemand.livestream.com/ondemand',
+            'play_path': play_path,
+            'ext': 'flv',
+            'thumbnail': thumbnail_url,
+        }
diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py
new file mode 100644
index 000000000..d29cf2c07
--- /dev/null
+++ b/youtube_dl/extractor/mdr.py
@@ -0,0 +1,78 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+)
+
+
+class MDRIE(InfoExtractor):
+    """Extractor for video and audio items of the mdr.de Mediathek.
+
+    The title is taken from the web page; the available formats come from
+    the "-avCustom.xml" document linked from that page.
+    """
+
+    _VALID_URL = r'^(?P<domain>(?:https?://)?(?:www\.)?mdr\.de)/mediathek/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)_.*'
+
+    _TESTS = [{
+        u'url': u'http://www.mdr.de/mediathek/themen/nachrichten/video165624_zc-c5c7de76_zs-3795826d.html',
+        u'file': u'165624.mp4',
+        u'md5': u'ae785f36ecbf2f19b42edf1bc9c85815',
+        u'info_dict': {
+            u"title": u"MDR aktuell Eins30 09.12.2013, 22:48 Uhr"
+        },
+    },
+    {
+        u'url': u'http://www.mdr.de/mediathek/radio/mdr1-radio-sachsen/audio718370_zc-67b21197_zs-1b9b2483.html',
+        u'file': u'718370.mp3',
+        u'md5': u'a9d21345a234c7b45dee612f290fd8d7',
+        u'info_dict': {
+            u"title": u"MDR 1 RADIO SACHSEN 10.12.2013, 05:00 Uhr"
+        },
+    }]
+
+    def _real_extract(self, url):
+        """Return the info dict for the media item at *url*.
+
+        Raises ExtractorError when no asset offers a progressive download URL.
+        """
+        m = re.match(self._VALID_URL, url)
+        video_id = m.group('video_id')
+        domain = m.group('domain')
+
+        # determine title and media streams from webpage
+        html = self._download_webpage(url, video_id)
+
+        title = self._html_search_regex(r'<h2>(.*?)</h2>', html, u'title')
+        xmlurl = self._search_regex(
+            r'(/mediathek/(?:.+)/(?:video|audio)[0-9]+-avCustom\.xml)', html, u'XML URL')
+
+        doc = self._download_xml(domain + xmlurl, video_id)
+        formats = []
+        for a in doc.findall('./assets/asset'):
+            url_el = a.find('.//progressiveDownloadUrl')
+            if url_el is None:
+                # Assets without a progressive download URL are skipped
+                continue
+            abr = int(a.find('bitrateAudio').text) // 1000
+            media_type = a.find('mediaType').text
+            fmt = {
+                'abr': abr,
+                'filesize': int(a.find('fileSize').text),
+                'url': url_el.text,
+            }
+
+            vbr_el = a.find('bitrateVideo')
+            if vbr_el is None:
+                # Audio-only asset
+                fmt.update({
+                    'vcodec': 'none',
+                    'format_id': u'%s-%d' % (media_type, abr),
+                })
+            else:
+                vbr = int(vbr_el.text) // 1000
+                fmt.update({
+                    'vbr': vbr,
+                    'width': int(a.find('frameWidth').text),
+                    'height': int(a.find('frameHeight').text),
+                    'format_id': u'%s-%d' % (media_type, vbr),
+                })
+            formats.append(fmt)
+        # Audio-only formats have no 'vbr'; use -1 so they still sort first
+        # without comparing None to int, which raises TypeError on Python 3.
+        formats.sort(key=lambda f: (f.get('vbr', -1), f['abr']))
+        if not formats:
+            raise ExtractorError(u'Could not find any valid formats')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py
index 91480ba87..99d3c83a5 100644
--- a/youtube_dl/extractor/metacafe.py
+++ b/youtube_dl/extractor/metacafe.py
@@ -1,14 +1,10 @@
import re
-import socket
from .common import InfoExtractor
from ..utils import (
- compat_http_client,
compat_parse_qs,
- compat_urllib_error,
compat_urllib_parse,
compat_urllib_request,
- compat_str,
determine_ext,
ExtractorError,
)
@@ -69,6 +65,21 @@ class MetacafeIE(InfoExtractor):
u'age_limit': 18,
},
},
+ # cbs video
+ {
+ u'url': u'http://www.metacafe.com/watch/cb-0rOxMBabDXN6/samsung_galaxy_note_2_samsungs_next_generation_phablet/',
+ u'info_dict': {
+ u'id': u'0rOxMBabDXN6',
+ u'ext': u'flv',
+ u'title': u'Samsung Galaxy Note 2: Samsung\'s next-generation phablet',
+ u'description': u'md5:54d49fac53d26d5a0aaeccd061ada09d',
+ u'duration': 129,
+ },
+ u'params': {
+ # rtmp download
+ u'skip_download': True,
+ },
+ },
]
@@ -78,12 +89,8 @@ class MetacafeIE(InfoExtractor):
def _real_initialize(self):
# Retrieve disclaimer
- request = compat_urllib_request.Request(self._DISCLAIMER)
- try:
- self.report_disclaimer()
- compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
+ self.report_disclaimer()
+ self._download_webpage(self._DISCLAIMER, None, False, u'Unable to retrieve disclaimer')
# Confirm age
disclaimer_form = {
@@ -92,11 +99,8 @@ class MetacafeIE(InfoExtractor):
}
request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
- try:
- self.report_age_confirmation()
- compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
+ self.report_age_confirmation()
+ self._download_webpage(request, None, False, u'Unable to confirm age')
def _real_extract(self, url):
# Extract id and simplified title from URL
@@ -106,10 +110,16 @@ class MetacafeIE(InfoExtractor):
video_id = mobj.group(1)
- # Check if video comes from YouTube
- mobj2 = re.match(r'^yt-(.*)$', video_id)
- if mobj2 is not None:
- return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
+ # the video may come from an external site
+ m_external = re.match('^(\w{2})-(.*)$', video_id)
+ if m_external is not None:
+ prefix, ext_id = m_external.groups()
+ # Check if video comes from YouTube
+ if prefix == 'yt':
+ return self.url_result('http://www.youtube.com/watch?v=%s' % ext_id, 'Youtube')
+ # CBS videos use theplatform.com
+ if prefix == 'cb':
+ return self.url_result('theplatform:%s' % ext_id, 'ThePlatform')
# Retrieve video webpage to extract further information
req = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py
index 449138b56..e560c1d35 100644
--- a/youtube_dl/extractor/metacritic.py
+++ b/youtube_dl/extractor/metacritic.py
@@ -1,8 +1,10 @@
import re
-import xml.etree.ElementTree
import operator
from .common import InfoExtractor
+from ..utils import (
+ fix_xml_all_ampersand,
+)
class MetacriticIE(InfoExtractor):
@@ -23,9 +25,8 @@ class MetacriticIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
# The xml is not well formatted, there are raw '&'
- info_xml = self._download_webpage('http://www.metacritic.com/video_data?video=' + video_id,
- video_id, u'Downloading info xml').replace('&', '&amp;')
- info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
+ info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id,
+ video_id, u'Downloading info xml', transform_source=fix_xml_all_ampersand)
clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id)
formats = []
@@ -43,13 +44,10 @@ class MetacriticIE(InfoExtractor):
description = self._html_search_regex(r'<b>Description:</b>(.*?)</p>',
webpage, u'description', flags=re.DOTALL)
- info = {
+ return {
'id': video_id,
'title': clip.find('title').text,
'formats': formats,
'description': description,
'duration': int(clip.find('duration').text),
}
- # TODO: Remove when #980 has been merged
- info.update(formats[-1])
- return info
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index a200dcd74..125d81551 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -1,13 +1,10 @@
import json
import re
-import socket
from .common import InfoExtractor
from ..utils import (
- compat_http_client,
- compat_urllib_error,
- compat_urllib_request,
unified_strdate,
+ ExtractorError,
)
@@ -31,13 +28,18 @@ class MixcloudIE(InfoExtractor):
"""Returns 1st active url from list"""
for url in url_list:
try:
- compat_urllib_request.urlopen(url)
+ # We only want to know if the request succeeds;
+ # don't download the whole file
+ self._request_webpage(url, None, False)
return url
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
+ except ExtractorError:
url = None
return None
+ def _get_url(self, template_url):
+ return self.check_urls(template_url % i for i in range(30))
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -53,14 +55,19 @@ class MixcloudIE(InfoExtractor):
preview_url = self._search_regex(r'data-preview-url="(.+?)"', webpage, u'preview url')
song_url = preview_url.replace('/previews/', '/cloudcasts/originals/')
template_url = re.sub(r'(stream\d*)', 'stream%d', song_url)
- final_song_url = self.check_urls(template_url % i for i in range(30))
+ final_song_url = self._get_url(template_url)
+ if final_song_url is None:
+ self.to_screen('Trying with m4a extension')
+ template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/')
+ final_song_url = self._get_url(template_url)
+ if final_song_url is None:
+ raise ExtractorError(u'Unable to extract track url')
return {
'id': track_id,
'title': info['name'],
'url': final_song_url,
- 'ext': 'mp3',
- 'description': info['description'],
+ 'description': info.get('description'),
'thumbnail': info['pictures'].get('extra_large'),
'uploader': info['user']['name'],
'uploader_id': info['user']['username'],
diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py
new file mode 100644
index 000000000..b9430b09b
--- /dev/null
+++ b/youtube_dl/extractor/mofosex.py
@@ -0,0 +1,49 @@
+import os
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse_urlparse,
+ compat_urllib_request,
+ compat_urllib_parse,
+)
+
+class MofosexIE(InfoExtractor):
+    """Extractor for mofosex.com video pages."""
+
+    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>mofosex\.com/videos/(?P<videoid>[0-9]+)/.*?\.html)'
+    _TEST = {
+        u'url': u'http://www.mofosex.com/videos/5018/japanese-teen-music-video.html',
+        u'file': u'5018.mp4',
+        u'md5': u'1b2eb47ac33cc75d4a80e3026b613c5a',
+        u'info_dict': {
+            u"title": u"Japanese Teen Music Video",
+            u"age_limit": 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for the video page at *url*."""
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('videoid')
+        # Normalize to the http://www. form, whatever the user typed
+        url = 'http://www.' + mobj.group('url')
+
+        # The page is requested with the age verification cookie set
+        req = compat_urllib_request.Request(url)
+        req.add_header('Cookie', 'age_verified=1')
+        webpage = self._download_webpage(req, video_id)
+
+        video_title = self._html_search_regex(r'<h1>(.+?)<', webpage, u'title')
+        video_url = compat_urllib_parse.unquote(self._html_search_regex(
+            r"flashvars\.video_url = '([^']+)", webpage, u'video_url'))
+        path = compat_urllib_parse_urlparse(video_url).path
+        extension = os.path.splitext(path)[1][1:]
+        # The format name is built from the first two '_'-separated parts
+        # of the sixth path component.
+        format_parts = path.split('/')[5].split('_')[:2]
+        format_id = "-".join(format_parts)
+
+        age_limit = self._rta_search(webpage)
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'url': video_url,
+            'ext': extension,
+            'format': format_id,
+            'format_id': format_id,
+            'age_limit': age_limit,
+        }
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index e96d3952c..ed11f521a 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -10,34 +10,8 @@ from ..utils import (
def _media_xml_tag(tag):
return '{http://search.yahoo.com/mrss/}%s' % tag
-class MTVIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$'
-
- _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/'
-
- _TESTS = [
- {
- u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml',
- u'file': u'853555.mp4',
- u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8',
- u'info_dict': {
- u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"',
- u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.',
- },
- },
- {
- u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml',
- u'file': u'USCJY1331283.mp4',
- u'md5': u'73b4e7fcadd88929292fe52c3ced8caf',
- u'info_dict': {
- u'title': u'Everything Has Changed',
- u'upload_date': u'20130606',
- u'uploader': u'Taylor Swift',
- },
- u'skip': u'VEVO is only available in some countries',
- },
- ]
+class MTVServicesInfoExtractor(InfoExtractor):
@staticmethod
def _id_from_uri(uri):
return uri.split(':')[-1]
@@ -47,18 +21,22 @@ class MTVIE(InfoExtractor):
def _transform_rtmp_url(rtmp_video_url):
m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url)
if not m:
- raise ExtractorError(u'Cannot transform RTMP url')
+ return rtmp_video_url
base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
return base + m.group('finalid')
def _get_thumbnail_url(self, uri, itemdoc):
- return 'http://mtv.mtvnimages.com/uri/' + uri
+ search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
+ thumb_node = itemdoc.find(search_path)
+ if thumb_node is None:
+ return None
+ else:
+ return thumb_node.attrib['url']
def _extract_video_formats(self, metadataXml):
if '/error_country_block.swf' in metadataXml:
raise ExtractorError(u'This video is not available from your country.', expected=True)
mdoc = xml.etree.ElementTree.fromstring(metadataXml.encode('utf-8'))
- renditions = mdoc.findall('.//rendition')
formats = []
for rendition in mdoc.findall('.//rendition'):
@@ -93,7 +71,7 @@ class MTVIE(InfoExtractor):
else:
description = None
- info = {
+ return {
'title': itemdoc.find('title').text,
'formats': self._extract_video_formats(mediagen_page),
'id': video_id,
@@ -101,32 +79,67 @@ class MTVIE(InfoExtractor):
'description': description,
}
- # TODO: Remove when #980 has been merged
- info.update(info['formats'][-1])
-
- return info
-
def _get_videos_info(self, uri):
video_id = self._id_from_uri(uri)
data = compat_urllib_parse.urlencode({'uri': uri})
- infoXml = self._download_webpage(self._FEED_URL +'?' + data, video_id,
- u'Downloading info')
- idoc = xml.etree.ElementTree.fromstring(infoXml.encode('utf-8'))
+
+ def fix_ampersand(s):
+ """ Fix unencoded ampersand in XML """
+ return s.replace(u'& ', '&amp; ')
+ idoc = self._download_xml(
+ self._FEED_URL + '?' + data, video_id,
+ u'Downloading info', transform_source=fix_ampersand)
return [self._get_video_info(item) for item in idoc.findall('.//item')]
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('videoid')
- webpage = self._download_webpage(url, video_id)
+class MTVIE(MTVServicesInfoExtractor):
+ _VALID_URL = r'''(?x)^https?://
+ (?:(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$|
+ m\.mtv\.com/videos/video\.rbml\?.*?id=(?P<mgid>[^&]+))'''
- # Some videos come from Vevo.com
- m_vevo = re.search(r'isVevoVideo = true;.*?vevoVideoId = "(.*?)";',
- webpage, re.DOTALL)
- if m_vevo:
- vevo_id = m_vevo.group(1);
- self.to_screen(u'Vevo video detected: %s' % vevo_id)
- return self.url_result('vevo:%s' % vevo_id, ie='Vevo')
+ _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/'
- uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, u'uri')
+ _TESTS = [
+ {
+ u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml',
+ u'file': u'853555.mp4',
+ u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8',
+ u'info_dict': {
+ u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"',
+ u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.',
+ },
+ },
+ {
+ u'add_ie': ['Vevo'],
+ u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml',
+ u'file': u'USCJY1331283.mp4',
+ u'md5': u'73b4e7fcadd88929292fe52c3ced8caf',
+ u'info_dict': {
+ u'title': u'Everything Has Changed',
+ u'upload_date': u'20130606',
+ u'uploader': u'Taylor Swift',
+ },
+ u'skip': u'VEVO is only available in some countries',
+ },
+ ]
+
+ def _get_thumbnail_url(self, uri, itemdoc):
+ return 'http://mtv.mtvnimages.com/uri/' + uri
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('videoid')
+ uri = mobj.group('mgid')
+ if uri is None:
+ webpage = self._download_webpage(url, video_id)
+
+ # Some videos come from Vevo.com
+ m_vevo = re.search(r'isVevoVideo = true;.*?vevoVideoId = "(.*?)";',
+ webpage, re.DOTALL)
+ if m_vevo:
+ vevo_id = m_vevo.group(1);
+ self.to_screen(u'Vevo video detected: %s' % vevo_id)
+ return self.url_result('vevo:%s' % vevo_id, ie='Vevo')
+
+ uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, u'uri')
return self._get_videos_info(uri)
diff --git a/youtube_dl/extractor/muzu.py b/youtube_dl/extractor/muzu.py
index 03e31ea1c..1772b7f9a 100644
--- a/youtube_dl/extractor/muzu.py
+++ b/youtube_dl/extractor/muzu.py
@@ -9,7 +9,7 @@ from ..utils import (
class MuzuTVIE(InfoExtractor):
- _VALID_URL = r'https?://www.muzu.tv/(.+?)/(.+?)/(?P<id>\d+)'
+ _VALID_URL = r'https?://www\.muzu\.tv/(.+?)/(.+?)/(?P<id>\d+)'
IE_NAME = u'muzu.tv'
_TEST = {
diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py
index 107665d15..4becddee6 100644
--- a/youtube_dl/extractor/myspass.py
+++ b/youtube_dl/extractor/myspass.py
@@ -1,5 +1,4 @@
import os.path
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@@ -10,7 +9,7 @@ from ..utils import (
class MySpassIE(InfoExtractor):
- _VALID_URL = r'http://www.myspass.de/.*'
+ _VALID_URL = r'http://www\.myspass\.de/.*'
_TEST = {
u'url': u'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/',
u'file': u'11741.mp4',
@@ -33,8 +32,7 @@ class MySpassIE(InfoExtractor):
# get metadata
metadata_url = META_DATA_URL_TEMPLATE % video_id
- metadata_text = self._download_webpage(metadata_url, video_id)
- metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
+ metadata = self._download_xml(metadata_url, video_id)
# extract values from metadata
url_flv_el = metadata.find('url_flv')
diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py
index 9df236d69..4cab30631 100644
--- a/youtube_dl/extractor/naver.py
+++ b/youtube_dl/extractor/naver.py
@@ -1,6 +1,5 @@
# encoding: utf-8
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@@ -10,7 +9,7 @@ from ..utils import (
class NaverIE(InfoExtractor):
- _VALID_URL = r'https?://tvcast\.naver\.com/v/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)'
_TEST = {
u'url': u'http://tvcast.naver.com/v/81652',
@@ -38,14 +37,12 @@ class NaverIE(InfoExtractor):
'protocol': 'p2p',
'inKey': key,
})
- info_xml = self._download_webpage(
+ info = self._download_xml(
'http://serviceapi.rmcnmv.naver.com/flash/videoInfo.nhn?' + query,
video_id, u'Downloading video info')
- urls_xml = self._download_webpage(
+ urls = self._download_xml(
'http://serviceapi.rmcnmv.naver.com/flash/playableEncodingOption.nhn?' + query_urls,
video_id, u'Downloading video formats info')
- info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
- urls = xml.etree.ElementTree.fromstring(urls_xml.encode('utf-8'))
formats = []
for format_el in urls.findall('EncodingOptions/EncodingOption'):
@@ -59,7 +56,7 @@ class NaverIE(InfoExtractor):
'height': int(format_el.find('height').text),
})
- info = {
+ return {
'id': video_id,
'title': info.find('Subject').text,
'formats': formats,
@@ -68,6 +65,3 @@ class NaverIE(InfoExtractor):
'upload_date': info.find('WriteDate').text.replace('.', ''),
'view_count': int(info.find('PlayCount').text),
}
- # TODO: Remove when #980 has been merged
- info.update(formats[-1])
- return info
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index 3bc9dae6d..e8bbfff7b 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -1,5 +1,4 @@
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import find_xpath_attr, compat_str
@@ -21,8 +20,8 @@ class NBCNewsIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- info_xml = self._download_webpage('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
- info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')).find('video')
+ all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
+ info = all_info.find('video')
return {'id': video_id,
'title': info.find('headline').text,
diff --git a/youtube_dl/extractor/ndtv.py b/youtube_dl/extractor/ndtv.py
new file mode 100644
index 000000000..d81df3c10
--- /dev/null
+++ b/youtube_dl/extractor/ndtv.py
@@ -0,0 +1,64 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import month_by_name
+
+
class NDTVIE(InfoExtractor):
    """Extractor for single videos shown in the ndtv.com video player."""

    _VALID_URL = r'^https?://(?:www\.)?ndtv\.com/video/player/[^/]*/[^/]*/(?P<id>[a-z0-9]+)'

    _TEST = {
        u"url": u"http://www.ndtv.com/video/player/news/ndtv-exclusive-don-t-need-character-certificate-from-rahul-gandhi-says-arvind-kejriwal/300710",
        u"file": u"300710.mp4",
        u"md5": u"39f992dbe5fb531c395d8bbedb1e5e88",
        u"info_dict": {
            u"title": u"NDTV exclusive: Don't need character certificate from Rahul Gandhi, says Arvind Kejriwal",
            u"description": u"In an exclusive interview to NDTV, Aam Aadmi Party's Arvind Kejriwal says it makes no difference to him that Rahul Gandhi said the Congress needs to learn from his party.",
            u"upload_date": u"20131208",
            u"duration": 1327,
            u"thumbnail": u"http://i.ndtvimg.com/video/images/vod/medium/2013-12/big_300710_1386518307.jpg",
        },
    }

    def _real_extract(self, url):
        """Download the player page at *url* and return its info dict.

        The media URL is built from the ``__filename`` value embedded in the
        page.  Duration, upload date and description are optional: when the
        page does not provide them the corresponding field is ``None``.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)

        filename = self._search_regex(
            r"__filename='([^']+)'", webpage, u'video filename')
        video_url = (u'http://bitcast-b.bitgravity.com/ndtvod/23372/ndtv/%s' %
                     filename)

        # Non-fatal lookup: duration_str is None when the page has no
        # __duration value.  (The original chained assignment also
        # overwrote `filename` here, which was unintended.)
        duration_str = self._search_regex(
            r"__duration='([^']+)'", webpage, u'duration', fatal=False)
        duration = None if duration_str is None else int(duration_str)

        date_m = re.search(r'''(?x)
            <p\s+class="vod_dateline">\s*
                Published\s+On:\s*
                (?P<monthname>[A-Za-z]+)\s+(?P<day>[0-9]+),\s*(?P<year>[0-9]+)
            ''', webpage)
        # The upload date is optional metadata: a missing dateline must not
        # abort the extraction, so there is no assert on date_m.
        upload_date = None
        if date_m is not None:
            month = month_by_name(date_m.group('monthname'))
            if month is not None:
                upload_date = '%s%02d%02d' % (
                    date_m.group('year'), month, int(date_m.group('day')))

        description = self._og_search_description(webpage)
        READ_MORE = u' (Read more)'
        # NOTE(review): guards against a missing og:description; assumes the
        # helper can return None - confirm against InfoExtractor.
        if description is not None and description.endswith(READ_MORE):
            description = description[:-len(READ_MORE)]

        return {
            'id': video_id,
            'url': video_url,
            'title': self._og_search_title(webpage),
            'description': description,
            'thumbnail': self._og_search_thumbnail(webpage),
            'duration': duration,
            'upload_date': upload_date,
        }
diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py
index 224f56ac8..2edd806a3 100644
--- a/youtube_dl/extractor/nhl.py
+++ b/youtube_dl/extractor/nhl.py
@@ -1,6 +1,5 @@
import re
import json
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@@ -26,9 +25,8 @@ class NHLBaseInfoExtractor(InfoExtractor):
'path': initial_video_url.replace('.mp4', '_sd.mp4'),
})
path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' + data
- path_response = self._download_webpage(path_url, video_id,
+ path_doc = self._download_xml(path_url, video_id,
u'Downloading final video url')
- path_doc = xml.etree.ElementTree.fromstring(path_response)
video_url = path_doc.find('path').text
join = compat_urlparse.urljoin
@@ -72,7 +70,7 @@ class NHLIE(NHLBaseInfoExtractor):
class NHLVideocenterIE(NHLBaseInfoExtractor):
IE_NAME = u'nhl.com:videocenter'
- IE_DESC = u'Download the first 12 videos from a videocenter category'
+ IE_DESC = u'NHL videocenter category'
_VALID_URL = r'https?://video\.(?P<team>[^.]*)\.nhl\.com/videocenter/(console\?.*?catid=(?P<catid>[^&]+))?'
@classmethod
diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py
new file mode 100644
index 000000000..46774317c
--- /dev/null
+++ b/youtube_dl/extractor/niconico.py
@@ -0,0 +1,127 @@
+# encoding: utf-8
+
+import re
+import socket
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_http_client,
+ compat_urllib_error,
+ compat_urllib_parse,
+ compat_urllib_request,
+ compat_urlparse,
+ compat_str,
+
+ ExtractorError,
+ unified_strdate,
+)
+
+
class NiconicoIE(InfoExtractor):
    """Extractor for nicovideo.jp watch pages (requires an account login)."""

    IE_NAME = u'niconico'
    IE_DESC = u'ニコニコ動画'

    _TEST = {
        u'url': u'http://www.nicovideo.jp/watch/sm22312215',
        u'file': u'sm22312215.mp4',
        u'md5': u'd1a75c0823e2f629128c43e1212760f9',
        u'info_dict': {
            u'title': u'Big Buck Bunny',
            u'uploader': u'takuya0301',
            u'uploader_id': u'2698420',
            u'upload_date': u'20131123',
            u'description': u'(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
        },
        u'params': {
            u'username': u'ydl.niconico@gmail.com',
            u'password': u'youtube-dl',
        },
    }

    _VALID_URL = r'^https?://(?:www\.|secure\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$'
    _NETRC_MACHINE = 'niconico'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = True

    def _real_initialize(self):
        """Log in once before any extraction is attempted."""
        self._login()

    def _login(self):
        """Submit the login form with the configured credentials.

        Returns True when the login request did not report an error and
        False when no credentials are configured (and login is optional) or
        the response contains the "Log in error" heading.

        Raises ExtractorError when no credentials are available while
        ``_LOGIN_REQUIRED`` is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        # Log in
        login_form_strs = {
            u'mail': username,
            u'password': password,
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8')
        request = compat_urllib_request.Request(
            u'https://secure.nicovideo.jp/secure/login', login_data)
        login_results = self._download_webpage(
            request, u'', note=u'Logging in', errnote=u'Unable to log in')
        if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _real_extract(self, url):
        """Collect metadata and the media URL for the video at *url*.

        Metadata comes from the ``getthumbinfo`` XML API and the media URL
        from the ``getflv`` API.  The uploader nickname is looked up on a
        best-effort basis; the numeric user id is used when that fails.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group(1)

        # Get video webpage. We are not actually interested in it, but need
        # the cookies in order to be able to download the info webpage
        self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id)

        video_info = self._download_xml(
            'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,
            note=u'Downloading video info page')

        # Get flv info
        flv_info_webpage = self._download_webpage(
            u'http://flapi.nicovideo.jp/api/getflv?v=' + video_id,
            video_id, u'Downloading flv info')
        video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0]

        # Start extracting information
        video_title = video_info.find('.//title').text
        video_extension = video_info.find('.//movie_type').text
        video_format = video_extension.upper()
        video_thumbnail = video_info.find('.//thumbnail_url').text
        video_description = video_info.find('.//description').text
        video_uploader_id = video_info.find('.//user_id').text
        video_upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0])
        # Report the view count as an integer, like the other extractors do,
        # instead of passing the raw XML text through.
        view_counter_text = video_info.find('.//view_counter').text
        video_view_count = int(view_counter_text) if view_counter_text else None
        video_webpage_url = video_info.find('.//watch_url').text

        # uploader: fall back to the user id when the nickname is unavailable
        video_uploader = video_uploader_id
        # Separate name so the `url` parameter is not shadowed.
        user_info_url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id
        try:
            user_info = self._download_xml(
                user_info_url, video_id, note=u'Downloading user information')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'Unable to download user info webpage: %s' % compat_str(err))
        else:
            nickname_el = user_info.find('.//nickname')
            # A response without <nickname> keeps the user-id fallback
            # instead of crashing on None.text.
            if nickname_el is not None:
                video_uploader = nickname_el.text

        return {
            'id': video_id,
            'url': video_real_url,
            'title': video_title,
            'ext': video_extension,
            'format': video_format,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'view_count': video_view_count,
            'webpage_url': video_webpage_url,
        }
diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py
new file mode 100644
index 000000000..ea986c00e
--- /dev/null
+++ b/youtube_dl/extractor/ninegag.py
@@ -0,0 +1,43 @@
+import json
+import re
+
+from .common import InfoExtractor
+
+
class NineGagIE(InfoExtractor):
    """Extractor for 9gag.tv pages that embed a YouTube video."""

    IE_NAME = '9gag'
    _VALID_URL = r'^https?://(?:www\.)?9gag\.tv/v/(?P<id>[0-9]+)'

    _TEST = {
        u"url": u"http://9gag.tv/v/1912",
        u"file": u"1912.mp4",
        u"info_dict": {
            u"description": u"This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)",
            u"title": u"\"People Are Awesome 2013\" Is Absolutely Awesome"
        },
        u'add_ie': [u'Youtube']
    }

    def _real_extract(self, url):
        """Read the JSON metadata attribute from the page and delegate to Youtube.

        The result is a ``url_transparent`` entry: the Youtube extractor
        supplies the media while the fields set here come from 9gag's own
        metadata.
        """
        video_id = re.match(self._VALID_URL, url).group('id')
        page = self._download_webpage(url, video_id)

        # The metadata is a JSON document stored in an HTML attribute.
        meta_json = self._html_search_regex(
            r'''(?x)
                <div\s*id="tv-video"\s*data-video-source="youtube"\s*
                data-video-meta="([^"]+)"''',
            page, u'video metadata')
        meta = json.loads(meta_json)

        result = {'_type': 'url_transparent'}
        result['url'] = meta['youtubeVideoId']
        result['ie_key'] = 'Youtube'
        result['id'] = video_id
        result['title'] = meta['title']
        result['description'] = meta['description']
        result['view_count'] = int(meta['view_count'])
        result['like_count'] = int(meta['statistic']['like'])
        result['dislike_count'] = int(meta['statistic']['dislike'])
        result['thumbnail'] = meta['thumbnail_url']
        return result
diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py
index 1f7b4d2e7..d08e47734 100644
--- a/youtube_dl/extractor/ooyala.py
+++ b/youtube_dl/extractor/ooyala.py
@@ -22,6 +22,11 @@ class OoyalaIE(InfoExtractor):
def _url_for_embed_code(embed_code):
return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code
+ @classmethod
+ def _build_url_result(cls, embed_code):
+ return cls.url_result(cls._url_for_embed_code(embed_code),
+ ie=cls.ie_key())
+
def _extract_result(self, info, more_info):
return {'id': info['embedCode'],
'ext': 'mp4',
diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py
index cfca2a063..b42eae89a 100644
--- a/youtube_dl/extractor/orf.py
+++ b/youtube_dl/extractor/orf.py
@@ -12,7 +12,7 @@ from ..utils import (
)
class ORFIE(InfoExtractor):
- _VALID_URL = r'https?://tvthek.orf.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)'
+ _VALID_URL = r'https?://tvthek\.orf\.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index 65462d867..25f019231 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -5,7 +5,7 @@ from .common import InfoExtractor
class PBSIE(InfoExtractor):
- _VALID_URL = r'https?://video.pbs.org/video/(?P<id>\d+)/?'
+ _VALID_URL = r'https?://video\.pbs\.org/video/(?P<id>\d+)/?'
_TEST = {
u'url': u'http://video.pbs.org/video/2365006249/',
diff --git a/youtube_dl/extractor/podomatic.py b/youtube_dl/extractor/podomatic.py
new file mode 100644
index 000000000..58200971b
--- /dev/null
+++ b/youtube_dl/extractor/podomatic.py
@@ -0,0 +1,49 @@
+import json
+import re
+
+from .common import InfoExtractor
+
+
class PodomaticIE(InfoExtractor):
    """Extractor for podcast entries hosted on <channel>.podomatic.com."""

    IE_NAME = 'podomatic'
    _VALID_URL = r'^(?P<proto>https?)://(?P<channel>[^.]+)\.podomatic\.com/entry/(?P<id>[^?]+)'

    _TEST = {
        u"url": u"http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00",
        u"file": u"2009-01-02T16_03_35-08_00.mp3",
        u"md5": u"84bb855fcf3429e6bf72460e1eed782d",
        u"info_dict": {
            u"uploader": u"Science Teaching Tips",
            u"uploader_id": u"scienceteachingtips",
            u"title": u"64. When the Moon Hits Your Eye",
            u"duration": 446,
        }
    }

    def _real_extract(self, url):
        """Fetch the entry's embed parameters (JSON) and map them to an info dict."""
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        channel = match.group('channel')

        # Same scheme and channel host as the page, embed_params endpoint.
        params_url_tmpl = ('%s://%s.podomatic.com/entry/embed_params/%s' +
                           '?permalink=true&rtmp=0')
        json_url = params_url_tmpl % (match.group('proto'), channel, video_id)
        entry = json.loads(self._download_webpage(
            json_url, video_id, note=u'Downloading video info'))

        result = {'id': video_id, 'uploader_id': channel}
        result['url'] = entry['downloadLink']
        result['uploader'] = entry['podcast']
        result['title'] = entry['title']
        result['thumbnail'] = entry['imageLocation']
        # The length value is divided by 1000 and truncated to whole units.
        result['duration'] = int(entry['length'] / 1000.0)
        return result
diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py
new file mode 100644
index 000000000..71abd5013
--- /dev/null
+++ b/youtube_dl/extractor/pornhd.py
@@ -0,0 +1,38 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import compat_urllib_parse
+
+
class PornHdIE(InfoExtractor):
    """Extractor for pornhd.com video pages."""

    _VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)'
    _TEST = {
        u'url': u'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
        u'file': u'1962.flv',
        u'md5': u'35272469887dca97abd30abecc6cdf75',
        u'info_dict': {
            u"title": u"sierra-day-gets-his-cum-all-over-herself-hd-porn-video",
            u"age_limit": 18,
        }
    }

    def _real_extract(self, url):
        """Return the info dict for *url*.

        Id and title are taken from the URL itself; the media URL is the
        percent-decoded ``hd`` parameter found in the page.
        """
        match = re.match(self._VALID_URL, url)
        video_id = match.group('video_id')
        title_slug = match.group('video_title')

        page = self._download_webpage(url, video_id)
        encoded_url = self._html_search_regex(
            r'&hd=(http.+?)&', page, u'video URL')

        return {
            'id': video_id,
            'url': compat_urllib_parse.unquote(encoded_url),
            'ext': 'flv',
            'title': title_slug,
            'age_limit': 18,
        }
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index 5e2454f1b..d9135c6b9 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -6,14 +6,13 @@ from ..utils import (
compat_urllib_parse_urlparse,
compat_urllib_request,
compat_urllib_parse,
- unescapeHTML,
)
from ..aes import (
aes_decrypt_text
)
class PornHubIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>pornhub\.com/view_video\.php\?viewkey=(?P<videoid>[0-9]+))'
+ _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>pornhub\.com/view_video\.php\?viewkey=(?P<videoid>[0-9a-f]+))'
_TEST = {
u'url': u'http://www.pornhub.com/view_video.php?viewkey=648719015',
u'file': u'648719015.mp4',
@@ -47,10 +46,10 @@ class PornHubIE(InfoExtractor):
formats = []
for video_url in video_urls:
- path = compat_urllib_parse_urlparse( video_url ).path
- extension = os.path.splitext( path )[1][1:]
+ path = compat_urllib_parse_urlparse(video_url).path
+ extension = os.path.splitext(path)[1][1:]
format = path.split('/')[5].split('_')[:2]
- format = "-".join( format )
+ format = "-".join(format)
formats.append({
'url': video_url,
'ext': extension,
diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py
new file mode 100644
index 000000000..33054591b
--- /dev/null
+++ b/youtube_dl/extractor/pyvideo.py
@@ -0,0 +1,51 @@
+import re
+import os
+
+from .common import InfoExtractor
+
+
class PyvideoIE(InfoExtractor):
    """Extractor for pyvideo.org: YouTube-backed pages and direct downloads."""

    _VALID_URL = r'(?:http://)?(?:www\.)?pyvideo\.org/video/(?P<id>\d+)/(.*)'
    _TESTS = [
        {
            u'url': u'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes',
            u'file': u'24_4WWkSmNo.mp4',
            u'md5': u'de317418c8bc76b1fd8633e4f32acbc6',
            u'info_dict': {
                u"title": u"Become a logging expert in 30 minutes",
                u"description": u"md5:9665350d466c67fb5b1598de379021f7",
                u"upload_date": u"20130320",
                u"uploader": u"NextDayVideo",
                u"uploader_id": u"NextDayVideo",
            },
            u'add_ie': ['Youtube'],
        },
        {
            u'url': u'http://pyvideo.org/video/2542/gloriajw-spotifywitherikbernhardsson182m4v',
            u'md5': u'5fe1c7e0a8aa5570330784c847ff6d12',
            u'info_dict': {
                u'id': u'2542',
                u'ext': u'm4v',
                u'title': u'Gloriajw-SpotifyWithErikBernhardsson182',
            },
        },
    ]

    def _real_extract(self, url):
        """Return a Youtube url_result when the page links one, else a direct entry."""
        video_id = re.match(self._VALID_URL, url).group('id')
        page = self._download_webpage(url, video_id)

        # Pages that link a YouTube watch URL are delegated to that extractor.
        youtube_link = re.search(
            r'(https?://www\.youtube\.com/watch\?v=.*)', page)
        if youtube_link is not None:
            return self.url_result(youtube_link.group(1), 'Youtube')

        # Otherwise take the media from a <source> tag or the Download link.
        heading = self._html_search_regex(
            r'<div class="section">.*?<h3>([^>]+?)</h3>',
            page, u'title', flags=re.DOTALL)
        url_patterns = [
            r'<source src="(.*?)"',
            r'<dt>Download</dt>.*?<a href="(.+?)"',
        ]
        media_url = self._search_regex(
            url_patterns, page, u'video url', flags=re.DOTALL)

        # The heading is a file name; drop its extension for the title.
        title_root, _ext = os.path.splitext(heading)
        return {
            'id': video_id,
            'title': title_root,
            'url': media_url,
        }
diff --git a/youtube_dl/extractor/radiofrance.py b/youtube_dl/extractor/radiofrance.py
new file mode 100644
index 000000000..34652f6c1
--- /dev/null
+++ b/youtube_dl/extractor/radiofrance.py
@@ -0,0 +1,55 @@
+# coding: utf-8
+import re
+
+from .common import InfoExtractor
+
+
class RadioFranceIE(InfoExtractor):
    """Extractor for audio pages under maison.radiofrance.fr/radiovisions."""

    _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)'
    IE_NAME = u'radiofrance'

    _TEST = {
        u'url': u'http://maison.radiofrance.fr/radiovisions/one-one',
        u'file': u'one-one.ogg',
        u'md5': u'bdbb28ace95ed0e04faab32ba3160daf',
        u'info_dict': {
            u"title": u"One to one",
            u"description": u"Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
            u"uploader": u"Thomas Hercouët",
        },
    }

    def _real_extract(self, url):
        """Scrape title, description, uploader and the audio formats from the page."""
        video_id = re.match(self._VALID_URL, url).group('id')
        page = self._download_webpage(url, video_id)

        title = self._html_search_regex(r'<h1>(.*?)</h1>', page, u'title')
        # Description and uploader are optional (non-fatal lookups).
        description = self._html_search_regex(
            r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>',
            page, u'description', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="credit">&nbsp;&nbsp;&copy;&nbsp;(.*?)</div>',
            page, u'uploader', fatal=False)

        # The player's data-source attribute holds "name: 'url'" pairs.
        sources = self._html_search_regex(
            r'class="jp-jplayer[^"]*" data-source="([^"]+)">',
            page, u'audio URLs')
        formats = []
        for format_id, format_url in re.findall(
                r"([a-z0-9]+)\s*:\s*'([^']+)'", sources):
            formats.append({
                'format_id': format_id,
                'url': format_url,
                'vcodec': 'none',
            })
        # No sorting, we don't know any more about these formats

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': description,
            'uploader': uploader,
        }
diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py
index 994778e16..c2254ae8a 100644
--- a/youtube_dl/extractor/redtube.py
+++ b/youtube_dl/extractor/redtube.py
@@ -8,7 +8,9 @@ class RedTubeIE(InfoExtractor):
_TEST = {
u'url': u'http://www.redtube.com/66418',
u'file': u'66418.mp4',
- u'md5': u'7b8c22b5e7098a3e1c09709df1126d2d',
+ # md5 varies from time to time, as in
+ # https://travis-ci.org/rg3/youtube-dl/jobs/14052463#L295
+ #u'md5': u'7b8c22b5e7098a3e1c09709df1126d2d',
u'info_dict': {
u"title": u"Sucked on a toilet",
u"age_limit": 18,
@@ -28,7 +30,7 @@ class RedTubeIE(InfoExtractor):
r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL')
video_title = self._html_search_regex(
- r'<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
+ r'<h1 class="videoTitle[^"]*">(.+?)</h1>',
webpage, u'title')
# No self-labeling, but they describe themselves as
diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py
index 9ac7c3be8..ccf0b1546 100644
--- a/youtube_dl/extractor/rtlnow.py
+++ b/youtube_dl/extractor/rtlnow.py
@@ -7,14 +7,15 @@ from ..utils import (
ExtractorError,
)
+
class RTLnowIE(InfoExtractor):
"""Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW"""
- _VALID_URL = r'(?:http://)?(?P<url>(?P<base_url>rtl-now\.rtl\.de/|rtl2now\.rtl2\.de/|(?:www\.)?voxnow\.de/|(?:www\.)?rtlnitronow\.de/|(?:www\.)?superrtlnow\.de/|(?:www\.)?n-tvnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)'
+ _VALID_URL = r'(?:http://)?(?P<url>(?P<domain>rtl-now\.rtl\.de|rtl2now\.rtl2\.de|(?:www\.)?voxnow\.de|(?:www\.)?rtlnitronow\.de|(?:www\.)?superrtlnow\.de|(?:www\.)?n-tvnow\.de)/+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)'
_TESTS = [{
u'url': u'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
u'file': u'90419.flv',
u'info_dict': {
- u'upload_date': u'20070416',
+ u'upload_date': u'20070416',
u'title': u'Ahornallee - Folge 1 - Der Einzug',
u'description': u'Folge 1 - Der Einzug',
},
@@ -63,18 +64,6 @@ class RTLnowIE(InfoExtractor):
},
},
{
- u'url': u'http://www.rtlnitronow.de/recht-ordnung/stadtpolizei-frankfurt-gerichtsvollzieher-leipzig.php?film_id=129679&player=1&season=1',
- u'file': u'129679.flv',
- u'info_dict': {
- u'upload_date': u'20131016',
- u'title': u'Recht & Ordnung - Stadtpolizei Frankfurt/ Gerichtsvollzieher...',
- u'description': u'Stadtpolizei Frankfurt/ Gerichtsvollzieher Leipzig',
- },
- u'params': {
- u'skip_download': True,
- },
- },
- {
u'url': u'http://www.n-tvnow.de/top-gear/episode-1-2013-01-01-00-00-00.php?film_id=124903&player=1&season=10',
u'file': u'124903.flv',
u'info_dict': {
@@ -93,7 +82,7 @@ class RTLnowIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
webpage_url = u'http://' + mobj.group('url')
- video_page_url = u'http://' + mobj.group('base_url')
+ video_page_url = u'http://' + mobj.group('domain') + u'/'
video_id = mobj.group(u'video_id')
webpage = self._download_webpage(webpage_url, video_id)
diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py
index a18034fe2..e3e9bc07f 100644
--- a/youtube_dl/extractor/rutube.py
+++ b/youtube_dl/extractor/rutube.py
@@ -11,7 +11,7 @@ from ..utils import (
class RutubeIE(InfoExtractor):
- _VALID_URL = r'https?://rutube.ru/video/(?P<long_id>\w+)'
+ _VALID_URL = r'https?://rutube\.ru/video/(?P<long_id>\w+)'
_TEST = {
u'url': u'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',
diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py
index 14b1c656c..74a87fe56 100644
--- a/youtube_dl/extractor/sina.py
+++ b/youtube_dl/extractor/sina.py
@@ -1,7 +1,6 @@
# coding: utf-8
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@@ -35,12 +34,11 @@ class SinaIE(InfoExtractor):
def _extract_video(self, video_id):
data = compat_urllib_parse.urlencode({'vid': video_id})
- url_page = self._download_webpage('http://v.iask.com/v_play.php?%s' % data,
+ url_doc = self._download_xml('http://v.iask.com/v_play.php?%s' % data,
video_id, u'Downloading video url')
image_page = self._download_webpage(
'http://interface.video.sina.com.cn/interface/common/getVideoImage.php?%s' % data,
video_id, u'Downloading thumbnail info')
- url_doc = xml.etree.ElementTree.fromstring(url_page.encode('utf-8'))
return {'id': video_id,
'url': url_doc.find('./durl/url').text,
diff --git a/youtube_dl/extractor/slashdot.py b/youtube_dl/extractor/slashdot.py
index 2cba53076..d68646d24 100644
--- a/youtube_dl/extractor/slashdot.py
+++ b/youtube_dl/extractor/slashdot.py
@@ -4,9 +4,10 @@ from .common import InfoExtractor
class SlashdotIE(InfoExtractor):
- _VALID_URL = r'https?://tv.slashdot.org/video/\?embed=(?P<id>.*?)(&|$)'
+ _VALID_URL = r'https?://tv\.slashdot\.org/video/\?embed=(?P<id>.*?)(&|$)'
_TEST = {
+ u'add_ie': ['Ooyala'],
u'url': u'http://tv.slashdot.org/video/?embed=JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz',
u'file': u'JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz.mp4',
u'md5': u'd2222e7a4a4c1541b3e0cf732fb26735',
diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py
new file mode 100644
index 000000000..a589a893b
--- /dev/null
+++ b/youtube_dl/extractor/smotri.py
@@ -0,0 +1,356 @@
+# encoding: utf-8
+
+import re
+import json
+import hashlib
+import uuid
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse,
+ compat_urllib_request,
+ ExtractorError,
+)
+
+
class SmotriIE(InfoExtractor):
    """Extractor for single videos on smotri.com.

    Handles password-protected videos (``videopassword`` parameter) and the
    age confirmation page shown for adult content.
    """

    IE_DESC = u'Smotri.com'
    IE_NAME = u'smotri'
    _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/video/view/\?id=(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4}))'

    _TESTS = [
        # real video id 2610366
        {
            u'url': u'http://smotri.com/video/view/?id=v261036632ab',
            u'file': u'v261036632ab.mp4',
            u'md5': u'2a7b08249e6f5636557579c368040eb9',
            u'info_dict': {
                u'title': u'катастрофа с камер видеонаблюдения',
                u'uploader': u'rbc2008',
                u'uploader_id': u'rbc08',
                u'upload_date': u'20131118',
                u'description': u'катастрофа с камер видеонаблюдения, видео катастрофа с камер видеонаблюдения',
                u'thumbnail': u'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg',
            },
        },
        # real video id 57591
        {
            u'url': u'http://smotri.com/video/view/?id=v57591cb20',
            u'file': u'v57591cb20.flv',
            u'md5': u'830266dfc21f077eac5afd1883091bcd',
            u'info_dict': {
                u'title': u'test',
                u'uploader': u'Support Photofile@photofile',
                u'uploader_id': u'support-photofile',
                u'upload_date': u'20070704',
                u'description': u'test, видео test',
                u'thumbnail': u'http://frame4.loadup.ru/03/ed/57591.2.3.jpg',
            },
        },
        # video-password
        {
            u'url': u'http://smotri.com/video/view/?id=v1390466a13c',
            u'file': u'v1390466a13c.mp4',
            u'md5': u'f6331cef33cad65a0815ee482a54440b',
            u'info_dict': {
                u'title': u'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1',
                u'uploader': u'timoxa40',
                u'uploader_id': u'timoxa40',
                u'upload_date': u'20100404',
                u'thumbnail': u'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg',
                u'description': u'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1, видео TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1',
            },
            u'params': {
                u'videopassword': u'qwerty',
            },
        },
        # age limit + video-password
        {
            u'url': u'http://smotri.com/video/view/?id=v15408898bcf',
            u'file': u'v15408898bcf.flv',
            u'md5': u'91e909c9f0521adf5ee86fbe073aad70',
            u'info_dict': {
                u'title': u'этот ролик не покажут по ТВ',
                u'uploader': u'zzxxx',
                u'uploader_id': u'ueggb',
                u'upload_date': u'20101001',
                u'thumbnail': u'http://frame3.loadup.ru/75/75/1540889.1.3.jpg',
                u'age_limit': 18,
                u'description': u'этот ролик не покажут по ТВ, видео этот ролик не покажут по ТВ',
            },
            u'params': {
                u'videopassword': u'333'
            }
        }
    ]

    # Values of the "status" field in the video JSON response.
    _SUCCESS = 0
    _PASSWORD_NOT_VERIFIED = 1
    _PASSWORD_DETECTED = 2
    _VIDEO_NOT_FOUND = 3

    def _search_meta(self, name, html, display_name=None):
        """Return the content of ``<meta itemprop=name>`` in *html*, or None.

        The lookup is non-fatal, so callers must cope with a None result.
        *display_name* is the label used in messages; it defaults to *name*.
        """
        if display_name is None:
            display_name = name
        # The original had a second, unreachable return after this one; it
        # has been dropped.
        return self._html_search_regex(
            r'<meta itemprop="%s" content="([^"]+)" />' % re.escape(name),
            html, display_name, fatal=False)

    def _real_extract(self, url):
        """Return the info dict for the video at *url*.

        The media URL comes from the site's JSON endpoint; the remaining
        metadata is scraped from the video page.  Optional metadata that is
        missing from the page yields None for that field.

        Raises ExtractorError when the video does not exist, when a password
        is required but missing or wrong, or on an unexpected status value.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('videoid')
        real_video_id = mobj.group('realvideoid')

        # Download video JSON data
        video_json_url = 'http://smotri.com/vt.php?id=%s' % real_video_id
        video_json_page = self._download_webpage(video_json_url, video_id, u'Downloading video JSON')
        video_json = json.loads(video_json_page)

        status = video_json['status']
        if status == self._VIDEO_NOT_FOUND:
            raise ExtractorError(u'Video %s does not exist' % video_id, expected=True)
        elif status == self._PASSWORD_DETECTED:
            # The video is protected by a password, retry with
            # video-password set
            video_password = self._downloader.params.get('videopassword', None)
            if not video_password:
                raise ExtractorError(u'This video is protected by a password, use the --video-password option', expected=True)
            video_json_url += '&md5pass=%s' % hashlib.md5(video_password.encode('utf-8')).hexdigest()
            video_json_page = self._download_webpage(video_json_url, video_id, u'Downloading video JSON (video-password set)')
            video_json = json.loads(video_json_page)
            status = video_json['status']
            if status == self._PASSWORD_NOT_VERIFIED:
                raise ExtractorError(u'Video password is invalid', expected=True)

        if status != self._SUCCESS:
            raise ExtractorError(u'Unexpected status value %s' % status)

        # Extract the URL of the video
        video_url = video_json['file_data']

        # Video JSON does not provide enough meta data
        # We will extract some from the video web page instead
        video_page_url = 'http://' + mobj.group('url')
        video_page = self._download_webpage(video_page_url, video_id, u'Downloading video page')

        # Adult content
        if re.search(u'EroConfirmText">', video_page) is not None:
            self.report_age_confirmation()
            confirm_string = self._html_search_regex(
                r'<a href="/video/view/\?id=%s&confirm=([^"]+)" title="[^"]+">' % video_id,
                video_page, u'confirm string')
            confirm_url = video_page_url + '&confirm=%s' % confirm_string
            video_page = self._download_webpage(confirm_url, video_id, u'Downloading video page (age confirmed)')
            adult_content = True
        else:
            adult_content = False

        # Extract the rest of meta data
        video_title = self._search_meta(u'name', video_page, u'title')
        if not video_title:
            # Fall back to the last path component of the media URL.
            video_title = video_url.rsplit('/', 1)[-1]

        video_description = self._search_meta(u'description', video_page)
        # _search_meta is non-fatal: only trim the boilerplate when a
        # description was actually found.
        if video_description:
            END_TEXT = u' на сайте Smotri.com'
            if video_description.endswith(END_TEXT):
                video_description = video_description[:-len(END_TEXT)]
            START_TEXT = u'Смотреть онлайн ролик '
            if video_description.startswith(START_TEXT):
                video_description = video_description[len(START_TEXT):]
        video_thumbnail = self._search_meta(u'thumbnail', video_page)

        upload_date_str = self._search_meta(u'uploadDate', video_page, u'upload date')
        # re.search() rejects None, so skip the match when the meta tag is
        # missing.
        upload_date_m = (
            re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str)
            if upload_date_str else None
        )
        video_upload_date = (
            (
                upload_date_m.group('year') +
                upload_date_m.group('month') +
                upload_date_m.group('day')
            )
            if upload_date_m else None
        )

        duration_str = self._search_meta(u'duration', video_page)
        duration_m = (
            re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str)
            if duration_str else None
        )
        video_duration = (
            (
                (int(duration_m.group('hours')) * 60 * 60) +
                (int(duration_m.group('minutes')) * 60) +
                int(duration_m.group('seconds'))
            )
            if duration_m else None
        )

        video_uploader = self._html_search_regex(
            u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>',
            video_page, u'uploader', fatal=False, flags=re.MULTILINE|re.DOTALL)

        video_uploader_id = self._html_search_regex(
            u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info\\(.*?\'([^\']+)\'\\);">',
            video_page, u'uploader id', fatal=False, flags=re.MULTILINE|re.DOTALL)

        view_count_str = self._html_search_regex(
            u'Общее количество просмотров.*?<span class="Number">(\\d+)</span>',
            video_page, u'view count', fatal=False, flags=re.MULTILINE|re.DOTALL)
        # The pattern captures digits only, so int() is safe; report a
        # number rather than the raw string.
        video_view_count = None if view_count_str is None else int(view_count_str)

        return {
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'duration': video_duration,
            'view_count': video_view_count,
            'age_limit': 18 if adult_content else 0,
            'video_page_url': video_page_url
        }
+
+
+class SmotriCommunityIE(InfoExtractor):
+    """Playlist extractor for the videos of a Smotri.com community.
+
+    The community's RSS export supplies both the video links and the
+    description text from which the playlist title is taken.
+    """
+
+    IE_DESC = u'Smotri.com community videos'
+    IE_NAME = u'smotri:community'
+    _VALID_URL = r'^https?://(?:www\.)?smotri\.com/community/video/(?P<communityid>[0-9A-Za-z_\'-]+)'
+
+    def _real_extract(self, url):
+        """Return a playlist result with one entry per RSS item link."""
+        community_id = re.match(self._VALID_URL, url).group('communityid')
+
+        rss_url = 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id
+        feed = self._download_xml(rss_url, community_id, u'Downloading community RSS')
+
+        # Each <link> becomes a url_result tagged with the 'Smotri' extractor key.
+        entries = []
+        for link_node in feed.findall('./channel/item/link'):
+            entries.append(self.url_result(link_node.text, 'Smotri'))
+
+        # The pattern below expects the Russian phrase for
+        # 'Community videos "<title>"' and captures the quoted title.
+        feed_description = feed.find('./channel/description').text
+        community_title = self._html_search_regex(
+            u'^Видео сообщества "([^"]+)"$', feed_description, u'community title')
+
+        return self.playlist_result(entries, community_id, community_title)
+
+
+class SmotriUserIE(InfoExtractor):
+    """Playlist extractor for all videos uploaded by one Smotri.com user.
+
+    The user's RSS export supplies both the video links and the
+    description text from which the nickname (playlist title) is taken.
+    """
+
+    IE_DESC = u'Smotri.com user videos'
+    IE_NAME = u'smotri:user'
+    _VALID_URL = r'^https?://(?:www\.)?smotri\.com/user/(?P<userid>[0-9A-Za-z_\'-]+)'
+
+    def _real_extract(self, url):
+        """Return a playlist result with one entry per RSS item link."""
+        user_id = re.match(self._VALID_URL, url).group('userid')
+
+        rss_url = 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id
+        feed = self._download_xml(rss_url, user_id, u'Downloading user RSS')
+
+        # Each <link> becomes a url_result tagged with the 'Smotri' extractor key.
+        entries = []
+        for link_node in feed.findall('./channel/item/link'):
+            entries.append(self.url_result(link_node.text, 'Smotri'))
+
+        # The pattern below expects the Russian phrase for
+        # 'Videos of director <nickname>' and captures the nickname.
+        feed_description = feed.find('./channel/description').text
+        user_nickname = self._html_search_regex(
+            u'^Видео режиссера (.*)$', feed_description,
+            u'user nickname')
+
+        return self.playlist_result(entries, user_id, user_nickname)
+
+
+class SmotriBroadcastIE(InfoExtractor):
+    """Extractor for live broadcasts hosted on Smotri.com.
+
+    Adult broadcasts require account credentials, and password-protected
+    broadcasts require the ``videopassword`` downloader parameter.
+    """
+
+    IE_DESC = u'Smotri.com broadcasts'
+    IE_NAME = u'smotri:broadcast'
+    _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<broadcastid>[^/]+))/?.*'
+
+    def _real_extract(self, url):
+        """Build the RTMP info dict for the broadcast addressed by *url*.
+
+        Raises ExtractorError when the broadcast does not exist, is
+        offline, needs credentials or a password that was not supplied
+        (or was rejected), or when the broadcast JSON lacks expected keys.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        broadcast_id = mobj.group('broadcastid')
+
+        broadcast_url = 'http://' + mobj.group('url')
+        broadcast_page = self._download_webpage(broadcast_url, broadcast_id, u'Downloading broadcast page')
+
+        # The id comes straight from the URL, so escape it before embedding
+        # it in a pattern: regex metacharacters in it must match literally.
+        if re.search(u'>Режиссер с логином <br/>"%s"<br/> <span>не существует<' % re.escape(broadcast_id), broadcast_page) is not None:
+            raise ExtractorError(u'Broadcast %s does not exist' % broadcast_id, expected=True)
+
+        # Adult content
+        if re.search(u'EroConfirmText">', broadcast_page) is not None:
+
+            (username, password) = self._get_login_info()
+            if username is None:
+                raise ExtractorError(u'Erotic broadcasts allowed only for registered users, '
+                    u'use --username and --password options to provide account credentials.', expected=True)
+
+            # Log in
+            login_form_strs = {
+                u'login-hint53': '1',
+                u'confirm_erotic': '1',
+                u'login': username,
+                u'password': password,
+            }
+            # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
+            # chokes on unicode
+            login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
+            login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8')
+            login_url = broadcast_url + '/?no_redirect=1'
+            request = compat_urllib_request.Request(login_url, login_data)
+            request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+            broadcast_page = self._download_webpage(
+                request, broadcast_id, note=u'Logging in and confirming age')
+
+            if re.search(u'>Неверный логин или пароль<', broadcast_page) is not None:
+                raise ExtractorError(u'Unable to log in: bad username or password', expected=True)
+
+            adult_content = True
+        else:
+            adult_content = False
+
+        # Backslashes are doubled so the literal contains no invalid escape
+        # sequences; the resulting pattern text is unchanged.
+        ticket = self._html_search_regex(
+            u'window\\.broadcast_control\\.addFlashVar\\(\'file\', \'([^\']+)\'\\);',
+            broadcast_page, u'broadcast ticket')
+
+        url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket
+
+        # The broadcast password is sent as the hex MD5 digest of its UTF-8 bytes.
+        broadcast_password = self._downloader.params.get('videopassword', None)
+        if broadcast_password:
+            url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest()
+
+        broadcast_json_page = self._download_webpage(url, broadcast_id, u'Downloading broadcast JSON')
+
+        # Bound before the try block: the KeyError handler reads this flag,
+        # and the '_pass_protected' lookup itself may be what raises.
+        protected_broadcast = False
+        try:
+            broadcast_json = json.loads(broadcast_json_page)
+
+            protected_broadcast = broadcast_json['_pass_protected'] == 1
+            if protected_broadcast and not broadcast_password:
+                raise ExtractorError(u'This broadcast is protected by a password, use the --video-password option', expected=True)
+
+            broadcast_offline = broadcast_json['is_play'] == 0
+            if broadcast_offline:
+                raise ExtractorError(u'Broadcast %s is offline' % broadcast_id, expected=True)
+
+            rtmp_url = broadcast_json['_server']
+            if not rtmp_url.startswith('rtmp://'):
+                raise ExtractorError(u'Unexpected broadcast rtmp URL')
+
+            broadcast_playpath = broadcast_json['_streamName']
+            broadcast_thumbnail = broadcast_json['_imgURL']
+            broadcast_title = broadcast_json['title']
+            broadcast_description = broadcast_json['description']
+            broadcaster_nick = broadcast_json['nick']
+            broadcaster_login = broadcast_json['login']
+            rtmp_conn = 'S:%s' % uuid.uuid4().hex
+        except KeyError:
+            # A missing key on a protected broadcast is reported as a
+            # rejected password; otherwise the JSON is simply unexpected.
+            if protected_broadcast:
+                raise ExtractorError(u'Bad broadcast password', expected=True)
+            raise ExtractorError(u'Unexpected broadcast JSON')
+
+        return {
+            'id': broadcast_id,
+            'url': rtmp_url,
+            'title': broadcast_title,
+            'thumbnail': broadcast_thumbnail,
+            'description': broadcast_description,
+            'uploader': broadcaster_nick,
+            'uploader_id': broadcaster_login,
+            'age_limit': 18 if adult_content else 0,
+            'ext': 'flv',
+            'play_path': broadcast_playpath,
+            'rtmp_live': True,
+            'rtmp_conn': rtmp_conn
+        }
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index 29cd5617c..e22ff9c38 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -1,3 +1,4 @@
+# encoding: utf-8
import json
import re
import itertools
@@ -23,25 +24,72 @@ class SoundcloudIE(InfoExtractor):
"""
_VALID_URL = r'''^(?:https?://)?
- (?:(?:(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)/?(?:[?].*)?$)
+ (?:(?:(?:www\.|m\.)?soundcloud\.com/
+ (?P<uploader>[\w\d-]+)/
+ (?!sets/)(?P<title>[\w\d-]+)/?
+ (?P<token>[^?]+?)?(?:[?].*)?$)
|(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+))
- |(?P<widget>w.soundcloud.com/player/?.*?url=.*)
+ |(?P<widget>w\.soundcloud\.com/player/?.*?url=.*)
)
'''
IE_NAME = u'soundcloud'
- _TEST = {
- u'url': u'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
- u'file': u'62986583.mp3',
- u'md5': u'ebef0a451b909710ed1d7787dddbf0d7',
- u'info_dict': {
- u"upload_date": u"20121011",
- u"description": u"No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd",
- u"uploader": u"E.T. ExTerrestrial Music",
- u"title": u"Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1"
- }
- }
+ _TESTS = [
+ {
+ u'url': u'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
+ u'file': u'62986583.mp3',
+ u'md5': u'ebef0a451b909710ed1d7787dddbf0d7',
+ u'info_dict': {
+ u"upload_date": u"20121011",
+ u"description": u"No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd",
+ u"uploader": u"E.T. ExTerrestrial Music",
+ u"title": u"Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1"
+ }
+ },
+ # not streamable song
+ {
+ u'url': u'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep',
+ u'info_dict': {
+ u'id': u'47127627',
+ u'ext': u'mp3',
+ u'title': u'Goldrushed',
+ u'uploader': u'The Royal Concept',
+ u'upload_date': u'20120521',
+ },
+ u'params': {
+ # rtmp
+ u'skip_download': True,
+ },
+ },
+ # private link
+ {
+ u'url': u'https://soundcloud.com/jaimemf/youtube-dl-test-video-a-y-baw/s-8Pjrp',
+ u'md5': u'aa0dd32bfea9b0c5ef4f02aacd080604',
+ u'info_dict': {
+ u'id': u'123998367',
+ u'ext': u'mp3',
+ u'title': u'Youtube - Dl Test Video \'\' Ä↭',
+ u'uploader': u'jaimeMF',
+ u'description': u'test chars: \"\'/\\ä↭',
+ u'upload_date': u'20131209',
+ },
+ },
+ # downloadable song
+ {
+ u'url': u'https://soundcloud.com/simgretina/just-your-problem-baby-1',
+ u'md5': u'56a8b69568acaa967b4c49f9d1d52d19',
+ u'info_dict': {
+ u'id': u'105614606',
+ u'ext': u'wav',
+ u'title': u'Just Your Problem Baby (Acapella)',
+ u'description': u'Vocals',
+ u'uploader': u'Sim Gretina',
+ u'upload_date': u'20130815',
+ },
+ },
+ ]
_CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'
+ _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'
@classmethod
def suitable(cls, url):
@@ -55,25 +103,85 @@ class SoundcloudIE(InfoExtractor):
def _resolv_url(cls, url):
return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID
- def _extract_info_dict(self, info, full_title=None, quiet=False):
- video_id = info['id']
- name = full_title or video_id
- if quiet == False:
+ def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None):
+ track_id = compat_str(info['id'])
+ name = full_title or track_id
+ if quiet:
self.report_extraction(name)
thumbnail = info['artwork_url']
if thumbnail is not None:
thumbnail = thumbnail.replace('-large', '-t500x500')
- return {
- 'id': info['id'],
- 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID,
+ ext = u'mp3'
+ result = {
+ 'id': track_id,
'uploader': info['user']['username'],
'upload_date': unified_strdate(info['created_at']),
- 'title': info['title'],
- 'ext': u'mp3',
+ 'title': info['title'],
'description': info['description'],
'thumbnail': thumbnail,
}
+ if info.get('downloadable', False):
+ # We can build a direct link to the song
+ format_url = (
+ u'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(
+ track_id, self._CLIENT_ID))
+ result['formats'] = [{
+ 'format_id': 'download',
+ 'ext': info.get('original_format', u'mp3'),
+ 'url': format_url,
+ 'vcodec': 'none',
+ }]
+ else:
+ # We have to retrieve the url
+ streams_url = ('http://api.soundcloud.com/i1/tracks/{0}/streams?'
+ 'client_id={1}&secret_token={2}'.format(track_id, self._IPHONE_CLIENT_ID, secret_token))
+ stream_json = self._download_webpage(
+ streams_url,
+ track_id, u'Downloading track url')
+
+ formats = []
+ format_dict = json.loads(stream_json)
+ for key, stream_url in format_dict.items():
+ if key.startswith(u'http'):
+ formats.append({
+ 'format_id': key,
+ 'ext': ext,
+ 'url': stream_url,
+ 'vcodec': 'none',
+ })
+ elif key.startswith(u'rtmp'):
+ # The url doesn't have an rtmp app, we have to extract the playpath
+ url, path = stream_url.split('mp3:', 1)
+ formats.append({
+ 'format_id': key,
+ 'url': url,
+ 'play_path': 'mp3:' + path,
+ 'ext': ext,
+ 'vcodec': 'none',
+ })
+
+ if not formats:
+ # We fall back to the stream_url in the original info; this
+ # cannot always be used, as it sometimes gives an HTTP 404 error
+ formats.append({
+ 'format_id': u'fallback',
+ 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID,
+ 'ext': ext,
+ 'vcodec': 'none',
+ })
+
+ def format_pref(f):
+ if f['format_id'].startswith('http'):
+ return 2
+ if f['format_id'].startswith('rtmp'):
+ return 1
+ return 0
+
+ formats.sort(key=format_pref)
+ result['formats'] = formats
+
+ return result
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
@@ -81,6 +189,7 @@ class SoundcloudIE(InfoExtractor):
raise ExtractorError(u'Invalid URL: %s' % url)
track_id = mobj.group('track_id')
+ token = None
if track_id is not None:
info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID
full_title = track_id
@@ -89,87 +198,28 @@ class SoundcloudIE(InfoExtractor):
return self.url_result(query['url'][0], ie='Soundcloud')
else:
# extract uploader (which is in the url)
- uploader = mobj.group(1)
+ uploader = mobj.group('uploader')
# extract simple title (uploader + slug of song title)
- slug_title = mobj.group(2)
- full_title = '%s/%s' % (uploader, slug_title)
+ slug_title = mobj.group('title')
+ token = mobj.group('token')
+ full_title = resolve_title = '%s/%s' % (uploader, slug_title)
+ if token:
+ resolve_title += '/%s' % token
self.report_resolve(full_title)
- url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
+ url = 'http://soundcloud.com/%s' % resolve_title
info_json_url = self._resolv_url(url)
info_json = self._download_webpage(info_json_url, full_title, u'Downloading info JSON')
info = json.loads(info_json)
- return self._extract_info_dict(info, full_title)
+ return self._extract_info_dict(info, full_title, secret_token=token)
class SoundcloudSetIE(SoundcloudIE):
_VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$'
IE_NAME = u'soundcloud:set'
- _TEST = {
- u"url":"https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep",
- u"playlist": [
- {
- u"file":"30510138.mp3",
- u"md5":"f9136bf103901728f29e419d2c70f55d",
- u"info_dict": {
- u"upload_date": u"20111213",
- u"description": u"The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com",
- u"uploader": u"The Royal Concept",
- u"title": u"D-D-Dance"
- }
- },
- {
- u"file":"47127625.mp3",
- u"md5":"09b6758a018470570f8fd423c9453dd8",
- u"info_dict": {
- u"upload_date": u"20120521",
- u"description": u"The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com",
- u"uploader": u"The Royal Concept",
- u"title": u"The Royal Concept - Gimme Twice"
- }
- },
- {
- u"file":"47127627.mp3",
- u"md5":"154abd4e418cea19c3b901f1e1306d9c",
- u"info_dict": {
- u"upload_date": u"20120521",
- u"uploader": u"The Royal Concept",
- u"title": u"Goldrushed"
- }
- },
- {
- u"file":"47127629.mp3",
- u"md5":"2f5471edc79ad3f33a683153e96a79c1",
- u"info_dict": {
- u"upload_date": u"20120521",
- u"description": u"The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com",
- u"uploader": u"The Royal Concept",
- u"title": u"In the End"
- }
- },
- {
- u"file":"47127631.mp3",
- u"md5":"f9ba87aa940af7213f98949254f1c6e2",
- u"info_dict": {
- u"upload_date": u"20120521",
- u"description": u"The Royal Concept from Stockholm\r\nFilip / David / Povel / Magnus\r\nwww.theroyalconceptband.com",
- u"uploader": u"The Royal Concept",
- u"title": u"Knocked Up"
- }
- },
- {
- u"file":"75206121.mp3",
- u"md5":"f9d1fe9406717e302980c30de4af9353",
- u"info_dict": {
- u"upload_date": u"20130116",
- u"description": u"The unreleased track World on Fire premiered on the CW's hit show Arrow (8pm/7pm central). \r\nAs a gift to our fans we would like to offer you a free download of the track! ",
- u"uploader": u"The Royal Concept",
- u"title": u"World On Fire"
- }
- }
- ]
- }
+ # it's in tests/test_playlists.py
+ _TESTS = []
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -188,7 +238,6 @@ class SoundcloudSetIE(SoundcloudIE):
resolv_url = self._resolv_url(url)
info_json = self._download_webpage(resolv_url, full_title)
- videos = []
info = json.loads(info_json)
if 'errors' in info:
for err in info['errors']:
@@ -204,11 +253,11 @@ class SoundcloudSetIE(SoundcloudIE):
class SoundcloudUserIE(SoundcloudIE):
- _VALID_URL = r'https?://(www\.)?soundcloud.com/(?P<user>[^/]+)(/?(tracks/)?)?(\?.*)?$'
+ _VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)(/?(tracks/)?)?(\?.*)?$'
IE_NAME = u'soundcloud:user'
# it's in tests/test_playlists.py
- _TEST = None
+ _TESTS = []
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/southparkstudios.py b/youtube_dl/extractor/southparkstudios.py
index b1e96b679..fd90cc5dd 100644
--- a/youtube_dl/extractor/southparkstudios.py
+++ b/youtube_dl/extractor/southparkstudios.py
@@ -1,38 +1,42 @@
import re
-from .mtv import MTVIE, _media_xml_tag
+from .mtv import MTVServicesInfoExtractor
-class SouthParkStudiosIE(MTVIE):
+class SouthParkStudiosIE(MTVServicesInfoExtractor):
IE_NAME = u'southparkstudios.com'
- _VALID_URL = r'https?://www\.southparkstudios\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$)'
+ _VALID_URL = r'(https?://)?(www\.)?(?P<url>southparkstudios\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
_FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss'
- _TEST = {
+ _TESTS = [{
u'url': u'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured',
u'file': u'a7bff6c2-ed00-11e0-aca6-0026b9414f30.mp4',
u'info_dict': {
u'title': u'Bat Daded',
u'description': u'Randy disqualifies South Park by getting into a fight with Bat Dad.',
},
- }
-
- # Overwrite MTVIE properties we don't want
- _TESTS = []
-
- def _get_thumbnail_url(self, uri, itemdoc):
- search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
- thumb_node = itemdoc.find(search_path)
- if thumb_node is None:
- return None
- else:
- return thumb_node.attrib['url']
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
+ url = u'http://www.' + mobj.group(u'url')
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
mgid = self._search_regex(r'swfobject.embedSWF\(".*?(mgid:.*?)"',
webpage, u'mgid')
return self._get_videos_info(mgid)
+
+class SouthparkDeIE(SouthParkStudiosIE):
+    """southpark.de variant of the South Park Studios extractor.
+
+    Only the extractor name, URL pattern, feed URL and test data are
+    overridden here; no methods are redefined in this class.
+    """
+
+    IE_NAME = u'southpark.de'
+    _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/'
+    # Accepts /clips/ and /alle-episoden/ pages; scheme and "www." are optional.
+    _VALID_URL = r'(https?://)?(www\.)?(?P<url>southpark\.de/(clips|alle-episoden)/(?P<id>.+?)(\?|#|$))'
+
+    _TESTS = [
+        {
+            u'url': u'http://www.southpark.de/clips/uygssh/the-government-wont-respect-my-privacy#tab=featured',
+            u'file': u'85487c96-b3b9-4e39-9127-ad88583d9bf2.mp4',
+            u'info_dict': {
+                u'title': u'The Government Won\'t Respect My Privacy',
+                u'description': u'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.',
+            },
+        },
+    ]
diff --git a/youtube_dl/extractor/space.py b/youtube_dl/extractor/space.py
new file mode 100644
index 000000000..11455e0fa
--- /dev/null
+++ b/youtube_dl/extractor/space.py
@@ -0,0 +1,35 @@
+import re
+
+from .common import InfoExtractor
+from .brightcove import BrightcoveIE
+from ..utils import RegexNotFoundError, ExtractorError
+
+
+class SpaceIE(InfoExtractor):
+    """Extractor for space.com video pages.
+
+    It locates the Brightcove URL embedded in the page and delegates the
+    actual extraction to BrightcoveIE via a url_result.
+    """
+
+    _VALID_URL = r'https?://www\.space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html'
+    _TEST = {
+        u'add_ie': ['Brightcove'],
+        u'url': u'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html',
+        u'info_dict': {
+            u'id': u'2780937028001',
+            u'ext': u'mp4',
+            u'title': u'Huge Martian Landforms\' Detail Revealed By European Probe | Video',
+            u'description': u'md5:db81cf7f3122f95ed234b631a6ea1e61',
+            u'uploader': u'TechMedia Networks',
+        },
+    }
+
+    def _real_extract(self, url):
+        """Return a url_result pointing at the page's Brightcove video.
+
+        Raises ExtractorError (expected) when no Brightcove URL is found.
+        """
+        # The title slug from the URL doubles as the id shown while downloading.
+        slug = re.match(self._VALID_URL, url).group('title')
+        page = self._download_webpage(url, slug)
+
+        try:
+            # Some videos require the playerKey field, which isn't defined
+            # in the BrightcoveExperience object, so try og:video first.
+            bc_url = self._og_search_video_url(page)
+        except RegexNotFoundError:
+            # Other videos work fine with the info from the object.
+            bc_url = BrightcoveIE._extract_brightcove_url(page)
+
+        if bc_url is not None:
+            return self.url_result(bc_url, BrightcoveIE.ie_key())
+        raise ExtractorError(u'The webpage does not contain a video', expected=True)
diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py
index 32df0a7fb..9e2ad0d99 100644
--- a/youtube_dl/extractor/spankwire.py
+++ b/youtube_dl/extractor/spankwire.py
@@ -6,7 +6,6 @@ from ..utils import (
compat_urllib_parse_urlparse,
compat_urllib_request,
compat_urllib_parse,
- unescapeHTML,
)
from ..aes import (
aes_decrypt_text
@@ -36,11 +35,12 @@ class SpankwireIE(InfoExtractor):
webpage = self._download_webpage(req, video_id)
video_title = self._html_search_regex(r'<h1>([^<]+)', webpage, u'title')
- video_uploader = self._html_search_regex(r'by:\s*<a [^>]*>(.+?)</a>', webpage, u'uploader', fatal=False)
- thumbnail = self._html_search_regex(r'flashvars\.image_url = "([^"]+)', webpage, u'thumbnail', fatal=False)
- description = self._html_search_regex(r'>\s*Description:</div>\s*<[^>]*>([^<]+)', webpage, u'description', fatal=False)
- if len(description) == 0:
- description = None
+ video_uploader = self._html_search_regex(
+ r'by:\s*<a [^>]*>(.+?)</a>', webpage, u'uploader', fatal=False)
+ thumbnail = self._html_search_regex(
+ r'flashvars\.image_url = "([^"]+)', webpage, u'thumbnail', fatal=False)
+ description = self._html_search_regex(
+ r'<div\s+id="descriptionContent">([^<]+)<', webpage, u'description', fatal=False)
video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'flashvars\.quality_[0-9]{3}p = "([^"]+)', webpage)))
if webpage.find('flashvars\.encrypted = "true"') != -1:
@@ -49,10 +49,10 @@ class SpankwireIE(InfoExtractor):
formats = []
for video_url in video_urls:
- path = compat_urllib_parse_urlparse( video_url ).path
- extension = os.path.splitext( path )[1][1:]
+ path = compat_urllib_parse_urlparse(video_url).path
+ extension = os.path.splitext(path)[1][1:]
format = path.split('/')[4].split('_')[:2]
- format = "-".join( format )
+ format = "-".join(format)
formats.append({
'url': video_url,
'ext': extension,
diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py
index 13c86401c..695520524 100644
--- a/youtube_dl/extractor/spiegel.py
+++ b/youtube_dl/extractor/spiegel.py
@@ -1,19 +1,26 @@
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
class SpiegelIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
- _TEST = {
+ _TESTS = [{
u'url': u'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html',
u'file': u'1259285.mp4',
u'md5': u'2c2754212136f35fb4b19767d242f66e',
u'info_dict': {
u"title": u"Vulkanausbruch in Ecuador: Der \"Feuerschlund\" ist wieder aktiv"
}
- }
+ },
+ {
+ u'url': u'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html',
+ u'file': u'1309159.mp4',
+ u'md5': u'f2cdf638d7aa47654e251e1aee360af1',
+ u'info_dict': {
+ u'title': u'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers'
+ }
+ }]
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
@@ -21,25 +28,36 @@ class SpiegelIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
- video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
- webpage, u'title')
+ video_title = self._html_search_regex(
+ r'<div class="module-title">(.*?)</div>', webpage, u'title')
xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
- xml_code = self._download_webpage(xml_url, video_id,
- note=u'Downloading XML', errnote=u'Failed to download XML')
-
- idoc = xml.etree.ElementTree.fromstring(xml_code)
- last_type = idoc[-1]
- filename = last_type.findall('./filename')[0].text
- duration = float(last_type.findall('./duration')[0].text)
+ idoc = self._download_xml(
+ xml_url, video_id,
+ note=u'Downloading XML', errnote=u'Failed to download XML')
+
+ formats = [
+ {
+ 'format_id': n.tag.rpartition('type')[2],
+ 'url': u'http://video2.spiegel.de/flash/' + n.find('./filename').text,
+ 'width': int(n.find('./width').text),
+ 'height': int(n.find('./height').text),
+ 'abr': int(n.find('./audiobitrate').text),
+ 'vbr': int(n.find('./videobitrate').text),
+ 'vcodec': n.find('./codec').text,
+ 'acodec': 'MP4A',
+ }
+ for n in list(idoc)
+ # Blacklist type 6, it's extremely LQ and not available on the same server
+ if n.tag.startswith('type') and n.tag != 'type6'
+ ]
+ formats.sort(key=lambda f: f['vbr'])
+ duration = float(idoc[0].findall('./duration')[0].text)
- video_url = 'http://video2.spiegel.de/flash/' + filename
- video_ext = filename.rpartition('.')[2]
info = {
'id': video_id,
- 'url': video_url,
- 'ext': video_ext,
'title': video_title,
'duration': duration,
+ 'formats': formats,
}
- return [info]
+ return info
diff --git a/youtube_dl/extractor/stanfordoc.py b/youtube_dl/extractor/stanfordoc.py
index b27838bf9..44c52c718 100644
--- a/youtube_dl/extractor/stanfordoc.py
+++ b/youtube_dl/extractor/stanfordoc.py
@@ -1,14 +1,7 @@
import re
-import socket
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
- compat_http_client,
- compat_str,
- compat_urllib_error,
- compat_urllib_request,
-
ExtractorError,
orderedSet,
unescapeHTML,
@@ -18,7 +11,7 @@ from ..utils import (
class StanfordOpenClassroomIE(InfoExtractor):
IE_NAME = u'stanfordoc'
IE_DESC = u'Stanford Open ClassRoom'
- _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
+ _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
_TEST = {
u'url': u'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100',
u'file': u'PracticalUnix_intro-environment.mp4',
@@ -45,11 +38,7 @@ class StanfordOpenClassroomIE(InfoExtractor):
self.report_extraction(info['id'])
baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
xmlUrl = baseUrl + video + '.xml'
- try:
- metaXml = compat_urllib_request.urlopen(xmlUrl).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
- mdoc = xml.etree.ElementTree.fromstring(metaXml)
+ mdoc = self._download_xml(xmlUrl, info['id'])
try:
info['title'] = mdoc.findall('./title')[0].text
info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
@@ -95,12 +84,9 @@ class StanfordOpenClassroomIE(InfoExtractor):
'upload_date': None,
}
- self.report_download_webpage(info['id'])
rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
- try:
- rootpage = compat_urllib_request.urlopen(rootURL).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
+ rootpage = self._download_webpage(rootURL, info['id'],
+ errnote=u'Unable to download course info page')
info['title'] = info['id']
diff --git a/youtube_dl/extractor/streamcloud.py b/youtube_dl/extractor/streamcloud.py
new file mode 100644
index 000000000..9faf3a5e3
--- /dev/null
+++ b/youtube_dl/extractor/streamcloud.py
@@ -0,0 +1,66 @@
+# coding: utf-8
+import re
+import time
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse,
+ compat_urllib_request,
+)
+
+
+class StreamcloudIE(InfoExtractor):
+    """Extractor for streamcloud.eu video pages.
+
+    The landing page only contains a form; its hidden/submit fields are
+    posted back to the same URL after a delay to obtain the player page.
+    """
+
+    IE_NAME = u'streamcloud.eu'
+    _VALID_URL = r'https?://streamcloud\.eu/(?P<id>[a-zA-Z0-9_-]+)/(?P<fname>[^#?]*)\.html'
+
+    _TEST = {
+        u'url': u'http://streamcloud.eu/skp9j99s4bpz/youtube-dl_test_video_____________-BaW_jenozKc.mp4.html',
+        u'file': u'skp9j99s4bpz.mp4',
+        u'md5': u'6bea4c7fa5daaacc2a946b7146286686',
+        u'info_dict': {
+            u'title': u'youtube-dl test video \'/\\ ä ↭',
+            u'duration': 9,
+        },
+        u'skip': u'Only available from the EU'
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict (id, title, url, duration, thumbnail) for *url*."""
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        orig_webpage = self._download_webpage(url, video_id)
+
+        # Collect (name, value) pairs of every hidden/submit <input>.
+        fields = re.findall(r'''(?x)<input\s+
+            type="(?:hidden|submit)"\s+
+            name="([^"]+)"\s+
+            (?:id="[^"]+"\s+)?
+            value="([^"]*)"
+            ''', orig_webpage)
+        # The request body must be bytes on Python 3; urlencode output is
+        # percent-encoded ASCII, so encoding it is also safe on Python 2.
+        post = compat_urllib_parse.urlencode(fields).encode('utf-8')
+
+        self.to_screen('%s: Waiting for timeout' % video_id)
+        time.sleep(12)
+        # Native-str header names and values (identical to the previous
+        # bytes literals on Python 2).
+        headers = {
+            'Content-Type': 'application/x-www-form-urlencoded',
+        }
+        req = compat_urllib_request.Request(url, post, headers)
+
+        webpage = self._download_webpage(
+            req, video_id, note=u'Downloading video page ...')
+        title = self._html_search_regex(
+            r'<h1[^>]*>([^<]+)<', webpage, u'title')
+        video_url = self._search_regex(
+            r'file:\s*"([^"]+)"', webpage, u'video URL')
+        # Duration and thumbnail are optional (fatal=False).
+        duration_str = self._search_regex(
+            r'duration:\s*"?([0-9]+)"?', webpage, u'duration', fatal=False)
+        duration = None if duration_str is None else int(duration_str)
+        thumbnail = self._search_regex(
+            r'image:\s*"([^"]+)"', webpage, u'thumbnail URL', fatal=False)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'duration': duration,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py
index 90de7de3a..4b4c5235d 100644
--- a/youtube_dl/extractor/subtitles.py
+++ b/youtube_dl/extractor/subtitles.py
@@ -12,9 +12,9 @@ class SubtitlesInfoExtractor(InfoExtractor):
return any([self._downloader.params.get('writesubtitles', False),
self._downloader.params.get('writeautomaticsub')])
- def _list_available_subtitles(self, video_id, webpage=None):
+ def _list_available_subtitles(self, video_id, webpage):
""" outputs the available subtitles for the video """
- sub_lang_list = self._get_available_subtitles(video_id)
+ sub_lang_list = self._get_available_subtitles(video_id, webpage)
auto_captions_list = self._get_available_automatic_caption(video_id, webpage)
sub_lang = ",".join(list(sub_lang_list.keys()))
self.to_screen(u'%s: Available subtitles for video: %s' %
@@ -23,7 +23,7 @@ class SubtitlesInfoExtractor(InfoExtractor):
self.to_screen(u'%s: Available automatic captions for video: %s' %
(video_id, auto_lang))
- def extract_subtitles(self, video_id, video_webpage=None):
+ def extract_subtitles(self, video_id, webpage):
"""
returns {sub_lang: sub} ,{} if subtitles not found or None if the
subtitles aren't requested.
@@ -32,9 +32,9 @@ class SubtitlesInfoExtractor(InfoExtractor):
return None
available_subs_list = {}
if self._downloader.params.get('writeautomaticsub', False):
- available_subs_list.update(self._get_available_automatic_caption(video_id, video_webpage))
+ available_subs_list.update(self._get_available_automatic_caption(video_id, webpage))
if self._downloader.params.get('writesubtitles', False):
- available_subs_list.update(self._get_available_subtitles(video_id))
+ available_subs_list.update(self._get_available_subtitles(video_id, webpage))
if not available_subs_list: # error, it didn't get the available subtitles
return {}
@@ -74,7 +74,7 @@ class SubtitlesInfoExtractor(InfoExtractor):
return
return sub
- def _get_available_subtitles(self, video_id):
+ def _get_available_subtitles(self, video_id, webpage):
"""
returns {sub_lang: url} or {} if not available
Must be redefined by the subclasses
diff --git a/youtube_dl/extractor/sztvhu.py b/youtube_dl/extractor/sztvhu.py
index 81fa35c4b..c9359fafb 100644
--- a/youtube_dl/extractor/sztvhu.py
+++ b/youtube_dl/extractor/sztvhu.py
@@ -15,7 +15,8 @@ class SztvHuIE(InfoExtractor):
u'info_dict': {
u"title": u"Cserkészek népszerűsítették a környezettudatos életmódot a Savaria téren",
u"description": u'A zöld nap játékos ismeretterjesztő programjait a Magyar Cserkész Szövetség szervezte, akik az ország nyolc városában adják át tudásukat az érdeklődőknek. A PET...',
- }
+ },
+ u'skip': u'Service temporarily disabled as of 2013-11-20'
}
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py
index c910110ca..2bf26d056 100644
--- a/youtube_dl/extractor/teamcoco.py
+++ b/youtube_dl/extractor/teamcoco.py
@@ -11,7 +11,7 @@ class TeamcocoIE(InfoExtractor):
_TEST = {
u'url': u'http://teamcoco.com/video/louis-ck-interview-george-w-bush',
u'file': u'19705.mp4',
- u'md5': u'27b6f7527da5acf534b15f21b032656e',
+ u'md5': u'cde9ba0fa3506f5f017ce11ead928f9a',
u'info_dict': {
u"description": u"Louis C.K. got starstruck by George W. Bush, so what? Part one.",
u"title": u"Louis C.K. Interview Pt. 1 11/3/11"
@@ -31,16 +31,39 @@ class TeamcocoIE(InfoExtractor):
self.report_extraction(video_id)
data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
- data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
+ data = self._download_xml(data_url, video_id, 'Downloading data webpage')
- video_url = self._html_search_regex(r'<file [^>]*type="high".*?>(.*?)</file>',
- data, u'video URL')
- return [{
+ qualities = ['500k', '480p', '1000k', '720p', '1080p']
+ formats = []
+ for file in data.findall('files/file'):
+ if file.attrib.get('playmode') == 'all':
+ # it just duplicates one of the entries
+ break
+ file_url = file.text
+ m_format = re.search(r'(\d+(k|p))\.mp4', file_url)
+ if m_format is not None:
+ format_id = m_format.group(1)
+ else:
+ format_id = file.attrib['bitrate']
+ formats.append({
+ 'url': file_url,
+ 'ext': 'mp4',
+ 'format_id': format_id,
+ })
+ def sort_key(f):
+ try:
+ return qualities.index(f['format_id'])
+ except ValueError:
+ return -1
+ formats.sort(key=sort_key)
+ if not formats:
+ raise ExtractorError(u'Unable to extract video URL')
+
+ return {
'id': video_id,
- 'url': video_url,
- 'ext': 'mp4',
+ 'formats': formats,
'title': self._og_search_title(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
'description': self._og_search_description(webpage),
- }]
+ }
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index dfa1176a3..4bca62ba0 100644
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -1,10 +1,13 @@
import json
import re
-from .common import InfoExtractor
+from .subtitles import SubtitlesInfoExtractor
+from ..utils import (
+ RegexNotFoundError,
+)
-class TEDIE(InfoExtractor):
+class TEDIE(SubtitlesInfoExtractor):
_VALID_URL=r'''http://www\.ted\.com/
(
((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
@@ -32,33 +35,32 @@ class TEDIE(InfoExtractor):
def _real_extract(self, url):
m=re.match(self._VALID_URL, url, re.VERBOSE)
if m.group('type_talk'):
- return [self._talk_info(url)]
+ return self._talk_info(url)
else :
playlist_id=m.group('playlist_id')
name=m.group('name')
self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
return [self._playlist_videos_info(url,name,playlist_id)]
- def _playlist_videos_info(self,url,name,playlist_id=0):
+
+ def _playlist_videos_info(self, url, name, playlist_id):
'''Returns the videos of the playlist'''
- video_RE=r'''
- <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
- ([.\s]*?)data-playlist_item_id="(\d+)"
- ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
- '''
- video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
- webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
- m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
- m_names=re.finditer(video_name_RE,webpage)
+
+ webpage = self._download_webpage(
+ url, playlist_id, u'Downloading playlist webpage')
+ matches = re.finditer(
+ r'<p\s+class="talk-title[^"]*"><a\s+href="(?P<talk_url>/talks/[^"]+\.html)">[^<]*</a></p>',
+ webpage)
playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
webpage, 'playlist title')
- playlist_entries = []
- for m_video, m_name in zip(m_videos,m_names):
- talk_url='http://www.ted.com%s' % m_name.group('talk_url')
- playlist_entries.append(self.url_result(talk_url, 'TED'))
- return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
+ playlist_entries = [
+ self.url_result(u'http://www.ted.com' + m.group('talk_url'), 'TED')
+ for m in matches
+ ]
+ return self.playlist_result(
+ playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)
def _talk_info(self, url, video_id=0):
"""Return the video for the talk in the url"""
@@ -81,16 +83,35 @@ class TEDIE(InfoExtractor):
'ext': 'mp4',
'url': stream['file'],
'format': stream['id']
- } for stream in info['htmlStreams']]
- info = {
- 'id': info['id'],
+ } for stream in info['htmlStreams']]
+
+ video_id = info['id']
+
+ # subtitles
+ video_subtitles = self.extract_subtitles(video_id, webpage)
+ if self._downloader.params.get('listsubtitles', False):
+ self._list_available_subtitles(video_id, webpage)
+ return
+
+ return {
+ 'id': video_id,
'title': title,
'thumbnail': thumbnail,
'description': desc,
+ 'subtitles': video_subtitles,
'formats': formats,
}
- # TODO: Remove when #980 has been merged
- info.update(info['formats'][-1])
-
- return info
+ def _get_available_subtitles(self, video_id, webpage):
+ try:
+ options = self._search_regex(r'(?:<select name="subtitles_language_select" id="subtitles_language_select">)(.*?)(?:</select>)', webpage, 'subtitles_language_select', flags=re.DOTALL)
+ languages = re.findall(r'(?:<option value=")(\S+)"', options)
+ if languages:
+ sub_lang_list = {}
+ for l in languages:
+ url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
+ sub_lang_list[l] = url
+ return sub_lang_list
+ except RegexNotFoundError:
+ self._downloader.report_warning(u'video doesn\'t have subtitles')
+ return {}
diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py
index 772134a12..2c5c88be8 100644
--- a/youtube_dl/extractor/tf1.py
+++ b/youtube_dl/extractor/tf1.py
@@ -7,7 +7,7 @@ from .common import InfoExtractor
class TF1IE(InfoExtractor):
"""TF1 uses the wat.tv player."""
- _VALID_URL = r'http://videos.tf1.fr/.*-(.*?).html'
+ _VALID_URL = r'http://videos\.tf1\.fr/.*-(.*?)\.html'
_TEST = {
u'url': u'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
u'file': u'10635995.mp4',
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
new file mode 100644
index 000000000..cec65261b
--- /dev/null
+++ b/youtube_dl/extractor/theplatform.py
@@ -0,0 +1,80 @@
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ xpath_with_ns,
+)
+
+_x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language'})
+
+
+class ThePlatformIE(InfoExtractor):
+ _VALID_URL = r'(?:https?://link\.theplatform\.com/s/[^/]+/|theplatform:)(?P<id>[^/\?]+)'
+
+ _TEST = {
+ # from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/
+ u'url': u'http://link.theplatform.com/s/dJ5BDC/e9I_cZgTgIPd/meta.smil?format=smil&Tracking=true&mbr=true',
+ u'info_dict': {
+ u'id': u'e9I_cZgTgIPd',
+ u'ext': u'flv',
+ u'title': u'Blackberry\'s big, bold Z30',
+ u'description': u'The Z30 is Blackberry\'s biggest, baddest mobile messaging device yet.',
+ u'duration': 247,
+ },
+ u'params': {
+ # rtmp download
+ u'skip_download': True,
+ },
+ }
+
+ def _get_info(self, video_id):
+ smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?'
+ 'format=smil&mbr=true'.format(video_id))
+ meta = self._download_xml(smil_url, video_id)
+
+ try:
+ error_msg = next(
+ n.attrib['abstract']
+ for n in meta.findall(_x('.//smil:ref'))
+ if n.attrib.get('title') == u'Geographic Restriction')
+ except StopIteration:
+ pass
+ else:
+ raise ExtractorError(error_msg, expected=True)
+
+ info_url = 'http://link.theplatform.com/s/dJ5BDC/{0}?format=preview'.format(video_id)
+ info_json = self._download_webpage(info_url, video_id)
+ info = json.loads(info_json)
+
+ head = meta.find(_x('smil:head'))
+ body = meta.find(_x('smil:body'))
+ base_url = head.find(_x('smil:meta')).attrib['base']
+ switch = body.find(_x('smil:switch'))
+ formats = []
+ for f in switch.findall(_x('smil:video')):
+ attr = f.attrib
+ formats.append({
+ 'url': base_url,
+ 'play_path': 'mp4:' + attr['src'],
+ 'ext': 'flv',
+ 'width': int(attr['width']),
+ 'height': int(attr['height']),
+ 'vbr': int(attr['system-bitrate']),
+ })
+ formats.sort(key=lambda f: (f['height'], f['width'], f['vbr']))
+
+ return {
+ 'id': video_id,
+ 'title': info['title'],
+ 'formats': formats,
+ 'description': info['description'],
+ 'thumbnail': info['defaultThumbnailUrl'],
+ 'duration': info['duration']//1000,
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ return self._get_info(video_id)
diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py
new file mode 100644
index 000000000..1e9598ef6
--- /dev/null
+++ b/youtube_dl/extractor/toutv.py
@@ -0,0 +1,71 @@
+# coding: utf-8
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ unified_strdate,
+)
+
+
+class TouTvIE(InfoExtractor):
+ IE_NAME = u'tou.tv'
+ _VALID_URL = r'https?://www\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/(?P<episode>S[0-9]+E[0-9]+)))'
+
+ _TEST = {
+ u'url': u'http://www.tou.tv/30-vies/S04E41',
+ u'file': u'30-vies_S04E41.mp4',
+ u'info_dict': {
+ u'title': u'30 vies Saison 4 / Épisode 41',
+ u'description': u'md5:da363002db82ccbe4dafeb9cab039b09',
+ u'age_limit': 8,
+ u'uploader': u'Groupe des Nouveaux Médias',
+ u'duration': 1296,
+ u'upload_date': u'20131118',
+ u'thumbnail': u'http://static.tou.tv/medias/images/2013-11-18_19_00_00_30VIES_0341_01_L.jpeg',
+ },
+ u'params': {
+ u'skip_download': True, # Requires rtmpdump
+ },
+ u'skip': 'Only available in Canada'
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ webpage = self._download_webpage(url, video_id)
+
+ mediaId = self._search_regex(
+ r'"idMedia":\s*"([^"]+)"', webpage, u'media ID')
+
+ streams_url = u'http://release.theplatform.com/content.select?pid=' + mediaId
+ streams_doc = self._download_xml(
+ streams_url, video_id, note=u'Downloading stream list')
+
+ video_url = next(n.text
+ for n in streams_doc.findall('.//choice/url')
+ if u'//ad.doubleclick' not in n.text)
+ if video_url.endswith('/Unavailable.flv'):
+ raise ExtractorError(
+ u'Access to this video is blocked from outside of Canada',
+ expected=True)
+
+ duration_str = self._html_search_meta(
+ 'video:duration', webpage, u'duration')
+ duration = int(duration_str) if duration_str else None
+ upload_date_str = self._html_search_meta(
+ 'video:release_date', webpage, u'upload date')
+ upload_date = unified_strdate(upload_date_str) if upload_date_str else None
+
+ return {
+ 'id': video_id,
+ 'title': self._og_search_title(webpage),
+ 'url': video_url,
+ 'description': self._og_search_description(webpage),
+ 'uploader': self._dc_search_uploader(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'age_limit': self._media_rating_search(webpage),
+ 'duration': duration,
+ 'upload_date': upload_date,
+ 'ext': 'mp4',
+ }
diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py
index 0bf028f61..d64aaa41f 100644
--- a/youtube_dl/extractor/trilulilu.py
+++ b/youtube_dl/extractor/trilulilu.py
@@ -1,6 +1,5 @@
import json
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
@@ -36,12 +35,10 @@ class TriluliluIE(InfoExtractor):
format_url = (u'http://fs%(server)s.trilulilu.ro/%(hash)s/'
u'video-formats2' % log)
- format_str = self._download_webpage(
+ format_doc = self._download_xml(
format_url, video_id,
note=u'Downloading formats',
errnote=u'Error while downloading formats')
-
- format_doc = xml.etree.ElementTree.fromstring(format_str)
video_url_template = (
u'http://fs%(server)s.trilulilu.ro/stream.php?type=video'
@@ -58,7 +55,7 @@ class TriluliluIE(InfoExtractor):
for fnode in format_doc.findall('./formats/format')
]
- info = {
+ return {
'_type': 'video',
'id': video_id,
'formats': formats,
@@ -67,7 +64,3 @@ class TriluliluIE(InfoExtractor):
'thumbnail': thumbnail,
}
- # TODO: Remove when #980 has been merged
- info.update(formats[-1])
-
- return info
diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py
index aea9d9a24..4d9d41db3 100644
--- a/youtube_dl/extractor/tube8.py
+++ b/youtube_dl/extractor/tube8.py
@@ -5,8 +5,6 @@ from .common import InfoExtractor
from ..utils import (
compat_urllib_parse_urlparse,
compat_urllib_request,
- compat_urllib_parse,
- unescapeHTML,
)
from ..aes import (
aes_decrypt_text
@@ -46,10 +44,10 @@ class Tube8IE(InfoExtractor):
if webpage.find('"encrypted":true')!=-1:
password = self._html_search_regex(r'"video_title":"([^"]+)', webpage, u'password')
video_url = aes_decrypt_text(video_url, password, 32).decode('utf-8')
- path = compat_urllib_parse_urlparse( video_url ).path
- extension = os.path.splitext( path )[1][1:]
+ path = compat_urllib_parse_urlparse(video_url).path
+ extension = os.path.splitext(path)[1][1:]
format = path.split('/')[4].split('_')[:2]
- format = "-".join( format )
+ format = "-".join(format)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py
new file mode 100644
index 000000000..bfed9dd04
--- /dev/null
+++ b/youtube_dl/extractor/tvp.py
@@ -0,0 +1,42 @@
+import json
+import re
+
+from .common import InfoExtractor
+
+
+class TvpIE(InfoExtractor):
+ IE_NAME = u'tvp.pl'
+ _VALID_URL = r'https?://www\.tvp\.pl/.*?wideo/(?P<date>\d+)/(?P<id>\d+)'
+
+ _TEST = {
+ u'url': u'http://www.tvp.pl/warszawa/magazyny/campusnews/wideo/31102013/12878238',
+ u'md5': u'148408967a6a468953c0a75cbdaf0d7a',
+ u'file': u'12878238.wmv',
+ u'info_dict': {
+ u'title': u'31.10.2013 - Odcinek 2',
+ u'description': u'31.10.2013 - Odcinek 2',
+ },
+ u'skip': u'Download has to use same server IP as extraction. Therefore, a good (load-balancing) DNS resolver will make the download fail.'
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ webpage = self._download_webpage(url, video_id)
+ json_url = 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id
+ json_params = self._download_webpage(
+ json_url, video_id, u"Downloading video metadata")
+
+ params = json.loads(json_params)
+ self.report_extraction(video_id)
+ video_url = params['video_url']
+
+ title = self._og_search_title(webpage, fatal=True)
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'ext': 'wmv',
+ 'url': video_url,
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ }
diff --git a/youtube_dl/extractor/unistra.py b/youtube_dl/extractor/unistra.py
index 516e18914..474610eec 100644
--- a/youtube_dl/extractor/unistra.py
+++ b/youtube_dl/extractor/unistra.py
@@ -3,7 +3,7 @@ import re
from .common import InfoExtractor
class UnistraIE(InfoExtractor):
- _VALID_URL = r'http://utv.unistra.fr/(?:index|video).php\?id_video\=(\d+)'
+ _VALID_URL = r'http://utv\.unistra\.fr/(?:index|video)\.php\?id_video\=(\d+)'
_TEST = {
u'url': u'http://utv.unistra.fr/video.php?id_video=154',
diff --git a/youtube_dl/extractor/veehd.py b/youtube_dl/extractor/veehd.py
index 3a99a29c6..3cf8c853d 100644
--- a/youtube_dl/extractor/veehd.py
+++ b/youtube_dl/extractor/veehd.py
@@ -9,7 +9,7 @@ from ..utils import (
)
class VeeHDIE(InfoExtractor):
- _VALID_URL = r'https?://veehd.com/video/(?P<id>\d+)'
+ _VALID_URL = r'https?://veehd\.com/video/(?P<id>\d+)'
_TEST = {
u'url': u'http://veehd.com/video/4686958',
diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py
index 3f6020f74..a4b26a26f 100644
--- a/youtube_dl/extractor/vevo.py
+++ b/youtube_dl/extractor/vevo.py
@@ -15,7 +15,12 @@ class VevoIE(InfoExtractor):
Accepts urls from vevo.com or in the format 'vevo:{id}'
(currently used by MTVIE)
"""
- _VALID_URL = r'((http://www.vevo.com/watch/.*?/.*?/)|(vevo:))(?P<id>.*?)(\?|$)'
+ _VALID_URL = r'''(?x)
+ (?:https?://www\.vevo\.com/watch/(?:[^/]+/[^/]+/)?|
+ https?://cache\.vevo\.com/m/html/embed\.html\?video=|
+ https?://videoplayer\.vevo\.com/embed/embedded\?videoId=|
+ vevo:)
+ (?P<id>[^&?#]+)'''
_TESTS = [{
u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
u'file': u'GB1101300280.mp4',
@@ -24,7 +29,7 @@ class VevoIE(InfoExtractor):
u"upload_date": u"20130624",
u"uploader": u"Hurts",
u"title": u"Somebody to Die For",
- u"duration": 230,
+ u"duration": 230.12,
u"width": 1920,
u"height": 1080,
}
@@ -78,12 +83,13 @@ class VevoIE(InfoExtractor):
continue
format_url = self._SMIL_BASE_URL + m.group('path')
- format_note = ('%(vcodec)s@%(vbr)4sk, %(acodec)s@%(abr)3sk' %
- m.groupdict())
formats.append({
'url': format_url,
'format_id': u'SMIL_' + m.group('cbr'),
- 'format_note': format_note,
+ 'vcodec': m.group('vcodec'),
+ 'acodec': m.group('acodec'),
+ 'vbr': int(m.group('vbr')),
+ 'abr': int(m.group('abr')),
'ext': m.group('ext'),
'width': int(m.group('width')),
'height': int(m.group('height')),
diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py
index 6b93afa50..87812d6af 100644
--- a/youtube_dl/extractor/vice.py
+++ b/youtube_dl/extractor/vice.py
@@ -6,7 +6,7 @@ from ..utils import ExtractorError
class ViceIE(InfoExtractor):
- _VALID_URL = r'http://www.vice.com/.*?/(?P<name>.+)'
+ _VALID_URL = r'http://www\.vice\.com/.*?/(?P<name>.+)'
_TEST = {
u'url': u'http://www.vice.com/Fringes/cowboy-capitalists-part-1',
diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py
index 12c84a985..9328ef4a2 100644
--- a/youtube_dl/extractor/viddler.py
+++ b/youtube_dl/extractor/viddler.py
@@ -2,13 +2,10 @@ import json
import re
from .common import InfoExtractor
-from ..utils import (
- determine_ext,
-)
class ViddlerIE(InfoExtractor):
- _VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler.com)/(?:v|embed|player)/(?P<id>[0-9]+)'
+ _VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler\.com)/(?:v|embed|player)/(?P<id>[a-z0-9]+)'
_TEST = {
u"url": u"http://www.viddler.com/v/43903784",
u'file': u'43903784.mp4',
@@ -47,7 +44,7 @@ class ViddlerIE(InfoExtractor):
r"thumbnail\s*:\s*'([^']*)'",
webpage, u'thumbnail', fatal=False)
- info = {
+ return {
'_type': 'video',
'id': video_id,
'title': title,
@@ -56,9 +53,3 @@ class ViddlerIE(InfoExtractor):
'duration': duration,
'formats': formats,
}
-
- # TODO: Remove when #980 has been merged
- info['formats'][-1]['ext'] = determine_ext(info['formats'][-1]['url'])
- info.update(info['formats'][-1])
-
- return info
diff --git a/youtube_dl/extractor/videofyme.py b/youtube_dl/extractor/videofyme.py
index 94f64ffa5..f75169041 100644
--- a/youtube_dl/extractor/videofyme.py
+++ b/youtube_dl/extractor/videofyme.py
@@ -1,5 +1,4 @@
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@@ -8,7 +7,7 @@ from ..utils import (
)
class VideofyMeIE(InfoExtractor):
- _VALID_URL = r'https?://(www.videofy.me/.+?|p.videofy.me/v)/(?P<id>\d+)(&|#|$)'
+ _VALID_URL = r'https?://(www\.videofy\.me/.+?|p\.videofy\.me/v)/(?P<id>\d+)(&|#|$)'
IE_NAME = u'videofy.me'
_TEST = {
@@ -27,9 +26,8 @@ class VideofyMeIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- config_xml = self._download_webpage('http://sunshine.videofy.me/?videoId=%s' % video_id,
+ config = self._download_xml('http://sunshine.videofy.me/?videoId=%s' % video_id,
video_id)
- config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8'))
video = config.find('video')
sources = video.find('sources')
url_node = next(node for node in [find_xpath_attr(sources, 'source', 'id', 'HQ %s' % key)
diff --git a/youtube_dl/extractor/videopremium.py b/youtube_dl/extractor/videopremium.py
index 65f39b982..65463c733 100644
--- a/youtube_dl/extractor/videopremium.py
+++ b/youtube_dl/extractor/videopremium.py
@@ -5,7 +5,7 @@ from .common import InfoExtractor
class VideoPremiumIE(InfoExtractor):
- _VALID_URL = r'(?:https?://)?(?:www\.)?videopremium\.tv/(?P<id>\w+)(?:/.*)?'
+ _VALID_URL = r'(?:https?://)?(?:www\.)?videopremium\.(?:tv|me)/(?P<id>\w+)(?:/.*)?'
_TEST = {
u'url': u'http://videopremium.tv/4w7oadjsf156',
u'file': u'4w7oadjsf156.f4v',
@@ -15,6 +15,7 @@ class VideoPremiumIE(InfoExtractor):
u'params': {
u'skip_download': True,
},
+ u'skip': u'Test file has been deleted.',
}
def _real_extract(self, url):
@@ -24,12 +25,16 @@ class VideoPremiumIE(InfoExtractor):
webpage_url = 'http://videopremium.tv/' + video_id
webpage = self._download_webpage(webpage_url, video_id)
- self.report_extraction(video_id)
+ if re.match(r"^<html><head><script[^>]*>window.location\s*=", webpage):
+ # Download again, we need a cookie
+ webpage = self._download_webpage(
+ webpage_url, video_id,
+ note=u'Downloading webpage again (with cookie)')
- video_title = self._html_search_regex(r'<h2(?:.*?)>\s*(.+?)\s*<',
- webpage, u'video title')
+ video_title = self._html_search_regex(
+ r'<h2(?:.*?)>\s*(.+?)\s*<', webpage, u'video title')
- return [{
+ return {
'id': video_id,
'url': "rtmp://e%d.md.iplay.md/play" % random.randint(1, 16),
'play_path': "mp4:%s.f4v" % video_id,
@@ -37,4 +42,4 @@ class VideoPremiumIE(InfoExtractor):
'player_url': "http://videopremium.tv/uplayer/uppod.swf",
'ext': 'f4v',
'title': video_title,
- }]
+ }
diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py
new file mode 100644
index 000000000..2206a06d5
--- /dev/null
+++ b/youtube_dl/extractor/viki.py
@@ -0,0 +1,101 @@
+import re
+
+from ..utils import (
+ ExtractorError,
+ unescapeHTML,
+ unified_strdate,
+)
+from .subtitles import SubtitlesInfoExtractor
+
+
+class VikiIE(SubtitlesInfoExtractor):
+ IE_NAME = u'viki'
+
+ _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)'
+ _TEST = {
+ u'url': u'http://www.viki.com/videos/1023585v-heirs-episode-14',
+ u'file': u'1023585v.mp4',
+ u'md5': u'a21454021c2646f5433514177e2caa5f',
+ u'info_dict': {
+ u'title': u'Heirs Episode 14',
+ u'uploader': u'SBS',
+ u'description': u'md5:c4b17b9626dd4b143dcc4d855ba3474e',
+ u'upload_date': u'20131121',
+ u'age_limit': 13,
+ },
+ u'skip': u'Blocked in the US',
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group(1)
+
+ webpage = self._download_webpage(url, video_id)
+ title = self._og_search_title(webpage)
+ description = self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ uploader_m = re.search(
+ r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage)
+ if uploader_m is None:
+ uploader = None
+ else:
+ uploader = uploader_m.group(1).strip()
+
+ rating_str = self._html_search_regex(
+ r'<strong>Rating: </strong>\s*([^<]*)<', webpage,
+ u'rating information', default='').strip()
+ RATINGS = {
+ 'G': 0,
+ 'PG': 10,
+ 'PG-13': 13,
+ 'R': 16,
+ 'NC': 18,
+ }
+ age_limit = RATINGS.get(rating_str)
+
+ info_url = 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id
+ info_webpage = self._download_webpage(
+ info_url, video_id, note=u'Downloading info page')
+ if re.match(r'\s*<div\s+class="video-error', info_webpage):
+ raise ExtractorError(
+ u'Video %s is blocked from your location.' % video_id,
+ expected=True)
+ video_url = self._html_search_regex(
+ r'<source[^>]+src="([^"]+)"', info_webpage, u'video URL')
+
+ upload_date_str = self._html_search_regex(
+ r'"created_at":"([^"]+)"', info_webpage, u'upload date')
+ upload_date = (
+ unified_strdate(upload_date_str)
+ if upload_date_str is not None
+ else None
+ )
+
+ # subtitles
+ video_subtitles = self.extract_subtitles(video_id, info_webpage)
+ if self._downloader.params.get('listsubtitles', False):
+ self._list_available_subtitles(video_id, info_webpage)
+ return
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': video_url,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'age_limit': age_limit,
+ 'uploader': uploader,
+ 'subtitles': video_subtitles,
+ 'upload_date': upload_date,
+ }
+
+ def _get_available_subtitles(self, video_id, info_webpage):
+ res = {}
+ for sturl_html in re.findall(r'<track src="([^"]+)"/>', info_webpage):
+ sturl = unescapeHTML(sturl_html)
+ m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl)
+ if not m:
+ continue
+ res[m.group('lang')] = sturl
+ return res
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index c7d864a2b..ea4409528 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -20,7 +20,7 @@ class VimeoIE(InfoExtractor):
"""Information extractor for vimeo.com."""
# _VALID_URL matches Vimeo URLs
- _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)/?(?:[?].*)?(?:#.*)?$'
+ _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|(?P<player>player))\.)?vimeo(?P<pro>pro)?\.com/(?:.*?/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)/?(?:[?].*)?(?:#.*)?$'
_NETRC_MACHINE = 'vimeo'
IE_NAME = u'vimeo'
_TESTS = [
@@ -115,7 +115,7 @@ class VimeoIE(InfoExtractor):
def _real_initialize(self):
self._login()
- def _real_extract(self, url, new_video=True):
+ def _real_extract(self, url):
url, data = unsmuggle_url(url)
headers = std_headers
if data is not None:
@@ -128,11 +128,9 @@ class VimeoIE(InfoExtractor):
raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group('id')
- if not mobj.group('proto'):
- url = 'https://' + url
- elif mobj.group('pro'):
+ if mobj.group('pro') or mobj.group('player'):
url = 'http://player.vimeo.com/video/' + video_id
- elif mobj.group('direct_link'):
+ else:
url = 'https://vimeo.com/' + video_id
# Retrieve video webpage to extract further information
@@ -153,8 +151,14 @@ class VimeoIE(InfoExtractor):
config = json.loads(config_json)
except RegexNotFoundError:
# For pro videos or player.vimeo.com urls
- config = self._search_regex([r' = {config:({.+?}),assets:', r'c=({.+?);'],
- webpage, u'info section', flags=re.DOTALL)
+ # We try to find out to which variable is assigned the config dic
+ m_variable_name = re.search('(\w)\.video\.id', webpage)
+ if m_variable_name is not None:
+ config_re = r'%s=({.+?});' % re.escape(m_variable_name.group(1))
+ else:
+ config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});']
+ config = self._search_regex(config_re, webpage, u'info section',
+ flags=re.DOTALL)
config = json.loads(config)
except Exception as e:
if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
@@ -198,6 +202,16 @@ class VimeoIE(InfoExtractor):
if mobj is not None:
video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
+ try:
+ view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, u'view count'))
+ like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, u'like count'))
+ comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, u'comment count'))
+ except RegexNotFoundError:
+ # This info is only available in vimeo.com/{id} urls
+ view_count = None
+ like_count = None
+ comment_count = None
+
# Vimeo specific: extract request signature and timestamp
sig = config['request']['signature']
timestamp = config['request']['timestamp']
@@ -205,7 +219,7 @@ class VimeoIE(InfoExtractor):
# Vimeo specific: extract video codec and quality information
# First consider quality, then codecs, then take everything
codecs = [('vp6', 'flv'), ('vp8', 'flv'), ('h264', 'mp4')]
- files = { 'hd': [], 'sd': [], 'other': []}
+ files = {'hd': [], 'sd': [], 'other': []}
config_files = config["video"].get("files") or config["request"].get("files")
for codec_name, codec_extension in codecs:
for quality in config_files.get(codec_name, []):
@@ -234,7 +248,7 @@ class VimeoIE(InfoExtractor):
if len(formats) == 0:
raise ExtractorError(u'No known codec found')
- return [{
+ return {
'id': video_id,
'uploader': video_uploader,
'uploader_id': video_uploader_id,
@@ -243,32 +257,88 @@ class VimeoIE(InfoExtractor):
'thumbnail': video_thumbnail,
'description': video_description,
'formats': formats,
- }]
+ 'webpage_url': url,
+ 'view_count': view_count,
+ 'like_count': like_count,
+ 'comment_count': comment_count,
+ }
class VimeoChannelIE(InfoExtractor):
IE_NAME = u'vimeo:channel'
_VALID_URL = r'(?:https?://)?vimeo.\com/channels/(?P<id>[^/]+)'
_MORE_PAGES_INDICATOR = r'<a.+?rel="next"'
+ _TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"'
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- channel_id = mobj.group('id')
- video_ids = []
+ def _page_url(self, base_url, pagenum):
+ return '%s/videos/page:%d/' % (base_url, pagenum)
+ def _extract_list_title(self, webpage):
+ return self._html_search_regex(self._TITLE_RE, webpage, u'list title')
+
+ def _extract_videos(self, list_id, base_url):
+ video_ids = []
for pagenum in itertools.count(1):
- webpage = self._download_webpage('http://vimeo.com/channels/%s/videos/page:%d' % (channel_id, pagenum),
- channel_id, u'Downloading page %s' % pagenum)
+ webpage = self._download_webpage(
+ self._page_url(base_url, pagenum) ,list_id,
+ u'Downloading page %s' % pagenum)
video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage))
if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
break
entries = [self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo')
for video_id in video_ids]
- channel_title = self._html_search_regex(r'<a href="/channels/%s">(.*?)</a>' % channel_id,
- webpage, u'channel title')
return {'_type': 'playlist',
- 'id': channel_id,
- 'title': channel_title,
+ 'id': list_id,
+ 'title': self._extract_list_title(webpage),
'entries': entries,
}
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ channel_id = mobj.group('id')
+ return self._extract_videos(channel_id, 'http://vimeo.com/channels/%s' % channel_id)
+
+
+class VimeoUserIE(VimeoChannelIE):
+ IE_NAME = u'vimeo:user'
+ _VALID_URL = r'(?:https?://)?vimeo.\com/(?P<name>[^/]+)'
+ _TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>'
+
+ @classmethod
+ def suitable(cls, url):
+ if VimeoChannelIE.suitable(url) or VimeoIE.suitable(url) or VimeoAlbumIE.suitable(url) or VimeoGroupsIE.suitable(url):
+ return False
+ return super(VimeoUserIE, cls).suitable(url)
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ name = mobj.group('name')
+ return self._extract_videos(name, 'http://vimeo.com/%s' % name)
+
+
+class VimeoAlbumIE(VimeoChannelIE):
+ IE_NAME = u'vimeo:album'
+ _VALID_URL = r'(?:https?://)?vimeo.\com/album/(?P<id>\d+)'
+ _TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>'
+
+ def _page_url(self, base_url, pagenum):
+ return '%s/page:%d/' % (base_url, pagenum)
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ album_id = mobj.group('id')
+ return self._extract_videos(album_id, 'http://vimeo.com/album/%s' % album_id)
+
+
+class VimeoGroupsIE(VimeoAlbumIE):
+ IE_NAME = u'vimeo:group'
+ _VALID_URL = r'(?:https?://)?vimeo.\com/groups/(?P<name>[^/]+)'
+
+ def _extract_list_title(self, webpage):
+ return self._og_search_title(webpage)
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ name = mobj.group('name')
+ return self._extract_videos(name, 'http://vimeo.com/groups/%s' % name)
diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py
index c4ec1f06f..651ba317d 100644
--- a/youtube_dl/extractor/vine.py
+++ b/youtube_dl/extractor/vine.py
@@ -27,7 +27,7 @@ class VineIE(InfoExtractor):
video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
webpage, u'video URL')
- uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
+ uploader = self._html_search_regex(r'<p class="username">(.*?)</p>',
webpage, u'uploader', fatal=False, flags=re.DOTALL)
return [{
diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py
index 29c25f0e3..4fab6c6e8 100644
--- a/youtube_dl/extractor/wat.py
+++ b/youtube_dl/extractor/wat.py
@@ -11,7 +11,7 @@ from ..utils import (
class WatIE(InfoExtractor):
- _VALID_URL=r'http://www.wat.tv/.*-(?P<shortID>.*?)_.*?.html'
+ _VALID_URL=r'http://www\.wat\.tv/.*-(?P<shortID>.*?)_.*?\.html'
IE_NAME = 'wat.tv'
_TEST = {
u'url': u'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html',
diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py
index 0757495bd..fa784ab99 100644
--- a/youtube_dl/extractor/weibo.py
+++ b/youtube_dl/extractor/weibo.py
@@ -13,6 +13,7 @@ class WeiboIE(InfoExtractor):
_VALID_URL = r'https?://video\.weibo\.com/v/weishipin/t_(?P<id>.+?)\.htm'
_TEST = {
+ u'add_ie': ['Sina'],
u'url': u'http://video.weibo.com/v/weishipin/t_zjUw2kZ.htm',
u'file': u'98322879.flv',
u'info_dict': {
diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py
index b9c3b13f9..82a626e0e 100644
--- a/youtube_dl/extractor/wimp.py
+++ b/youtube_dl/extractor/wimp.py
@@ -11,7 +11,8 @@ class WimpIE(InfoExtractor):
u'file': u'deerfence.flv',
u'md5': u'8b215e2e0168c6081a1cf84b2846a2b5',
u'info_dict': {
- u"title": u"Watch Till End: Herd of deer jump over a fence."
+ u"title": u"Watch Till End: Herd of deer jump over a fence.",
+ u"description": u"These deer look as fluid as running water when they jump over this fence as a herd. This video is one that needs to be watched until the very end for the true majesty to be witnessed, but once it comes, it's sure to take your breath away.",
}
}
@@ -19,18 +20,14 @@ class WimpIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1)
webpage = self._download_webpage(url, video_id)
- title = self._search_regex(r'<meta name="description" content="(.+?)" />',webpage, 'video title')
- thumbnail_url = self._search_regex(r'<meta property="og\:image" content="(.+?)" />', webpage,'video thumbnail')
googleString = self._search_regex("googleCode = '(.*?)'", webpage, 'file url')
googleString = base64.b64decode(googleString).decode('ascii')
- final_url = self._search_regex('","(.*?)"', googleString,'final video url')
- ext = final_url.rpartition(u'.')[2]
-
- return [{
- 'id': video_id,
- 'url': final_url,
- 'ext': ext,
- 'title': title,
- 'thumbnail': thumbnail_url,
- }]
+ final_url = self._search_regex('","(.*?)"', googleString, u'final video url')
+ return {
+ 'id': video_id,
+ 'url': final_url,
+ 'title': self._og_search_title(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'description': self._og_search_description(webpage),
+ }
diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py
new file mode 100644
index 000000000..e1748c261
--- /dev/null
+++ b/youtube_dl/extractor/wistia.py
@@ -0,0 +1,55 @@
+import json
+import re
+
+from .common import InfoExtractor
+
+
+class WistiaIE(InfoExtractor):
+ _VALID_URL = r'^https?://(?:fast\.)?wistia\.net/embed/iframe/(?P<id>[a-z0-9]+)'
+
+ _TEST = {
+ u"url": u"http://fast.wistia.net/embed/iframe/sh7fpupwlt",
+ u"file": u"sh7fpupwlt.mov",
+ u"md5": u"cafeb56ec0c53c18c97405eecb3133df",
+ u"info_dict": {
+ u"title": u"cfh_resourceful_zdkh_final_1"
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ data_json = self._html_search_regex(
+ r'Wistia.iframeInit\((.*?), {}\);', webpage, u'video data')
+
+ data = json.loads(data_json)
+
+ formats = []
+ thumbnails = []
+ for atype, a in data['assets'].items():
+ if atype == 'still':
+ thumbnails.append({
+ 'url': a['url'],
+ 'resolution': '%dx%d' % (a['width'], a['height']),
+ })
+ continue
+ if atype == 'preview':
+ continue
+ formats.append({
+ 'format_id': atype,
+ 'url': a['url'],
+ 'width': a['width'],
+ 'height': a['height'],
+ 'filesize': a['size'],
+ 'ext': a['ext'],
+ })
+ formats.sort(key=lambda a: a['filesize'])
+
+ return {
+ 'id': video_id,
+ 'title': data['name'],
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ }
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py
index 7444d3393..ef9997ee4 100644
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -46,7 +46,7 @@ class XHamsterIE(InfoExtractor):
return mobj.group('server')+'/key='+mobj.group('file')
def is_hd(webpage):
- return webpage.find('<div class=\'icon iconHD\'>') != -1
+ return webpage.find('<div class=\'icon iconHD\'') != -1
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py
index 8a0eb1afd..1177a4b14 100644
--- a/youtube_dl/extractor/xnxx.py
+++ b/youtube_dl/extractor/xnxx.py
@@ -9,7 +9,7 @@ from ..utils import (
class XNXXIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
+ _VALID_URL = r'^(?:https?://)?(?:video|www)\.xnxx\.com/video([0-9]+)/(.*)'
VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py
new file mode 100644
index 000000000..1a6a7688d
--- /dev/null
+++ b/youtube_dl/extractor/xtube.py
@@ -0,0 +1,54 @@
+import os
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse_urlparse,
+ compat_urllib_request,
+)
+
+class XTubeIE(InfoExtractor):
+ _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))'
+ _TEST = {
+ u'url': u'http://www.xtube.com/watch.php?v=kVTUy_G222_',
+ u'file': u'kVTUy_G222_.mp4',
+ u'md5': u'092fbdd3cbe292c920ef6fc6a8a9cdab',
+ u'info_dict': {
+ u"title": u"strange erotica",
+ u"description": u"surreal gay themed erotica...almost an ET kind of thing",
+ u"uploader": u"greenshowers",
+ u"age_limit": 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('videoid')
+ url = 'http://www.' + mobj.group('url')
+
+ req = compat_urllib_request.Request(url)
+ req.add_header('Cookie', 'age_verified=1')
+ webpage = self._download_webpage(req, video_id)
+
+ video_title = self._html_search_regex(r'<div class="p_5px[^>]*>([^<]+)', webpage, u'title')
+ video_uploader = self._html_search_regex(r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, u'uploader', fatal=False)
+ video_description = self._html_search_regex(r'<p class="video_description">([^<]+)', webpage, u'description', fatal=False)
+ video_url= self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, u'video_url').replace('\\/', '/')
+ path = compat_urllib_parse_urlparse(video_url).path
+ extension = os.path.splitext(path)[1][1:]
+ format = path.split('/')[5].split('_')[:2]
+ format[0] += 'p'
+ format[1] += 'k'
+ format = "-".join(format)
+
+ return {
+ 'id': video_id,
+ 'title': video_title,
+ 'uploader': video_uploader,
+ 'description': video_description,
+ 'url': video_url,
+ 'ext': extension,
+ 'format': format,
+ 'format_id': format,
+ 'age_limit': 18,
+ }
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index 464b498f5..5c9c361b9 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -17,27 +17,21 @@ class YahooIE(InfoExtractor):
_TESTS = [
{
u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
- u'file': u'214727115.flv',
+ u'file': u'214727115.mp4',
+ u'md5': u'4962b075c08be8690a922ee026d05e69',
u'info_dict': {
u'title': u'Julian Smith & Travis Legg Watch Julian Smith',
u'description': u'Julian and Travis watch Julian Smith',
},
- u'params': {
- # Requires rtmpdump
- u'skip_download': True,
- },
},
{
u'url': u'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
- u'file': u'103000935.flv',
+ u'file': u'103000935.mp4',
+ u'md5': u'd6e6fc6e1313c608f316ddad7b82b306',
u'info_dict': {
u'title': u'Codefellas - The Cougar Lies with Spanish Moss',
u'description': u'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
},
- u'params': {
- # Requires rtmpdump
- u'skip_download': True,
- },
},
]
@@ -46,15 +40,19 @@ class YahooIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- items_json = self._search_regex(r'YVIDEO_INIT_ITEMS = ({.*?});$',
+ items_json = self._search_regex(r'mediaItems: ({.*?})$',
webpage, u'items', flags=re.MULTILINE)
items = json.loads(items_json)
info = items['mediaItems']['query']['results']['mediaObj'][0]
# The 'meta' field is not always in the video webpage, we request it
# from another page
long_id = info['id']
+ return self._get_info(long_id, video_id)
+
+ def _get_info(self, long_id, video_id):
query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"'
- ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2"' % long_id)
+ ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"'
+ ' AND protocol="http"' % long_id)
data = compat_urllib_parse.urlencode({
'q': query,
'env': 'prod',
@@ -91,17 +89,39 @@ class YahooIE(InfoExtractor):
formats.append(format_info)
formats = sorted(formats, key=lambda f:(f['height'], f['width']))
- info = {
+ return {
'id': video_id,
'title': meta['title'],
'formats': formats,
'description': clean_html(meta['description']),
'thumbnail': meta['thumbnail'],
}
- # TODO: Remove when #980 has been merged
- info.update(formats[-1])
- return info
+
+class YahooNewsIE(YahooIE):
+ IE_NAME = 'yahoo:news'
+ _VALID_URL = r'http://news\.yahoo\.com/video/.*?-(?P<id>\d*?)\.html'
+
+ _TEST = {
+ u'url': u'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
+ u'md5': u'67010fdf3a08d290e060a4dd96baa07b',
+ u'info_dict': {
+ u'id': u'104538833',
+ u'ext': u'mp4',
+ u'title': u'China Moses Is Crazy About the Blues',
+ u'description': u'md5:9900ab8cd5808175c7b3fe55b979bed0',
+ },
+ }
+
+ # Overwrite YahooIE properties we don't want
+ _TESTS = []
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ webpage = self._download_webpage(url, video_id)
+ long_id = self._search_regex(r'contentId: \'(.+?)\',', webpage, u'long id')
+ return self._get_info(long_id, video_id)
class YahooSearchIE(SearchInfoExtractor):
@@ -132,7 +152,7 @@ class YahooSearchIE(SearchInfoExtractor):
mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
res['entries'].append(e)
- if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
+ if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1)):
break
return res
diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py
index 1fcc518ac..e971b5b4b 100644
--- a/youtube_dl/extractor/youjizz.py
+++ b/youtube_dl/extractor/youjizz.py
@@ -7,7 +7,7 @@ from ..utils import (
class YouJizzIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
+ _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+)\.html$'
_TEST = {
u'url': u'http://www.youjizz.com/videos/zeichentrick-1-2189178.html',
u'file': u'2189178.flv',
diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py
index 9d88c17f5..a8fd40c83 100644
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@@ -18,7 +18,7 @@ class YoukuIE(InfoExtractor):
u"url": u"http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html",
u"file": u"XNDgyMDQ2NTQw_part00.flv",
u"md5": u"ffe3f2e435663dc2d1eea34faeff5b5b",
- u"params": { u"test": False },
+ u"params": {u"test": False},
u"info_dict": {
u"title": u"youtube-dl test video \"'/\\ä↭𝕐"
}
@@ -37,8 +37,8 @@ class YoukuIE(InfoExtractor):
source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
seed = float(seed)
for i in range(len(source)):
- seed = (seed * 211 + 30031 ) % 65536
- index = math.floor(seed / 65536 * len(source) )
+ seed = (seed * 211 + 30031) % 65536
+ index = math.floor(seed / 65536 * len(source))
mixed.append(source[int(index)])
source.remove(source[int(index)])
#return ''.join(mixed)
diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py
index e46a9b4d6..bd0f2cae0 100644
--- a/youtube_dl/extractor/youporn.py
+++ b/youtube_dl/extractor/youporn.py
@@ -81,14 +81,14 @@ class YouPornIE(InfoExtractor):
# http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
# A path looks like this:
# /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
- video_url = unescapeHTML( link )
- path = compat_urllib_parse_urlparse( video_url ).path
- extension = os.path.splitext( path )[1][1:]
+ video_url = unescapeHTML(link)
+ path = compat_urllib_parse_urlparse(video_url).path
+ extension = os.path.splitext(path)[1][1:]
format = path.split('/')[4].split('_')[:2]
# size = format[0]
# bitrate = format[1]
- format = "-".join( format )
+ format = "-".join(format)
# title = u'%s-%s-%s' % (video_title, size, bitrate)
formats.append({
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index dc601de52..c860eedda 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -7,20 +7,16 @@ import itertools
import json
import os.path
import re
-import socket
import string
import struct
import traceback
-import xml.etree.ElementTree
import zlib
from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
compat_chr,
- compat_http_client,
compat_parse_qs,
- compat_urllib_error,
compat_urllib_parse,
compat_urllib_request,
compat_urlparse,
@@ -29,6 +25,7 @@ from ..utils import (
clean_html,
get_cachedir,
get_element_by_id,
+ get_element_by_attribute,
ExtractorError,
unescapeHTML,
unified_strdate,
@@ -45,19 +42,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
# If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = False
- def report_lang(self):
- """Report attempt to set language."""
- self.to_screen(u'Setting language')
-
def _set_language(self):
- request = compat_urllib_request.Request(self._LANG_URL)
- try:
- self.report_lang()
- compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
- return False
- return True
+ return bool(self._download_webpage(
+ self._LANG_URL, None,
+ note=u'Setting language', errnote='unable to set language',
+ fatal=False))
def _login(self):
(username, password) = self._get_login_info()
@@ -67,12 +56,12 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
return False
- request = compat_urllib_request.Request(self._LOGIN_URL)
- try:
- login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
- return False
+ login_page = self._download_webpage(
+ self._LOGIN_URL, None,
+ note=u'Downloading login page',
+ errnote=u'unable to fetch login page', fatal=False)
+ if login_page is False:
+ return
galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
login_page, u'Login GALX parameter')
@@ -102,29 +91,28 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
# chokes on unicode
login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
- request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
- try:
- self.report_login()
- login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
- if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
- self._downloader.report_warning(u'unable to log in: bad username or password')
- return False
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
+
+ req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
+ login_results = self._download_webpage(
+ req, None,
+ note=u'Logging in', errnote=u'unable to log in', fatal=False)
+ if login_results is False:
+ return False
+ if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
+ self._downloader.report_warning(u'unable to log in: bad username or password')
return False
return True
def _confirm_age(self):
age_form = {
- 'next_url': '/',
- 'action_confirm': 'Confirm',
- }
- request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
- try:
- self.report_age_confirmation()
- compat_urllib_request.urlopen(request).read().decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
+ 'next_url': '/',
+ 'action_confirm': 'Confirm',
+ }
+ req = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
+
+ self._download_webpage(
+ req, None,
+ note=u'Confirming age', errnote=u'Unable to confirm age')
return True
def _real_initialize(self):
@@ -139,10 +127,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
IE_DESC = u'YouTube.com'
- _VALID_URL = r"""^
+ _VALID_URL = r"""(?x)^
(
- (?:https?://)? # http(s):// (optional)
- (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
+ (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
+ (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
tube\.majestyc\.net/|
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
@@ -248,21 +236,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'248': 'webm',
}
_video_dimensions = {
- '5': '240x400',
+ '5': '400x240',
'6': '???',
'13': '???',
- '17': '144x176',
- '18': '360x640',
- '22': '720x1280',
- '34': '360x640',
- '35': '480x854',
- '36': '240x320',
- '37': '1080x1920',
- '38': '3072x4096',
- '43': '360x640',
- '44': '480x854',
- '45': '720x1280',
- '46': '1080x1920',
+ '17': '176x144',
+ '18': '640x360',
+ '22': '1280x720',
+ '34': '640x360',
+ '35': '854x480',
+ '36': '320x240',
+ '37': '1920x1080',
+ '38': '4096x3072',
+ '43': '640x360',
+ '44': '854x480',
+ '45': '1280x720',
+ '46': '1920x1080',
'82': '360p',
'83': '480p',
'84': '720p',
@@ -336,19 +324,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
u"uploader": u"Philipp Hagemeister",
u"uploader_id": u"phihag",
u"upload_date": u"20121002",
- u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
- }
- },
- {
- u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
- u"file": u"1ltcDfZMA3U.mp4",
- u"note": u"Test VEVO video (#897)",
- u"info_dict": {
- u"upload_date": u"20070518",
- u"title": u"Maps - It Will Find You",
- u"description": u"Music video by Maps performing It Will Find You.",
- u"uploader": u"MuteUSA",
- u"uploader_id": u"MuteUSA"
+ u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
}
},
{
@@ -375,6 +351,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
u"uploader_id": u"justintimberlakeVEVO"
}
},
+ {
+ u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
+ u"file": u"yZIXLfi8CZQ.mp4",
+ u"note": u"Embed-only video (#1746)",
+ u"info_dict": {
+ u"upload_date": u"20120608",
+ u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
+ u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
+ u"uploader": u"SET India",
+ u"uploader_id": u"setindia"
+ }
+ },
]
@@ -382,16 +370,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE."""
if YoutubePlaylistIE.suitable(url): return False
- return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
+ return re.match(cls._VALID_URL, url) is not None
def __init__(self, *args, **kwargs):
super(YoutubeIE, self).__init__(*args, **kwargs)
self._player_cache = {}
- def report_video_webpage_download(self, video_id):
- """Report attempt to download video webpage."""
- self.to_screen(u'%s: Downloading video webpage' % video_id)
-
def report_video_info_webpage_download(self, video_id):
"""Report attempt to download video info webpage."""
self.to_screen(u'%s: Downloading video info webpage' % video_id)
@@ -1031,6 +1015,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
"""Turn the encrypted s field into a working signature"""
if player_url is not None:
+ if player_url.startswith(u'//'):
+ player_url = u'https:' + player_url
try:
player_id = (player_url, len(s))
if player_id not in self._player_cache:
@@ -1094,7 +1080,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
else:
raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
- def _get_available_subtitles(self, video_id):
+ def _get_available_subtitles(self, video_id, webpage):
try:
sub_list = self._download_webpage(
'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
@@ -1110,7 +1096,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
params = compat_urllib_parse.urlencode({
'lang': lang,
'v': video_id,
- 'fmt': self._downloader.params.get('subtitlesformat'),
+ 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
'name': l[0].encode('utf-8'),
})
url = u'http://www.youtube.com/api/timedtext?' + params
@@ -1123,7 +1109,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
def _get_available_automatic_caption(self, video_id, webpage):
"""We need the webpage for getting the captions url, pass it as an
argument to speed up the process."""
- sub_format = self._downloader.params.get('subtitlesformat')
+ sub_format = self._downloader.params.get('subtitlesformat', 'srt')
self.to_screen(u'%s: Looking for automatic captions' % video_id)
mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
err_msg = u'Couldn\'t find automatic captions for %s' % video_id
@@ -1142,8 +1128,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'asrs': 1,
})
list_url = caption_url + '&' + list_params
- list_page = self._download_webpage(list_url, video_id)
- caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
+ caption_list = self._download_xml(list_url, video_id)
original_lang_node = caption_list.find('track')
if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
self._downloader.report_warning(u'Video doesn\'t have automatic captions')
@@ -1257,15 +1242,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
video_id = self._extract_id(url)
# Get video webpage
- self.report_video_webpage_download(video_id)
url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
- request = compat_urllib_request.Request(url)
- try:
- video_webpage_bytes = compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
-
- video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
+ video_webpage = self._download_webpage(url, video_id)
# Attempt to extract SWF player URL
mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
@@ -1282,7 +1260,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# We simulate the access to the video from www.youtube.com/v/{video_id}
# this can be viewed without login into Youtube
data = compat_urllib_parse.urlencode({'video_id': video_id,
- 'el': 'embedded',
+ 'el': 'player_embedded',
'gl': 'US',
'hl': 'en',
'eurl': 'https://youtube.googleapis.com/v/' + video_id,
@@ -1311,6 +1289,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
else:
raise ExtractorError(u'"token" parameter not in video info for unknown reason')
+ if 'view_count' in video_info:
+ view_count = int(video_info['view_count'][0])
+ else:
+ view_count = None
+
# Check for "rental" videos
if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
raise ExtractorError(u'"rental" videos not supported')
@@ -1360,6 +1343,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# description
video_description = get_element_by_id("eow-description", video_webpage)
if video_description:
+ video_description = re.sub(r'''(?x)
+ <a\s+
+ (?:[a-zA-Z-]+="[^"]+"\s+)*?
+ title="([^"]+)"\s+
+ (?:[a-zA-Z-]+="[^"]+"\s+)*?
+ class="yt-uix-redirect-link"\s*>
+ [^<]+
+ </a>
+ ''', r'\1', video_description)
video_description = clean_html(video_description)
else:
fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
@@ -1368,6 +1360,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
else:
video_description = u''
+ def _extract_count(klass):
+ count = self._search_regex(
+ r'class="%s">([\d,]+)</span>' % re.escape(klass),
+ video_webpage, klass, default=None)
+ if count is not None:
+ return int(count.replace(',', ''))
+ return None
+ like_count = _extract_count(u'likes-count')
+ dislike_count = _extract_count(u'dislikes-count')
+
# subtitles
video_subtitles = self.extract_subtitles(video_id, video_webpage)
@@ -1377,9 +1379,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
if 'length_seconds' not in video_info:
self._downloader.report_warning(u'unable to extract video duration')
- video_duration = ''
+ video_duration = None
else:
- video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
+ video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
# annotations
video_annotations = None
@@ -1497,11 +1499,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'subtitles': video_subtitles,
'duration': video_duration,
'age_limit': 18 if age_gate else 0,
- 'annotations': video_annotations
+ 'annotations': video_annotations,
+ 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
+ 'view_count': view_count,
+ 'like_count': like_count,
+ 'dislike_count': dislike_count,
})
return results
-class YoutubePlaylistIE(InfoExtractor):
+class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
IE_DESC = u'YouTube.com playlists'
_VALID_URL = r"""(?:
(?:https?://)?
@@ -1512,13 +1518,14 @@ class YoutubePlaylistIE(InfoExtractor):
\? (?:.*?&)*? (?:p|a|list)=
| p/
)
- ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
+ ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
.*
|
- ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
+ ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
)"""
- _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
- _MAX_RESULTS = 50
+ _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
+ _MORE_PAGES_INDICATOR = r'data-link-type="next"'
+ _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
IE_NAME = u'youtube:playlist'
@classmethod
@@ -1526,6 +1533,27 @@ class YoutubePlaylistIE(InfoExtractor):
"""Receives a URL and returns True if suitable for this IE."""
return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
+ def _real_initialize(self):
+ self._login()
+
+ def _ids_to_results(self, ids):
+ return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
+ for vid_id in ids]
+
+ def _extract_mix(self, playlist_id):
+ # The mixes are generated from a a single video
+ # the id of the playlist is just 'RD' + video_id
+ url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
+ webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
+ title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
+ get_element_by_attribute('class', 'title ', webpage))
+ title = clean_html(title_span)
+ video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
+ ids = orderedSet(re.findall(video_re, webpage))
+ url_results = self._ids_to_results(ids)
+
+ return self.playlist_result(url_results, playlist_id, title)
+
def _real_extract(self, url):
# Extract playlist id
mobj = re.match(self._VALID_URL, url, re.VERBOSE)
@@ -1539,51 +1567,73 @@ class YoutubePlaylistIE(InfoExtractor):
video_id = query_dict['v'][0]
if self._downloader.params.get('noplaylist'):
self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
- return self.url_result('https://www.youtube.com/watch?v=' + video_id, 'Youtube')
+ return self.url_result(video_id, 'Youtube', video_id=video_id)
else:
self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
- # Download playlist videos from API
- videos = []
+ if playlist_id.startswith('RD'):
+ # Mixes require a custom extraction process
+ return self._extract_mix(playlist_id)
+ if playlist_id.startswith('TL'):
+ raise ExtractorError(u'For downloading YouTube.com top lists, use '
+ u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
+
+ # Extract the video ids from the playlist pages
+ ids = []
for page_num in itertools.count(1):
- start_index = self._MAX_RESULTS * (page_num - 1) + 1
- if start_index >= 1000:
- self._downloader.report_warning(u'Max number of results reached')
- break
- url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
+ url = self._TEMPLATE_URL % (playlist_id, page_num)
page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
+ matches = re.finditer(self._VIDEO_RE, page)
+ # We remove the duplicates and the link with index 0
+ # (it's not the first video of the playlist)
+ new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
+ ids.extend(new_ids)
- try:
- response = json.loads(page)
- except ValueError as err:
- raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
-
- if 'feed' not in response:
- raise ExtractorError(u'Got a malformed response from YouTube API')
- playlist_title = response['feed']['title']['$t']
- if 'entry' not in response['feed']:
- # Number of videos is a multiple of self._MAX_RESULTS
+ if re.search(self._MORE_PAGES_INDICATOR, page) is None:
break
- for entry in response['feed']['entry']:
- index = entry['yt$position']['$t']
- if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
- videos.append((
- index,
- 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
- ))
+ playlist_title = self._og_search_title(page)
- videos = [v[1] for v in sorted(videos)]
+ url_results = self._ids_to_results(ids)
+ return self.playlist_result(url_results, playlist_id, playlist_title)
- url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
- return [self.playlist_result(url_results, playlist_id, playlist_title)]
+
+class YoutubeTopListIE(YoutubePlaylistIE):
+ IE_NAME = u'youtube:toplist'
+ IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
+ u' (Example: "yttoplist:music:Top Tracks")')
+ _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ channel = mobj.group('chann')
+ title = mobj.group('title')
+ query = compat_urllib_parse.urlencode({'title': title})
+ playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(query)
+ channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
+ link = self._html_search_regex(playlist_re, channel_page, u'list')
+ url = compat_urlparse.urljoin('https://www.youtube.com/', link)
+
+ video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
+ ids = []
+ # sometimes the webpage doesn't contain the videos
+ # retry until we get them
+ for i in itertools.count(0):
+ msg = u'Downloading Youtube mix'
+ if i > 0:
+ msg += ', retry #%d' % i
+ webpage = self._download_webpage(url, title, msg)
+ ids = orderedSet(re.findall(video_re, webpage))
+ if ids:
+ break
+ url_results = self._ids_to_results(ids)
+ return self.playlist_result(url_results, playlist_title=title)
class YoutubeChannelIE(InfoExtractor):
IE_DESC = u'YouTube.com channels'
_VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
- _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
_MORE_PAGES_INDICATOR = 'yt-uix-load-more'
_MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
IE_NAME = u'youtube:channel'
@@ -1604,36 +1654,38 @@ class YoutubeChannelIE(InfoExtractor):
# Download channel page
channel_id = mobj.group(1)
video_ids = []
- pagenum = 1
-
- url = self._TEMPLATE_URL % (channel_id, pagenum)
- page = self._download_webpage(url, channel_id,
- u'Downloading page #%s' % pagenum)
-
- # Extract video identifiers
- ids_in_page = self.extract_videos_from_page(page)
- video_ids.extend(ids_in_page)
-
- # Download any subsequent channel pages using the json-based channel_ajax query
- if self._MORE_PAGES_INDICATOR in page:
+ url = 'https://www.youtube.com/channel/%s/videos' % channel_id
+ channel_page = self._download_webpage(url, channel_id)
+ autogenerated = re.search(r'''(?x)
+ class="[^"]*?(?:
+ channel-header-autogenerated-label|
+ yt-channel-title-autogenerated
+ )[^"]*"''', channel_page) is not None
+
+ if autogenerated:
+ # The videos are contained in a single page
+ # the ajax pages can't be used, they are empty
+ video_ids = self.extract_videos_from_page(channel_page)
+ else:
+ # Download all channel pages using the json-based channel_ajax query
for pagenum in itertools.count(1):
url = self._MORE_PAGES_URL % (pagenum, channel_id)
page = self._download_webpage(url, channel_id,
u'Downloading page #%s' % pagenum)
-
+
page = json.loads(page)
-
+
ids_in_page = self.extract_videos_from_page(page['content_html'])
video_ids.extend(ids_in_page)
-
- if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
+
+ if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
break
self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
- urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
- url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
- return [self.playlist_result(url_entries, channel_id)]
+ url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
+ for video_id in video_ids]
+ return self.playlist_result(url_entries, channel_id)
class YoutubeUserIE(InfoExtractor):
@@ -1697,9 +1749,11 @@ class YoutubeUserIE(InfoExtractor):
if len(ids_in_page) < self._GDATA_PAGE_SIZE:
break
- urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
- url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
- return [self.playlist_result(url_results, playlist_title = username)]
+ url_results = [
+ self.url_result(video_id, 'Youtube', video_id=video_id)
+ for video_id in video_ids]
+ return self.playlist_result(url_results, playlist_title=username)
+
class YoutubeSearchIE(SearchInfoExtractor):
IE_DESC = u'YouTube.com searches'
@@ -1708,10 +1762,6 @@ class YoutubeSearchIE(SearchInfoExtractor):
IE_NAME = u'youtube:search'
_SEARCH_KEY = 'ytsearch'
- def report_download_page(self, query, pagenum):
- """Report attempt to download search page with given number."""
- self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
-
def _get_n_results(self, query, n):
"""Get a specified number of results for a query"""
@@ -1720,16 +1770,15 @@ class YoutubeSearchIE(SearchInfoExtractor):
limit = n
while (50 * pagenum) < limit:
- self.report_download_page(query, pagenum+1)
result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
- request = compat_urllib_request.Request(result_url)
- try:
- data = compat_urllib_request.urlopen(request).read().decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
- api_response = json.loads(data)['data']
-
- if not 'items' in api_response:
+ data_json = self._download_webpage(
+ result_url, video_id=u'query "%s"' % query,
+ note=u'Downloading page %s' % (pagenum + 1),
+ errnote=u'Unable to download API page')
+ data = json.loads(data_json)
+ api_response = data['data']
+
+ if 'items' not in api_response:
raise ExtractorError(u'[youtube] No video results')
new_ids = list(video['id'] for video in api_response['items'])
@@ -1740,9 +1789,15 @@ class YoutubeSearchIE(SearchInfoExtractor):
if len(video_ids) > n:
video_ids = video_ids[:n]
- videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
+ videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
+ for video_id in video_ids]
return self.playlist_result(videos, query)
+class YoutubeSearchDateIE(YoutubeSearchIE):
+ IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
+ _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
+ _SEARCH_KEY = 'ytsearchdate'
+ IE_DESC = u'YouTube.com searches, newest videos first'
class YoutubeShowIE(InfoExtractor):
IE_DESC = u'YouTube.com (multi-season) shows'
@@ -1766,7 +1821,6 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
"""
_LOGIN_REQUIRED = True
- _PAGING_STEP = 30
# use action_load_personal_feed instead of action_load_system_feed
_PERSONAL_FEED = False
@@ -1786,9 +1840,8 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
def _real_extract(self, url):
feed_entries = []
- # The step argument is available only in 2.7 or higher
- for i in itertools.count(0):
- paging = i*self._PAGING_STEP
+ paging = 0
+ for i in itertools.count(1):
info = self._download_webpage(self._FEED_TEMPLATE % paging,
u'%s feed' % self._FEED_NAME,
u'Downloading page %s' % i)
@@ -1796,9 +1849,12 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
feed_html = info['feed_html']
m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
ids = orderedSet(m.group(1) for m in m_ids)
- feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
+ feed_entries.extend(
+ self.url_result(video_id, 'Youtube', video_id=video_id)
+ for video_id in ids)
if info['paging'] is None:
break
+ paging = info['paging']
return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
@@ -1818,9 +1874,15 @@ class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
_VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
_FEED_NAME = 'watch_later'
_PLAYLIST_TITLE = u'Youtube Watch Later'
- _PAGING_STEP = 100
_PERSONAL_FEED = True
+class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
+ IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
+ _VALID_URL = u'https?://www\.youtube\.com/feed/history|:ythistory'
+ _FEED_NAME = 'history'
+ _PERSONAL_FEED = True
+ _PLAYLIST_TITLE = u'Youtube Watch History'
+
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
IE_NAME = u'youtube:favorites'
IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py
index faed7ff7f..35ece354a 100644
--- a/youtube_dl/extractor/zdf.py
+++ b/youtube_dl/extractor/zdf.py
@@ -1,75 +1,125 @@
+# coding: utf-8
+
+import operator
import re
from .common import InfoExtractor
from ..utils import (
- determine_ext,
- ExtractorError,
+ unified_strdate,
)
class ZDFIE(InfoExtractor):
- _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek(?P<hash>#)?\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
- _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
+ _VALID_URL = r'^https?://www\.zdf\.de/ZDFmediathek(?P<hash>#)?/(.*beitrag/(?:video/)?)(?P<video_id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?'
+
+ _TEST = {
+ u"url": u"http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt",
+ u"file": u"2037704.webm",
+ u"info_dict": {
+ u"upload_date": u"20131127",
+ u"description": u"Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. Aber was bedeutet das für die Bürger? Sehen Sie hierzu das ZDFspezial \"Ende des Machtpokers - Große Koalition für Deutschland\".",
+ u"uploader": u"spezial",
+ u"title": u"ZDFspezial - Ende des Machtpokers"
+ },
+ u"skip": u"Videos on ZDF.de are depublicised in short order",
+ }
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group('video_id')
- if mobj.group('hash'):
- url = url.replace(u'#', u'', 1)
+ xml_url = u'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
+ doc = self._download_xml(
+ xml_url, video_id,
+ note=u'Downloading video info',
+ errnote=u'Failed to download video info')
+
+ title = doc.find('.//information/title').text
+ description = doc.find('.//information/detail').text
+ uploader_node = doc.find('.//details/originChannelTitle')
+ uploader = None if uploader_node is None else uploader_node.text
+ duration_str = doc.find('.//details/length').text
+ duration_m = re.match(r'''(?x)^
+ (?P<hours>[0-9]{2})
+ :(?P<minutes>[0-9]{2})
+ :(?P<seconds>[0-9]{2})
+ (?:\.(?P<ms>[0-9]+)?)
+ ''', duration_str)
+ duration = (
+ (
+ (int(duration_m.group('hours')) * 60 * 60) +
+ (int(duration_m.group('minutes')) * 60) +
+ int(duration_m.group('seconds'))
+ )
+ if duration_m
+ else None
+ )
+ upload_date = unified_strdate(doc.find('.//details/airtime').text)
+
+ def xml_to_format(fnode):
+ video_url = fnode.find('url').text
+ is_available = u'http://www.metafilegenerator' not in video_url
- html = self._download_webpage(url, video_id)
- streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
- if streams is None:
- raise ExtractorError(u'No media url found.')
+ format_id = fnode.attrib['basetype']
+ format_m = re.match(r'''(?x)
+ (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_
+ (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+)
+ ''', format_id)
- # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
- # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
- # choose first/default media type and highest quality for now
- def stream_pref(s):
- TYPE_ORDER = ['ostreaming', 'hstreaming', 'wstreaming']
+ ext = format_m.group('container')
+ is_supported = ext != 'f4f'
+
+ PROTO_ORDER = ['http', 'rtmp', 'rtsp']
try:
- type_pref = TYPE_ORDER.index(s['media_type'])
+ proto_pref = -PROTO_ORDER.index(format_m.group('proto'))
except ValueError:
- type_pref = 999
+ proto_pref = -999
- QUALITY_ORDER = ['veryhigh', '300']
+ quality = fnode.find('./quality').text
+ QUALITY_ORDER = ['veryhigh', '300', 'high', 'med', 'low']
try:
- quality_pref = QUALITY_ORDER.index(s['quality'])
+ quality_pref = -QUALITY_ORDER.index(quality)
except ValueError:
- quality_pref = 999
-
- return (type_pref, quality_pref)
-
- sorted_streams = sorted(streams, key=stream_pref)
- if not sorted_streams:
- raise ExtractorError(u'No stream found.')
- stream = sorted_streams[0]
+ quality_pref = -999
- media_link = self._download_webpage(
- stream['video_url'],
- video_id,
- u'Get stream URL')
+ abr = int(fnode.find('./audioBitrate').text) // 1000
+ vbr = int(fnode.find('./videoBitrate').text) // 1000
+ pref = (is_available, is_supported,
+ proto_pref, quality_pref, vbr, abr)
- MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
- RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
+ format_note = u''
+ if not is_supported:
+ format_note += u'(unsupported)'
+ if not format_note:
+ format_note = None
- mobj = re.search(self._MEDIA_STREAM, media_link)
- if mobj is None:
- mobj = re.search(RTSP_STREAM, media_link)
- if mobj is None:
- raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
- video_url = mobj.group('video_url')
+ return {
+ 'format_id': format_id + u'-' + quality,
+ 'url': video_url,
+ 'ext': ext,
+ 'acodec': format_m.group('acodec'),
+ 'vcodec': format_m.group('vcodec'),
+ 'abr': abr,
+ 'vbr': vbr,
+ 'width': int(fnode.find('./width').text),
+ 'height': int(fnode.find('./height').text),
+ 'filesize': int(fnode.find('./filesize').text),
+ 'format_note': format_note,
+ '_pref': pref,
+ '_available': is_available,
+ }
- title = self._html_search_regex(
- r'<h1(?: class="beitragHeadline")?>(.*?)</h1>',
- html, u'title')
+ format_nodes = doc.findall('.//formitaeten/formitaet')
+ formats = sorted(filter(lambda f: f['_available'],
+ map(xml_to_format, format_nodes)),
+ key=operator.itemgetter('_pref'))
return {
'id': video_id,
- 'url': video_url,
'title': title,
- 'ext': determine_ext(video_url)
+ 'formats': formats,
+ 'description': description,
+ 'uploader': uploader,
+ 'duration': duration,
+ 'upload_date': upload_date,
}