about summary refs log tree commit diff
path: root/youtube_dl
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl')
-rw-r--r--youtube_dl/PostProcessor.py15
-rw-r--r--youtube_dl/YoutubeDL.py17
-rw-r--r--youtube_dl/__init__.py45
-rw-r--r--youtube_dl/extractor/__init__.py17
-rw-r--r--youtube_dl/extractor/arte.py183
-rw-r--r--youtube_dl/extractor/brightcove.py7
-rw-r--r--youtube_dl/extractor/cinemassacre.py91
-rw-r--r--youtube_dl/extractor/gamespot.py71
-rw-r--r--youtube_dl/extractor/generic.py23
-rw-r--r--youtube_dl/extractor/internetvideoarchive.py87
-rw-r--r--youtube_dl/extractor/nowvideo.py43
-rw-r--r--youtube_dl/extractor/rottentomatoes.py16
-rw-r--r--youtube_dl/extractor/rutube.py58
-rw-r--r--youtube_dl/extractor/sztvhu.py44
-rw-r--r--youtube_dl/extractor/techtalks.py65
-rw-r--r--youtube_dl/extractor/tudou.py36
-rw-r--r--youtube_dl/extractor/videodetective.py30
-rw-r--r--youtube_dl/extractor/videopremium.py40
-rw-r--r--youtube_dl/extractor/vimeo.py11
-rw-r--r--youtube_dl/extractor/websurg.py59
-rw-r--r--youtube_dl/extractor/youku.py2
-rw-r--r--youtube_dl/extractor/youtube.py12
-rw-r--r--youtube_dl/utils.py35
-rw-r--r--youtube_dl/version.py2
24 files changed, 878 insertions(+), 131 deletions(-)
diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py
index fbf8a7f98..13b56ede5 100644
--- a/youtube_dl/PostProcessor.py
+++ b/youtube_dl/PostProcessor.py
@@ -2,9 +2,15 @@ import os
import subprocess
import sys
import time
-import datetime
-from .utils import *
+
+from .utils import (
+ compat_subprocess_get_DEVNULL,
+ encodeFilename,
+ PostProcessingError,
+ shell_quote,
+ subtitles_filename,
+)
class PostProcessor(object):
@@ -83,6 +89,8 @@ class FFmpegPostProcessor(PostProcessor):
+ opts +
[encodeFilename(self._ffmpeg_filename_argument(out_path))])
+ if self._downloader.params.get('verbose', False):
+ self._downloader.to_screen(u'[debug] ffmpeg command line: %s' % shell_quote(cmd))
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout,stderr = p.communicate()
if p.returncode != 0:
@@ -178,7 +186,8 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
extension = self._preferredcodec
more_opts = []
if self._preferredquality is not None:
- if int(self._preferredquality) < 10:
+ # The opus codec doesn't support the -aq option
+ if int(self._preferredquality) < 10 and extension != 'opus':
more_opts += [self._exes['avconv'] and '-q:a' or '-aq', self._preferredquality]
else:
more_opts += [self._exes['avconv'] and '-b:a' or '-ab', self._preferredquality + 'k']
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index a32e50772..f22a8bd0e 100644
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -71,6 +71,7 @@ class YoutubeDL(object):
logtostderr: Log messages to stderr instead of stdout.
writedescription: Write the video description to a .description file
writeinfojson: Write the video description to a .info.json file
+ writeannotations: Write the video annotations to a .annotations.xml file
writethumbnail: Write the thumbnail image to a file
writesubtitles: Write the video subtitles to a file
writeautomaticsub: Write the automatic subtitles to a file
@@ -258,6 +259,10 @@ class YoutubeDL(object):
""" Report that the metadata file has been written """
self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
+ def report_writeannotations(self, annofn):
+ """ Report that the annotations file has been written. """
+ self.to_screen(u'[info] Writing video annotations to: ' + annofn)
+
def report_file_already_downloaded(self, file_name):
"""Report file has already been fully downloaded."""
try:
@@ -599,6 +604,18 @@ class YoutubeDL(object):
self.report_error(u'Cannot write description file ' + descfn)
return
+ if self.params.get('writeannotations', False):
+ try:
+ annofn = filename + u'.annotations.xml'
+ self.report_writeannotations(annofn)
+ with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
+ annofile.write(info_dict['annotations'])
+ except (KeyError, TypeError):
+ self.report_warning(u'There are no annotations to write.')
+ except (OSError, IOError):
+ self.report_error(u'Cannot write annotations file: ' + annofn)
+ return
+
subtitles_are_requested = any([self.params.get('writesubtitles', False),
self.params.get('writeautomaticsub')])
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index bc8e97250..cd642ce3b 100644
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -31,6 +31,7 @@ __authors__ = (
'Huarong Huo',
'Ismael Mejía',
'Steffan \'Ruirize\' James',
+ 'Andras Elso',
)
__license__ = 'Public Domain'
@@ -46,17 +47,43 @@ import shlex
import socket
import subprocess
import sys
-import warnings
+import traceback
import platform
-from .utils import *
+from .utils import (
+ compat_cookiejar,
+ compat_print,
+ compat_str,
+ compat_urllib_request,
+ DateRange,
+ decodeOption,
+ determine_ext,
+ DownloadError,
+ get_cachedir,
+ make_HTTPS_handler,
+ MaxDownloadsReached,
+ platform_name,
+ preferredencoding,
+ SameFileError,
+ std_headers,
+ write_string,
+ YoutubeDLHandler,
+)
from .update import update_self
from .version import __version__
-from .FileDownloader import *
+from .FileDownloader import (
+ FileDownloader,
+)
from .extractor import gen_extractors
from .YoutubeDL import YoutubeDL
-from .PostProcessor import *
+from .PostProcessor import (
+ FFmpegMetadataPP,
+ FFmpegVideoConvertor,
+ FFmpegExtractAudioPP,
+ FFmpegEmbedSubtitlePP,
+)
+
def parseOpts(overrideArguments=None):
def _readOptions(filename_bytes):
@@ -240,11 +267,11 @@ def parseOpts(overrideArguments=None):
help='languages of the subtitles to download (optional) separated by commas, use IETF language tags like \'en,pt\'')
downloader.add_option('-r', '--rate-limit',
- dest='ratelimit', metavar='LIMIT', help='maximum download rate (e.g. 50k or 44.6m)')
+ dest='ratelimit', metavar='LIMIT', help='maximum download rate in bytes per second (e.g. 50K or 4.2M)')
downloader.add_option('-R', '--retries',
dest='retries', metavar='RETRIES', help='number of retries (default is %default)', default=10)
downloader.add_option('--buffer-size',
- dest='buffersize', metavar='SIZE', help='size of download buffer (e.g. 1024 or 16k) (default is %default)', default="1024")
+ dest='buffersize', metavar='SIZE', help='size of download buffer (e.g. 1024 or 16K) (default is %default)', default="1024")
downloader.add_option('--no-resize-buffer',
action='store_true', dest='noresizebuffer',
help='do not automatically adjust the buffer size. By default, the buffer size is automatically resized from an initial value of SIZE.', default=False)
@@ -339,6 +366,9 @@ def parseOpts(overrideArguments=None):
filesystem.add_option('--write-info-json',
action='store_true', dest='writeinfojson',
help='write video metadata to a .info.json file', default=False)
+ filesystem.add_option('--write-annotations',
+ action='store_true', dest='writeannotations',
+                          help='write video annotations to a .annotations.xml file', default=False)
filesystem.add_option('--write-thumbnail',
action='store_true', dest='writethumbnail',
help='write thumbnail image to disk', default=False)
@@ -601,6 +631,7 @@ def _real_main(argv=None):
'nopart': opts.nopart,
'updatetime': opts.updatetime,
'writedescription': opts.writedescription,
+ 'writeannotations': opts.writeannotations,
'writeinfojson': opts.writeinfojson,
'writethumbnail': opts.writethumbnail,
'writesubtitles': opts.writesubtitles,
@@ -684,7 +715,7 @@ def _real_main(argv=None):
if opts.cookiefile is not None:
try:
jar.save()
- except (IOError, OSError) as err:
+ except (IOError, OSError):
sys.exit(u'ERROR: unable to save cookie jar')
sys.exit(retcode)
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 688196869..db69af361 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -2,7 +2,12 @@ from .appletrailers import AppleTrailersIE
from .addanime import AddAnimeIE
from .archiveorg import ArchiveOrgIE
from .ard import ARDIE
-from .arte import ArteTvIE
+from .arte import (
+ ArteTvIE,
+ ArteTVPlus7IE,
+ ArteTVCreativeIE,
+ ArteTVFutureIE,
+)
from .auengine import AUEngineIE
from .bandcamp import BandcampIE
from .bliptv import BlipTVIE, BlipTVUserIE
@@ -12,6 +17,7 @@ from .brightcove import BrightcoveIE
from .c56 import C56IE
from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE
+from .cinemassacre import CinemassacreIE
from .cnn import CNNIE
from .collegehumor import CollegeHumorIE
from .comedycentral import ComedyCentralIE
@@ -61,6 +67,7 @@ from .ign import IGNIE, OneUPIE
from .ina import InaIE
from .infoq import InfoQIE
from .instagram import InstagramIE
+from .internetvideoarchive import InternetVideoArchiveIE
from .jeuxvideo import JeuxVideoIE
from .jukebox import JukeboxIE
from .justintv import JustinTVIE
@@ -82,6 +89,7 @@ from .nba import NBAIE
from .nbc import NBCNewsIE
from .newgrounds import NewgroundsIE
from .nhl import NHLIE, NHLVideocenterIE
+from .nowvideo import NowVideoIE
from .ooyala import OoyalaIE
from .orf import ORFIE
from .pbs import PBSIE
@@ -91,8 +99,10 @@ from .rbmaradio import RBMARadioIE
from .redtube import RedTubeIE
from .ringtv import RingTVIE
from .ro220 import Ro220IE
+from .rottentomatoes import RottenTomatoesIE
from .roxwel import RoxwelIE
from .rtlnow import RTLnowIE
+from .rutube import RutubeIE
from .sina import SinaIE
from .slashdot import SlashdotIE
from .slideshare import SlideshareIE
@@ -103,7 +113,9 @@ from .spiegel import SpiegelIE
from .stanfordoc import StanfordOpenClassroomIE
from .statigram import StatigramIE
from .steam import SteamIE
+from .sztvhu import SztvHuIE
from .teamcoco import TeamcocoIE
+from .techtalks import TechTalksIE
from .ted import TEDIE
from .tf1 import TF1IE
from .thisav import ThisAVIE
@@ -120,10 +132,13 @@ from .veoh import VeohIE
from .vevo import VevoIE
from .vice import ViceIE
from .viddler import ViddlerIE
+from .videodetective import VideoDetectiveIE
from .videofyme import VideofyMeIE
+from .videopremium import VideoPremiumIE
from .vimeo import VimeoIE, VimeoChannelIE
from .vine import VineIE
from .wat import WatIE
+from .websurg import WeBSurgIE
from .weibo import WeiboIE
from .wimp import WimpIE
from .worldstarhiphop import WorldStarHipHopIE
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 4707d7cca..5ee8a67b1 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -1,3 +1,4 @@
+# encoding: utf-8
import re
import json
import xml.etree.ElementTree
@@ -7,15 +8,15 @@ from ..utils import (
ExtractorError,
find_xpath_attr,
unified_strdate,
+ determine_ext,
+ get_element_by_id,
)
+# There are different sources of video in arte.tv, the extraction process
+# is different for each one. The videos usually expire in 7 days, so we can't
+# add tests.
+
class ArteTvIE(InfoExtractor):
- """
- There are two sources of video in arte.tv: videos.arte.tv and
- www.arte.tv/guide, the extraction process is different for each one.
- The videos expire in 7 days, so we can't add tests.
- """
- _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
_VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?P<lang>fr|de)/.*-(?P<id>.*?).html'
_LIVEWEB_URL = r'(?:http://)?liveweb.arte.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)'
_LIVE_URL = r'index-[0-9]+\.html$'
@@ -24,7 +25,7 @@ class ArteTvIE(InfoExtractor):
@classmethod
def suitable(cls, url):
- return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL, cls._LIVEWEB_URL))
+ return any(re.match(regex, url) for regex in (cls._VIDEOS_URL, cls._LIVEWEB_URL))
# TODO implement Live Stream
# from ..utils import compat_urllib_parse
@@ -55,14 +56,6 @@ class ArteTvIE(InfoExtractor):
# video_url = u'%s/%s' % (info.get('url'), info.get('path'))
def _real_extract(self, url):
- mobj = re.match(self._EMISSION_URL, url)
- if mobj is not None:
- lang = mobj.group('lang')
- # This is not a real id, it can be for example AJT for the news
- # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
- video_id = mobj.group('id')
- return self._extract_emission(url, video_id, lang)
-
mobj = re.match(self._VIDEOS_URL, url)
if mobj is not None:
id = mobj.group('id')
@@ -80,59 +73,6 @@ class ArteTvIE(InfoExtractor):
# self.extractLiveStream(url)
# return
- def _extract_emission(self, url, video_id, lang):
- """Extract from www.arte.tv/guide"""
- webpage = self._download_webpage(url, video_id)
- json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
-
- json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
- self.report_extraction(video_id)
- info = json.loads(json_info)
- player_info = info['videoJsonPlayer']
-
- info_dict = {'id': player_info['VID'],
- 'title': player_info['VTI'],
- 'description': player_info.get('VDE'),
- 'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]),
- 'thumbnail': player_info['programImage'],
- 'ext': 'flv',
- }
-
- formats = player_info['VSR'].values()
- def _match_lang(f):
- # Return true if that format is in the language of the url
- if lang == 'fr':
- l = 'F'
- elif lang == 'de':
- l = 'A'
- regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l]
- return any(re.match(r, f['versionCode']) for r in regexes)
- # Some formats may not be in the same language as the url
- formats = filter(_match_lang, formats)
- # Some formats use the m3u8 protocol
- formats = filter(lambda f: f['videoFormat'] != 'M3U8', formats)
- # We order the formats by quality
- formats = sorted(formats, key=lambda f: int(f['height']))
- # Prefer videos without subtitles in the same language
- formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f['versionCode']) is None)
- # Pick the best quality
- def _format(format_info):
- info = {'ext': 'flv',
- 'width': format_info.get('width'),
- 'height': format_info.get('height'),
- }
- if format_info['mediaType'] == u'rtmp':
- info['url'] = format_info['streamer']
- info['play_path'] = 'mp4:' + format_info['url']
- else:
- info_dict['url'] = format_info['url']
- return info
- info_dict['formats'] = [_format(f) for f in formats]
- # TODO: Remove when #980 has been merged
- info_dict.update(info_dict['formats'][-1])
-
- return info_dict
-
def _extract_video(self, url, video_id, lang):
"""Extract from videos.arte.tv"""
ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
@@ -182,3 +122,110 @@ class ArteTvIE(InfoExtractor):
'ext': 'flv',
'thumbnail': self._og_search_thumbnail(webpage),
}
+
+
+class ArteTVPlus7IE(InfoExtractor):
+ IE_NAME = u'arte.tv:+7'
+ _VALID_URL = r'https?://www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
+
+ @classmethod
+ def _extract_url_info(cls, url):
+ mobj = re.match(cls._VALID_URL, url)
+ lang = mobj.group('lang')
+ # This is not a real id, it can be for example AJT for the news
+ # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
+ video_id = mobj.group('id')
+ return video_id, lang
+
+ def _real_extract(self, url):
+ video_id, lang = self._extract_url_info(url)
+ webpage = self._download_webpage(url, video_id)
+ return self._extract_from_webpage(webpage, video_id, lang)
+
+ def _extract_from_webpage(self, webpage, video_id, lang):
+ json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
+
+ json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
+ self.report_extraction(video_id)
+ info = json.loads(json_info)
+ player_info = info['videoJsonPlayer']
+
+ info_dict = {
+ 'id': player_info['VID'],
+ 'title': player_info['VTI'],
+ 'description': player_info.get('VDE'),
+ 'upload_date': unified_strdate(player_info.get('VDA', '').split(' ')[0]),
+ 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
+ }
+
+ formats = player_info['VSR'].values()
+ def _match_lang(f):
+ if f.get('versionCode') is None:
+ return True
+ # Return true if that format is in the language of the url
+ if lang == 'fr':
+ l = 'F'
+ elif lang == 'de':
+ l = 'A'
+ regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l]
+ return any(re.match(r, f['versionCode']) for r in regexes)
+ # Some formats may not be in the same language as the url
+ formats = filter(_match_lang, formats)
+ # Some formats use the m3u8 protocol
+ formats = filter(lambda f: f.get('videoFormat') != 'M3U8', formats)
+ # We order the formats by quality
+ formats = sorted(formats, key=lambda f: int(f.get('height',-1)))
+ # Prefer videos without subtitles in the same language
+ formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f.get('versionCode', '')) is None)
+ # Pick the best quality
+ def _format(format_info):
+ info = {
+ 'width': format_info.get('width'),
+ 'height': format_info.get('height'),
+ }
+ if format_info['mediaType'] == u'rtmp':
+ info['url'] = format_info['streamer']
+ info['play_path'] = 'mp4:' + format_info['url']
+ info['ext'] = 'flv'
+ else:
+ info['url'] = format_info['url']
+ info['ext'] = determine_ext(info['url'])
+ return info
+ info_dict['formats'] = [_format(f) for f in formats]
+ # TODO: Remove when #980 has been merged
+ info_dict.update(info_dict['formats'][-1])
+
+ return info_dict
+
+
+# It also uses the arte_vp_url url from the webpage to extract the information
+class ArteTVCreativeIE(ArteTVPlus7IE):
+ IE_NAME = u'arte.tv:creative'
+ _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de)/magazine?/(?P<id>.+)'
+
+ _TEST = {
+ u'url': u'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design',
+ u'file': u'050489-002.mp4',
+ u'info_dict': {
+ u'title': u'Agentur Amateur #2 - Corporate Design',
+ },
+ }
+
+
+class ArteTVFutureIE(ArteTVPlus7IE):
+ IE_NAME = u'arte.tv:future'
+ _VALID_URL = r'https?://future\.arte\.tv/(?P<lang>fr|de)/(thema|sujet)/.*?#article-anchor-(?P<id>\d+)'
+
+ _TEST = {
+ u'url': u'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081',
+ u'file': u'050940-003.mp4',
+ u'info_dict': {
+ u'title': u'Les champignons au secours de la planète',
+ },
+ }
+
+ def _real_extract(self, url):
+ anchor_id, lang = self._extract_url_info(url)
+ webpage = self._download_webpage(url, anchor_id)
+ row = get_element_by_id(anchor_id, webpage)
+ return self._extract_from_webpage(row, anchor_id, lang)
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index 745212f2f..1392f382a 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -53,6 +53,8 @@ class BrightcoveIE(InfoExtractor):
# Fix up some stupid HTML, see https://github.com/rg3/youtube-dl/issues/1553
object_str = re.sub(r'(<param name="[^"]+" value="[^"]+")>',
lambda m: m.group(1) + '/>', object_str)
+ # Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608
+ object_str = object_str.replace(u'<--', u'<!--')
object_doc = xml.etree.ElementTree.fromstring(object_str)
assert u'BrightcoveExperience' in object_doc.attrib['class']
@@ -96,7 +98,10 @@ class BrightcoveIE(InfoExtractor):
playlist_info = self._download_webpage(self._PLAYLIST_URL_TEMPLATE % player_key,
player_key, u'Downloading playlist information')
- playlist_info = json.loads(playlist_info)['videoList']
+ json_data = json.loads(playlist_info)
+ if 'videoList' not in json_data:
+ raise ExtractorError(u'Empty playlist')
+ playlist_info = json_data['videoList']
videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']]
return self.playlist_result(videos, playlist_id=playlist_info['id'],
diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py
new file mode 100644
index 000000000..6925b96c2
--- /dev/null
+++ b/youtube_dl/extractor/cinemassacre.py
@@ -0,0 +1,91 @@
+# encoding: utf-8
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+)
+
+
+class CinemassacreIE(InfoExtractor):
+ _VALID_URL = r'(?:http://)?(?:www\.)?(?P<url>cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/.+?)(?:[/?].*)?'
+ _TESTS = [{
+ u'url': u'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
+ u'file': u'19911.flv',
+ u'info_dict': {
+ u'upload_date': u'20121110',
+ u'title': u'“Angry Video Game Nerd: The Movie” – Trailer',
+ u'description': u'md5:fb87405fcb42a331742a0dce2708560b',
+ },
+ u'params': {
+ # rtmp download
+ u'skip_download': True,
+ },
+ },
+ {
+ u'url': u'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
+ u'file': u'521be8ef82b16.flv',
+ u'info_dict': {
+ u'upload_date': u'20131002',
+ u'title': u'The Mummy’s Hand (1940)',
+ },
+ u'params': {
+ # rtmp download
+ u'skip_download': True,
+ },
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+
+ webpage_url = u'http://' + mobj.group('url')
+ webpage = self._download_webpage(webpage_url, None) # Don't know video id yet
+ video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d')
+ mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/(?:embed|player)\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)
+ if not mobj:
+ raise ExtractorError(u'Can\'t extract embed url and video id')
+ playerdata_url = mobj.group(u'embed_url')
+ video_id = mobj.group(u'video_id')
+
+ video_title = self._html_search_regex(r'<title>(?P<title>.+?)\|',
+ webpage, u'title')
+ video_description = self._html_search_regex(r'<div class="entry-content">(?P<description>.+?)</div>',
+ webpage, u'description', flags=re.DOTALL, fatal=False)
+ if len(video_description) == 0:
+ video_description = None
+
+ playerdata = self._download_webpage(playerdata_url, video_id)
+ base_url = self._html_search_regex(r'\'streamer\': \'(?P<base_url>rtmp://.*?)/(?:vod|Cinemassacre)\'',
+ playerdata, u'base_url')
+ base_url += '/Cinemassacre/'
+ # Important: The file names in playerdata are not used by the player and even wrong for some videos
+ sd_file = 'Cinemassacre-%s_high.mp4' % video_id
+ hd_file = 'Cinemassacre-%s.mp4' % video_id
+ video_thumbnail = 'http://image.screenwavemedia.com/Cinemassacre/Cinemassacre-%s_thumb_640x360.jpg' % video_id
+
+ formats = [
+ {
+ 'url': base_url + sd_file,
+ 'ext': 'flv',
+ 'format': 'sd',
+ 'format_id': 'sd',
+ },
+ {
+ 'url': base_url + hd_file,
+ 'ext': 'flv',
+ 'format': 'hd',
+ 'format_id': 'hd',
+ },
+ ]
+
+ info = {
+ 'id': video_id,
+ 'title': video_title,
+ 'formats': formats,
+ 'description': video_description,
+ 'upload_date': video_date,
+ 'thumbnail': video_thumbnail,
+ }
+ # TODO: Remove when #980 has been merged
+ info.update(formats[-1])
+ return info
diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py
index 5edbf678a..098768361 100644
--- a/youtube_dl/extractor/gamespot.py
+++ b/youtube_dl/extractor/gamespot.py
@@ -1,56 +1,59 @@
import re
-import xml.etree.ElementTree
+import json
from .common import InfoExtractor
from ..utils import (
- unified_strdate,
compat_urllib_parse,
+ compat_urlparse,
+ unescapeHTML,
+ get_meta_content,
)
+
class GameSpotIE(InfoExtractor):
- _WORKING = False
_VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<page_id>\d+)/?'
_TEST = {
u"url": u"http://www.gamespot.com/arma-iii/videos/arma-iii-community-guide-sitrep-i-6410818/",
- u"file": u"6410818.mp4",
+ u"file": u"gs-2300-6410818.mp4",
u"md5": u"b2a30deaa8654fcccd43713a6b6a4825",
u"info_dict": {
u"title": u"Arma 3 - Community Guide: SITREP I",
- u"upload_date": u"20130627",
+ u'description': u'Check out this video where some of the basics of Arma 3 is explained.',
}
}
-
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- page_id = mobj.group('page_id')
+ page_id = video_id = mobj.group('page_id')
webpage = self._download_webpage(url, page_id)
- video_id = self._html_search_regex([r'"og:video" content=".*?\?id=(\d+)"',
- r'http://www\.gamespot\.com/videoembed/(\d+)'],
- webpage, 'video id')
- data = compat_urllib_parse.urlencode({'id': video_id, 'newplayer': '1'})
- info_url = 'http://www.gamespot.com/pages/video_player/xml.php?' + data
- info_xml = self._download_webpage(info_url, video_id)
- doc = xml.etree.ElementTree.fromstring(info_xml)
- clip_el = doc.find('./playList/clip')
+ data_video_json = self._search_regex(r'data-video=\'(.*?)\'', webpage, u'data video')
+ data_video = json.loads(unescapeHTML(data_video_json))
- http_urls = [{'url': node.find('filePath').text,
- 'rate': int(node.find('rate').text)}
- for node in clip_el.find('./httpURI')]
- best_quality = sorted(http_urls, key=lambda f: f['rate'])[-1]
- video_url = best_quality['url']
- title = clip_el.find('./title').text
- ext = video_url.rpartition('.')[2]
- thumbnail_url = clip_el.find('./screenGrabURI').text
- view_count = int(clip_el.find('./views').text)
- upload_date = unified_strdate(clip_el.find('./postDate').text)
+ # Transform the manifest url to a link to the mp4 files
+ # they are used in mobile devices.
+ f4m_url = data_video['videoStreams']['f4m_stream']
+ f4m_path = compat_urlparse.urlparse(f4m_url).path
+ QUALITIES_RE = r'((,\d+)+,?)'
+ qualities = self._search_regex(QUALITIES_RE, f4m_path, u'qualities').strip(',').split(',')
+ http_path = f4m_path[1:].split('/', 1)[1]
+ http_template = re.sub(QUALITIES_RE, r'%s', http_path)
+ http_template = http_template.replace('.csmil/manifest.f4m', '')
+ http_template = compat_urlparse.urljoin('http://video.gamespotcdn.com/', http_template)
+ formats = []
+ for q in qualities:
+ formats.append({
+ 'url': http_template % q,
+ 'ext': 'mp4',
+ 'format_id': q,
+ })
- return [{
- 'id' : video_id,
- 'url' : video_url,
- 'ext' : ext,
- 'title' : title,
- 'thumbnail' : thumbnail_url,
- 'upload_date' : upload_date,
- 'view_count' : view_count,
- }]
+ info = {
+ 'id': data_video['guid'],
+ 'title': compat_urllib_parse.unquote(data_video['title']),
+ 'formats': formats,
+ 'description': get_meta_content('description', webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ }
+ # TODO: Remove when #980 has been merged
+ info.update(formats[-1])
+ return info
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 7060c6f92..89805250c 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -11,6 +11,8 @@ from ..utils import (
compat_urlparse,
ExtractorError,
+ smuggle_url,
+ unescapeHTML,
)
from .brightcove import BrightcoveIE
@@ -29,6 +31,17 @@ class GenericIE(InfoExtractor):
u"title": u"R\u00e9gis plante sa Jeep"
}
},
+ # embedded vimeo video
+ {
+ u'url': u'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references',
+ u'file': u'22444065.mp4',
+ u'md5': u'2903896e23df39722c33f015af0666e2',
+ u'info_dict': {
+ u'title': u'ACCU 2011: Move Semantics,Perfect Forwarding, and Rvalue references- Scott Meyers- 13/04/2011',
+ u"uploader_id": u"skillsmatter",
+ u"uploader": u"Skills Matter",
+ }
+ }
]
def report_download_webpage(self, video_id):
@@ -121,12 +134,20 @@ class GenericIE(InfoExtractor):
self.report_extraction(video_id)
# Look for BrightCove:
- m_brightcove = re.search(r'<object.+?class=([\'"]).*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL)
+ m_brightcove = re.search(r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL)
if m_brightcove is not None:
self.to_screen(u'Brightcove video detected.')
bc_url = BrightcoveIE._build_brighcove_url(m_brightcove.group())
return self.url_result(bc_url, 'Brightcove')
+ # Look for embedded Vimeo player
+ mobj = re.search(
+ r'<iframe\s+src="(https?://player.vimeo.com/video/.*?)"', webpage)
+ if mobj:
+ player_url = unescapeHTML(mobj.group(1))
+ surl = smuggle_url(player_url, {'Referer': url})
+ return self.url_result(surl, 'Vimeo')
+
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None:
diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py
new file mode 100644
index 000000000..5986459d6
--- /dev/null
+++ b/youtube_dl/extractor/internetvideoarchive.py
@@ -0,0 +1,87 @@
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urlparse,
+ compat_urllib_parse,
+ xpath_with_ns,
+ determine_ext,
+)
+
+
+class InternetVideoArchiveIE(InfoExtractor):
+ _VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?'
+
+ _TEST = {
+ u'url': u'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247',
+ u'file': u'452693.mp4',
+ u'info_dict': {
+ u'title': u'SKYFALL',
+ u'description': u'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.',
+ u'duration': 156,
+ },
+ }
+
+ @staticmethod
+ def _build_url(query):
+ return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query
+
+ @staticmethod
+ def _clean_query(query):
+ NEEDED_ARGS = ['publishedid', 'customerid']
+ query_dic = compat_urlparse.parse_qs(query)
+ cleaned_dic = dict((k,v[0]) for (k,v) in query_dic.items() if k in NEEDED_ARGS)
+ # Other player ids return m3u8 urls
+ cleaned_dic['playerid'] = '247'
+ cleaned_dic['videokbrate'] = '100000'
+ return compat_urllib_parse.urlencode(cleaned_dic)
+
+ def _real_extract(self, url):
+ query = compat_urlparse.urlparse(url).query
+ query_dic = compat_urlparse.parse_qs(query)
+ video_id = query_dic['publishedid'][0]
+ url = self._build_url(query)
+
+ flashconfiguration_xml = self._download_webpage(url, video_id,
+ u'Downloading flash configuration')
+ flashconfiguration = xml.etree.ElementTree.fromstring(flashconfiguration_xml.encode('utf-8'))
+ file_url = flashconfiguration.find('file').text
+ file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx')
+ # Replace some of the parameters in the query to get the best quality
+ # and http links (no m3u8 manifests)
+ file_url = re.sub(r'(?<=\?)(.+)$',
+ lambda m: self._clean_query(m.group()),
+ file_url)
+ info_xml = self._download_webpage(file_url, video_id,
+ u'Downloading video info')
+ info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
+ item = info.find('channel/item')
+
+ def _bp(p):
+ return xpath_with_ns(p,
+ {'media': 'http://search.yahoo.com/mrss/',
+ 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'})
+ formats = []
+ for content in item.findall(_bp('media:group/media:content')):
+ attr = content.attrib
+ f_url = attr['url']
+ formats.append({
+ 'url': f_url,
+ 'ext': determine_ext(f_url),
+ 'width': int(attr['width']),
+ 'bitrate': int(attr['bitrate']),
+ })
+ formats = sorted(formats, key=lambda f: f['bitrate'])
+
+ info = {
+ 'id': video_id,
+ 'title': item.find('title').text,
+ 'formats': formats,
+ 'thumbnail': item.find(_bp('media:thumbnail')).attrib['url'],
+ 'description': item.find('description').text,
+ 'duration': int(attr['duration']),
+ }
+ # TODO: Remove when #980 has been merged
+ info.update(formats[-1])
+ return info
diff --git a/youtube_dl/extractor/nowvideo.py b/youtube_dl/extractor/nowvideo.py
new file mode 100644
index 000000000..ab52ad401
--- /dev/null
+++ b/youtube_dl/extractor/nowvideo.py
@@ -0,0 +1,43 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import compat_urlparse
+
+
+class NowVideoIE(InfoExtractor):
+ _VALID_URL = r'(?:https?://)?(?:www\.)?nowvideo\.ch/video/(?P<id>\w+)'
+ _TEST = {
+ u'url': u'http://www.nowvideo.ch/video/0mw0yow7b6dxa',
+ u'file': u'0mw0yow7b6dxa.flv',
+ u'md5': u'f8fbbc8add72bd95b7850c6a02fc8817',
+ u'info_dict': {
+ u"title": u"youtubedl test video _BaW_jenozKc.mp4"
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+
+ video_id = mobj.group('id')
+ webpage_url = 'http://www.nowvideo.ch/video/' + video_id
+ webpage = self._download_webpage(webpage_url, video_id)
+
+ self.report_extraction(video_id)
+
+ video_title = self._html_search_regex(r'<h4>(.*)</h4>',
+ webpage, u'video title')
+
+ video_key = self._search_regex(r'var fkzd="(.*)";',
+ webpage, u'video key')
+
+ api_call = "http://www.nowvideo.ch/api/player.api.php?file={0}&numOfErrors=0&cid=1&key={1}".format(video_id, video_key)
+ api_response = self._download_webpage(api_call, video_id,
+ u'Downloading API page')
+ video_url = compat_urlparse.parse_qs(api_response)[u'url'][0]
+
+ return [{
+ 'id': video_id,
+ 'url': video_url,
+ 'ext': 'flv',
+ 'title': video_title,
+ }]
diff --git a/youtube_dl/extractor/rottentomatoes.py b/youtube_dl/extractor/rottentomatoes.py
new file mode 100644
index 000000000..c79c39413
--- /dev/null
+++ b/youtube_dl/extractor/rottentomatoes.py
@@ -0,0 +1,16 @@
+from .videodetective import VideoDetectiveIE
+
+
+# It just uses the same method as videodetective.com,
+# the internetvideoarchive.com is extracted from the og:video property
+class RottenTomatoesIE(VideoDetectiveIE):
+ _VALID_URL = r'https?://www\.rottentomatoes\.com/m/[^/]+/trailers/(?P<id>\d+)'
+
+ _TEST = {
+ u'url': u'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/',
+ u'file': '613340.mp4',
+ u'info_dict': {
+ u'title': u'TOY STORY 3',
+ u'description': u'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.',
+ },
+ }
diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py
new file mode 100644
index 000000000..a18034fe2
--- /dev/null
+++ b/youtube_dl/extractor/rutube.py
@@ -0,0 +1,58 @@
+# encoding: utf-8
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urlparse,
+ compat_str,
+ ExtractorError,
+)
+
+
+class RutubeIE(InfoExtractor):
+ _VALID_URL = r'https?://rutube.ru/video/(?P<long_id>\w+)'
+
+ _TEST = {
+ u'url': u'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',
+ u'file': u'3eac3b4561676c17df9132a9a1e62e3e.mp4',
+ u'info_dict': {
+ u'title': u'Раненный кенгуру забежал в аптеку',
+ u'uploader': u'NTDRussian',
+ u'uploader_id': u'29790',
+ },
+ u'params': {
+ # It requires ffmpeg (m3u8 download)
+ u'skip_download': True,
+ },
+ }
+
+ def _get_api_response(self, short_id, subpath):
+ api_url = 'http://rutube.ru/api/play/%s/%s/?format=json' % (subpath, short_id)
+ response_json = self._download_webpage(api_url, short_id,
+ u'Downloading %s json' % subpath)
+ return json.loads(response_json)
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ long_id = mobj.group('long_id')
+ webpage = self._download_webpage(url, long_id)
+ og_video = self._og_search_video_url(webpage)
+ short_id = compat_urlparse.urlparse(og_video).path[1:]
+ options = self._get_api_response(short_id, 'options')
+ trackinfo = self._get_api_response(short_id, 'trackinfo')
+ # Some videos don't have the author field
+ author = trackinfo.get('author') or {}
+ m3u8_url = trackinfo['video_balancer'].get('m3u8')
+ if m3u8_url is None:
+ raise ExtractorError(u'Couldn\'t find m3u8 manifest url')
+
+ return {
+ 'id': trackinfo['id'],
+ 'title': trackinfo['title'],
+ 'url': m3u8_url,
+ 'ext': 'mp4',
+ 'thumbnail': options['thumbnail_url'],
+ 'uploader': author.get('name'),
+ 'uploader_id': compat_str(author['id']) if author else None,
+ }
diff --git a/youtube_dl/extractor/sztvhu.py b/youtube_dl/extractor/sztvhu.py
new file mode 100644
index 000000000..81fa35c4b
--- /dev/null
+++ b/youtube_dl/extractor/sztvhu.py
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+
+import re
+
+from .common import InfoExtractor
+from ..utils import determine_ext
+
+
+class SztvHuIE(InfoExtractor):
+ _VALID_URL = r'(?:http://)?(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)'
+ _TEST = {
+ u'url': u'http://sztv.hu/hirek/cserkeszek-nepszerusitettek-a-kornyezettudatos-eletmodot-a-savaria-teren-20130909',
+ u'file': u'20130909.mp4',
+ u'md5': u'a6df607b11fb07d0e9f2ad94613375cb',
+ u'info_dict': {
+ u"title": u"Cserkészek népszerűsítették a környezettudatos életmódot a Savaria téren",
+ u"description": u'A zöld nap játékos ismeretterjesztő programjait a Magyar Cserkész Szövetség szervezte, akik az ország nyolc városában adják át tudásukat az érdeklődőknek. A PET...',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ webpage = self._download_webpage(url, video_id)
+ video_file = self._search_regex(
+ r'file: "...:(.*?)",', webpage, 'video file')
+ title = self._html_search_regex(
+ r'<meta name="title" content="([^"]*?) - [^-]*? - [^-]*?"',
+ webpage, 'video title')
+ description = self._html_search_regex(
+ r'<meta name="description" content="([^"]*)"/>',
+ webpage, 'video description', fatal=False)
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ video_url = 'http://media.sztv.hu/vod/' + video_file
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'ext': determine_ext(video_url),
+ 'description': description,
+ 'thumbnail': thumbnail,
+ }
diff --git a/youtube_dl/extractor/techtalks.py b/youtube_dl/extractor/techtalks.py
new file mode 100644
index 000000000..a55f236cb
--- /dev/null
+++ b/youtube_dl/extractor/techtalks.py
@@ -0,0 +1,65 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ get_element_by_attribute,
+ clean_html,
+)
+
+
+class TechTalksIE(InfoExtractor):
+ _VALID_URL = r'https?://techtalks\.tv/talks/[^/]*/(?P<id>\d+)/'
+
+ _TEST = {
+ u'url': u'http://techtalks.tv/talks/learning-topic-models-going-beyond-svd/57758/',
+ u'playlist': [
+ {
+ u'file': u'57758.flv',
+ u'info_dict': {
+ u'title': u'Learning Topic Models --- Going beyond SVD',
+ },
+ },
+ {
+ u'file': u'57758-slides.flv',
+ u'info_dict': {
+ u'title': u'Learning Topic Models --- Going beyond SVD',
+ },
+ },
+ ],
+ u'params': {
+ # rtmp download
+ u'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ talk_id = mobj.group('id')
+ webpage = self._download_webpage(url, talk_id)
+ rtmp_url = self._search_regex(r'netConnectionUrl: \'(.*?)\'', webpage,
+ u'rtmp url')
+ play_path = self._search_regex(r'href=\'(.*?)\' [^>]*id="flowplayer_presenter"',
+ webpage, u'presenter play path')
+ title = clean_html(get_element_by_attribute('class', 'title', webpage))
+ video_info = {
+ 'id': talk_id,
+ 'title': title,
+ 'url': rtmp_url,
+ 'play_path': play_path,
+ 'ext': 'flv',
+ }
+ m_slides = re.search(r'<a class="slides" href=\'(.*?)\'', webpage)
+ if m_slides is None:
+ return video_info
+ else:
+ return [
+ video_info,
+ # The slides video
+ {
+ 'id': talk_id + '-slides',
+ 'title': title,
+ 'url': rtmp_url,
+ 'play_path': m_slides.group(1),
+ 'ext': 'flv',
+ },
+ ]
diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py
index 1405b73f7..79679a14a 100644
--- a/youtube_dl/extractor/tudou.py
+++ b/youtube_dl/extractor/tudou.py
@@ -7,15 +7,25 @@ from .common import InfoExtractor
class TudouIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs)/(?:view|(.+?))/(?:([^/]+)|([^/]+))(?:\.html)?'
- _TEST = {
+ _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs|albumplay)/(?:view|(.+?))/(?:([^/]+)|([^/]+))(?:\.html)?'
+ _TESTS = [{
u'url': u'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
u'file': u'159448201.f4v',
u'md5': u'140a49ed444bd22f93330985d8475fcb',
u'info_dict': {
u"title": u"卡马乔国足开大脚长传冲吊集锦"
}
- }
+ },
+ {
+ u'url': u'http://www.tudou.com/albumplay/TenTw_JgiPM/PzsAs5usU9A.html',
+ u'file': u'todo.mp4',
+ u'md5': u'todo.mp4',
+ u'info_dict': {
+ u'title': u'todo.mp4',
+ },
+ u'add_ie': [u'Youku'],
+ u'skip': u'Only works from China'
+ }]
def _url_for_id(self, id, quality = None):
info_url = "http://v2.tudou.com/f?id="+str(id)
@@ -29,14 +39,18 @@ class TudouIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(2)
webpage = self._download_webpage(url, video_id)
- title = re.search(",kw:\"(.+)\"",webpage)
- if title is None:
- title = re.search(",kw: \'(.+)\'",webpage)
- title = title.group(1)
- thumbnail_url = re.search(",pic: \'(.+?)\'",webpage)
- if thumbnail_url is None:
- thumbnail_url = re.search(",pic:\"(.+?)\"",webpage)
- thumbnail_url = thumbnail_url.group(1)
+
+ m = re.search(r'vcode:\s*[\'"](.+?)[\'"]', webpage)
+ if m and m.group(1):
+ return {
+ '_type': 'url',
+ 'url': u'youku:' + m.group(1),
+ 'ie_key': 'Youku'
+ }
+
+ title = self._search_regex(r",kw:['\"](.+?)[\"']", webpage, u'title')
+ thumbnail_url = self._search_regex(
+ r",pic:\s*[\"'](.+?)[\"']", webpage, u'thumbnail URL', fatal=False)
segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments')
segments = json.loads(segs_json)
diff --git a/youtube_dl/extractor/videodetective.py b/youtube_dl/extractor/videodetective.py
new file mode 100644
index 000000000..d89f84094
--- /dev/null
+++ b/youtube_dl/extractor/videodetective.py
@@ -0,0 +1,30 @@
+import re
+
+from .common import InfoExtractor
+from .internetvideoarchive import InternetVideoArchiveIE
+from ..utils import (
+ compat_urlparse,
+)
+
+
+class VideoDetectiveIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.videodetective\.com/[^/]+/[^/]+/(?P<id>\d+)'
+
+ _TEST = {
+ u'url': u'http://www.videodetective.com/movies/kick-ass-2/194487',
+ u'file': u'194487.mp4',
+ u'info_dict': {
+ u'title': u'KICK-ASS 2',
+ u'description': u'md5:65ba37ad619165afac7d432eaded6013',
+ u'duration': 138,
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ webpage = self._download_webpage(url, video_id)
+ og_video = self._og_search_video_url(webpage)
+ query = compat_urlparse.urlparse(og_video).query
+ return self.url_result(InternetVideoArchiveIE._build_url(query),
+ ie=InternetVideoArchiveIE.ie_key())
diff --git a/youtube_dl/extractor/videopremium.py b/youtube_dl/extractor/videopremium.py
new file mode 100644
index 000000000..65f39b982
--- /dev/null
+++ b/youtube_dl/extractor/videopremium.py
@@ -0,0 +1,40 @@
+import re
+import random
+
+from .common import InfoExtractor
+
+
+class VideoPremiumIE(InfoExtractor):
+ _VALID_URL = r'(?:https?://)?(?:www\.)?videopremium\.tv/(?P<id>\w+)(?:/.*)?'
+ _TEST = {
+ u'url': u'http://videopremium.tv/4w7oadjsf156',
+ u'file': u'4w7oadjsf156.f4v',
+ u'info_dict': {
+ u"title": u"youtube-dl_test_video____a_________-BaW_jenozKc.mp4.mp4"
+ },
+ u'params': {
+ u'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+
+ video_id = mobj.group('id')
+ webpage_url = 'http://videopremium.tv/' + video_id
+ webpage = self._download_webpage(webpage_url, video_id)
+
+ self.report_extraction(video_id)
+
+ video_title = self._html_search_regex(r'<h2(?:.*?)>\s*(.+?)\s*<',
+ webpage, u'video title')
+
+ return [{
+ 'id': video_id,
+ 'url': "rtmp://e%d.md.iplay.md/play" % random.randint(1, 16),
+ 'play_path': "mp4:%s.f4v" % video_id,
+ 'page_url': "http://videopremium.tv/" + video_id,
+ 'player_url': "http://videopremium.tv/uplayer/uppod.swf",
+ 'ext': 'f4v',
+ 'title': video_title,
+ }]
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index cea29f035..2de56ac81 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -11,6 +11,7 @@ from ..utils import (
get_element_by_attribute,
ExtractorError,
std_headers,
+ unsmuggle_url,
)
class VimeoIE(InfoExtractor):
@@ -53,7 +54,7 @@ class VimeoIE(InfoExtractor):
u'title': u'Kathy Sierra: Building the minimum Badass User, Business of Software',
u'uploader': u'The BLN & Business of Software',
},
- },
+ }
]
def _login(self):
@@ -98,6 +99,12 @@ class VimeoIE(InfoExtractor):
self._login()
def _real_extract(self, url, new_video=True):
+ url, data = unsmuggle_url(url)
+ headers = std_headers
+ if data is not None:
+ headers = headers.copy()
+ headers.update(data)
+
# Extract ID from URL
mobj = re.match(self._VALID_URL, url)
if mobj is None:
@@ -112,7 +119,7 @@ class VimeoIE(InfoExtractor):
url = 'https://vimeo.com/' + video_id
# Retrieve video webpage to extract further information
- request = compat_urllib_request.Request(url, None, std_headers)
+ request = compat_urllib_request.Request(url, None, headers)
webpage = self._download_webpage(request, video_id)
# Now we begin extracting as much information as we can from what we
diff --git a/youtube_dl/extractor/websurg.py b/youtube_dl/extractor/websurg.py
new file mode 100644
index 000000000..43953bfdd
--- /dev/null
+++ b/youtube_dl/extractor/websurg.py
@@ -0,0 +1,59 @@
+# coding: utf-8
+
+import re
+
+from ..utils import (
+ compat_urllib_request,
+ compat_urllib_parse
+)
+
+from .common import InfoExtractor
+
+class WeBSurgIE(InfoExtractor):
+ IE_NAME = u'websurg.com'
+ _VALID_URL = r'http://.*?\.websurg\.com/MEDIA/\?noheader=1&doi=(.*)'
+
+ _TEST = {
+ u'url': u'http://www.websurg.com/MEDIA/?noheader=1&doi=vd01en4012',
+ u'file': u'vd01en4012.mp4',
+ u'params': {
+ u'skip_download': True,
+ },
+ u'skip': u'Requires login information',
+ }
+
+ _LOGIN_URL = 'http://www.websurg.com/inc/login/login_div.ajax.php?login=1'
+
+ def _real_initialize(self):
+
+ login_form = {
+ 'username': self._downloader.params['username'],
+ 'password': self._downloader.params['password'],
+ 'Submit': 1
+ }
+
+ request = compat_urllib_request.Request(
+ self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
+ request.add_header(
+ 'Content-Type', 'application/x-www-form-urlencoded;charset=utf-8')
+ compat_urllib_request.urlopen(request).info()
+ webpage = self._download_webpage(self._LOGIN_URL, '', 'Logging in')
+
+ if webpage != 'OK':
+ self._downloader.report_error(
+ u'Unable to log in: bad username/password')
+
+ def _real_extract(self, url):
+ video_id = re.match(self._VALID_URL, url).group(1)
+
+ webpage = self._download_webpage(url, video_id)
+
+ url_info = re.search(r'streamer="(.*?)" src="(.*?)"', webpage)
+
+ return {'id': video_id,
+ 'title': self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage),
+ 'ext' : 'mp4',
+ 'url' : url_info.group(1) + '/' + url_info.group(2),
+ 'thumbnail': self._og_search_thumbnail(webpage)
+ }
diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py
index 00fa2ccb5..9d88c17f5 100644
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@@ -13,7 +13,7 @@ from ..utils import (
class YoukuIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(v|player)\.youku\.com/(v_show/id_|player\.php/sid/)(?P<ID>[A-Za-z0-9]+)(\.html|/v.swf)'
+ _VALID_URL = r'(?:(?:http://)?(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|youku:)(?P<ID>[A-Za-z0-9]+)(?:\.html|/v\.swf|)'
_TEST = {
u"url": u"http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html",
u"file": u"XNDgyMDQ2NTQw_part00.flv",
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 8222a880f..fb7c42830 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -1150,7 +1150,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
list_page = self._download_webpage(list_url, video_id)
caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
original_lang_node = caption_list.find('track')
- if original_lang_node.attrib.get('kind') != 'asr' :
+ if not original_lang_node or original_lang_node.attrib.get('kind') != 'asr' :
self._downloader.report_warning(u'Video doesn\'t have automatic captions')
return {}
original_lang = original_lang_node.attrib['lang_code']
@@ -1250,6 +1250,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
url_map[itag] = format_url
return url_map
+ def _extract_annotations(self, video_id):
+ url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
+ return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
+
def _real_extract(self, url):
# Extract original video URL from URL with redirection, like age verification, using next_url parameter
mobj = re.search(self._NEXT_URL_RE, url)
@@ -1382,6 +1386,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
else:
video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
+ # annotations
+ video_annotations = None
+ if self._downloader.params.get('writeannotations', False):
+ video_annotations = self._extract_annotations(video_id)
+
# Decide which formats to download
try:
@@ -1495,6 +1504,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'subtitles': video_subtitles,
'duration': video_duration,
'age_limit': 18 if age_gate else 0,
+ 'annotations': video_annotations
})
return results
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 82a1daeb9..833f981f2 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -9,6 +9,7 @@ import io
import json
import locale
import os
+import pipes
import platform
import re
import socket
@@ -229,6 +230,19 @@ else:
return f
return None
+# On python2.6 the xml.etree.ElementTree.Element methods don't support
+# the namespace parameter
+def xpath_with_ns(path, ns_map):
+ components = [c.split(':') for c in path.split('/')]
+ replaced = []
+ for c in components:
+ if len(c) == 1:
+ replaced.append(c[0])
+ else:
+ ns, tag = c
+ replaced.append('{%s}%s' % (ns_map[ns], tag))
+ return '/'.join(replaced)
+
def htmlentity_transform(matchobj):
"""Transforms an HTML entity to a character.
@@ -927,3 +941,24 @@ class locked_file(object):
def read(self, *args):
return self.f.read(*args)
+
+
+def shell_quote(args):
+ return ' '.join(map(pipes.quote, args))
+
+
+def smuggle_url(url, data):
+ """ Pass additional data in a URL for internal use. """
+
+ sdata = compat_urllib_parse.urlencode(
+ {u'__youtubedl_smuggle': json.dumps(data)})
+ return url + u'#' + sdata
+
+
+def unsmuggle_url(smug_url):
+ if not '#__youtubedl_smuggle' in smug_url:
+ return smug_url, None
+ url, _, sdata = smug_url.rpartition(u'#')
+ jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
+ data = json.loads(jsond)
+ return url, data
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 1004af116..22a51ffe6 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@
-__version__ = '2013.10.09'
+__version__ = '2013.10.17'