diff options
Diffstat (limited to 'youtube_dl')
24 files changed, 878 insertions, 131 deletions
diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py index fbf8a7f98..13b56ede5 100644 --- a/youtube_dl/PostProcessor.py +++ b/youtube_dl/PostProcessor.py @@ -2,9 +2,15 @@ import os import subprocess import sys import time -import datetime -from .utils import * + +from .utils import ( + compat_subprocess_get_DEVNULL, + encodeFilename, + PostProcessingError, + shell_quote, + subtitles_filename, +) class PostProcessor(object): @@ -83,6 +89,8 @@ class FFmpegPostProcessor(PostProcessor): + opts + [encodeFilename(self._ffmpeg_filename_argument(out_path))]) + if self._downloader.params.get('verbose', False): + self._downloader.to_screen(u'[debug] ffmpeg command line: %s' % shell_quote(cmd)) p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout,stderr = p.communicate() if p.returncode != 0: @@ -178,7 +186,8 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): extension = self._preferredcodec more_opts = [] if self._preferredquality is not None: - if int(self._preferredquality) < 10: + # The opus codec doesn't support the -aq option + if int(self._preferredquality) < 10 and extension != 'opus': more_opts += [self._exes['avconv'] and '-q:a' or '-aq', self._preferredquality] else: more_opts += [self._exes['avconv'] and '-b:a' or '-ab', self._preferredquality + 'k'] diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a32e50772..f22a8bd0e 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -71,6 +71,7 @@ class YoutubeDL(object): logtostderr: Log messages to stderr instead of stdout. writedescription: Write the video description to a .description file writeinfojson: Write the video description to a .info.json file + writeannotations: Write the video annotations to a .annotations.xml file writethumbnail: Write the thumbnail image to a file writesubtitles: Write the video subtitles to a file writeautomaticsub: Write the automatic subtitles to a file @@ -258,6 +259,10 @@ class YoutubeDL(object): """ Report that the metadata file has been written """ self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn) + def report_writeannotations(self, annofn): + """ Report that the annotations file has been written. """ + self.to_screen(u'[info] Writing video annotations to: ' + annofn) + def report_file_already_downloaded(self, file_name): """Report file has already been fully downloaded.""" try: @@ -599,6 +604,18 @@ class YoutubeDL(object): self.report_error(u'Cannot write description file ' + descfn) return + if self.params.get('writeannotations', False): + try: + annofn = filename + u'.annotations.xml' + self.report_writeannotations(annofn) + with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile: + annofile.write(info_dict['annotations']) + except (KeyError, TypeError): + self.report_warning(u'There are no annotations to write.') + except (OSError, IOError): + self.report_error(u'Cannot write annotations file: ' + annofn) + return + subtitles_are_requested = any([self.params.get('writesubtitles', False), self.params.get('writeautomaticsub')]) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index bc8e97250..cd642ce3b 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -31,6 +31,7 @@ __authors__ = ( 'Huarong Huo', 'Ismael Mejía', 'Steffan \'Ruirize\' James', + 'Andras Elso', ) __license__ = 'Public Domain' @@ -46,17 +47,43 @@ import shlex import socket import subprocess import sys -import warnings +import traceback import platform -from .utils import * +from .utils import ( + compat_cookiejar, + compat_print, + compat_str, + compat_urllib_request, + DateRange, + decodeOption, + determine_ext, + DownloadError, + get_cachedir, + make_HTTPS_handler, + MaxDownloadsReached, + platform_name, + preferredencoding, + SameFileError, + std_headers, + write_string, + YoutubeDLHandler, +) from .update import update_self from .version import __version__ -from .FileDownloader import * +from .FileDownloader import ( + FileDownloader, +) from .extractor import gen_extractors from .YoutubeDL import YoutubeDL -from .PostProcessor import * +from .PostProcessor import ( + FFmpegMetadataPP, + FFmpegVideoConvertor, + FFmpegExtractAudioPP, + FFmpegEmbedSubtitlePP, +) + def parseOpts(overrideArguments=None): def _readOptions(filename_bytes): @@ -240,11 +267,11 @@ def parseOpts(overrideArguments=None): help='languages of the subtitles to download (optional) separated by commas, use IETF language tags like \'en,pt\'') downloader.add_option('-r', '--rate-limit', - dest='ratelimit', metavar='LIMIT', help='maximum download rate (e.g. 50k or 44.6m)') + dest='ratelimit', metavar='LIMIT', help='maximum download rate in bytes per second (e.g. 50K or 4.2M)') downloader.add_option('-R', '--retries', dest='retries', metavar='RETRIES', help='number of retries (default is %default)', default=10) downloader.add_option('--buffer-size', - dest='buffersize', metavar='SIZE', help='size of download buffer (e.g. 1024 or 16k) (default is %default)', default="1024") + dest='buffersize', metavar='SIZE', help='size of download buffer (e.g. 1024 or 16K) (default is %default)', default="1024") downloader.add_option('--no-resize-buffer', action='store_true', dest='noresizebuffer', help='do not automatically adjust the buffer size. By default, the buffer size is automatically resized from an initial value of SIZE.', default=False) @@ -339,6 +366,9 @@ def parseOpts(overrideArguments=None): filesystem.add_option('--write-info-json', action='store_true', dest='writeinfojson', help='write video metadata to a .info.json file', default=False) + filesystem.add_option('--write-annotations', + action='store_true', dest='writeannotations', + help='write video annotations to a .annotation file', default=False) filesystem.add_option('--write-thumbnail', action='store_true', dest='writethumbnail', help='write thumbnail image to disk', default=False) @@ -601,6 +631,7 @@ def _real_main(argv=None): 'nopart': opts.nopart, 'updatetime': opts.updatetime, 'writedescription': opts.writedescription, + 'writeannotations': opts.writeannotations, 'writeinfojson': opts.writeinfojson, 'writethumbnail': opts.writethumbnail, 'writesubtitles': opts.writesubtitles, @@ -684,7 +715,7 @@ def _real_main(argv=None): if opts.cookiefile is not None: try: jar.save() - except (IOError, OSError) as err: + except (IOError, OSError): sys.exit(u'ERROR: unable to save cookie jar') sys.exit(retcode) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 688196869..db69af361 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -2,7 +2,12 @@ from .appletrailers import AppleTrailersIE from .addanime import AddAnimeIE from .archiveorg import ArchiveOrgIE from .ard import ARDIE -from .arte import ArteTvIE +from .arte import ( + ArteTvIE, + ArteTVPlus7IE, + ArteTVCreativeIE, + ArteTVFutureIE, +) from .auengine import AUEngineIE from .bandcamp import BandcampIE from .bliptv import BlipTVIE, BlipTVUserIE @@ -12,6 +17,7 @@ from .brightcove import BrightcoveIE from .c56 import C56IE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE +from .cinemassacre import CinemassacreIE from .cnn import CNNIE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE @@ -61,6 +67,7 @@ from .ign import IGNIE, OneUPIE from .ina import InaIE from .infoq import InfoQIE from .instagram import InstagramIE +from .internetvideoarchive import InternetVideoArchiveIE from .jeuxvideo import JeuxVideoIE from .jukebox import JukeboxIE from .justintv import JustinTVIE @@ -82,6 +89,7 @@ from .nba import NBAIE from .nbc import NBCNewsIE from .newgrounds import NewgroundsIE from .nhl import NHLIE, NHLVideocenterIE +from .nowvideo import NowVideoIE from .ooyala import OoyalaIE from .orf import ORFIE from .pbs import PBSIE @@ -91,8 +99,10 @@ from .rbmaradio import RBMARadioIE from .redtube import RedTubeIE from .ringtv import RingTVIE from .ro220 import Ro220IE +from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE from .rtlnow import RTLnowIE +from .rutube import RutubeIE from .sina import SinaIE from .slashdot import SlashdotIE from .slideshare import SlideshareIE @@ -103,7 +113,9 @@ from .spiegel import SpiegelIE from .stanfordoc import StanfordOpenClassroomIE from .statigram import StatigramIE from .steam import SteamIE +from .sztvhu import SztvHuIE from .teamcoco import TeamcocoIE +from .techtalks import TechTalksIE from .ted import TEDIE from .tf1 import TF1IE from .thisav import ThisAVIE @@ -120,10 +132,13 @@ from .veoh import VeohIE from .vevo import VevoIE from .vice import ViceIE from .viddler import ViddlerIE +from .videodetective import VideoDetectiveIE from .videofyme import VideofyMeIE +from .videopremium import VideoPremiumIE from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE from .wat import WatIE +from .websurg import WeBSurgIE from .weibo import WeiboIE from .wimp import WimpIE from .worldstarhiphop import WorldStarHipHopIE diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 4707d7cca..5ee8a67b1 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -1,3 +1,4 @@ +# encoding: utf-8 import re import json import xml.etree.ElementTree @@ -7,15 +8,15 @@ from ..utils import ( ExtractorError, find_xpath_attr, unified_strdate, + determine_ext, + get_element_by_id, ) +# There are different sources of video in arte.tv, the extraction process +# is different for each one. The videos usually expire in 7 days, so we can't +# add tests. + class ArteTvIE(InfoExtractor): - """ - There are two sources of video in arte.tv: videos.arte.tv and - www.arte.tv/guide, the extraction process is different for each one. - The videos expire in 7 days, so we can't add tests. - """ - _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?' _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?P<lang>fr|de)/.*-(?P<id>.*?).html' _LIVEWEB_URL = r'(?:http://)?liveweb.arte.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)' _LIVE_URL = r'index-[0-9]+\.html$' @@ -24,7 +25,7 @@ class ArteTvIE(InfoExtractor): @classmethod def suitable(cls, url): - return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL, cls._LIVEWEB_URL)) + return any(re.match(regex, url) for regex in (cls._VIDEOS_URL, cls._LIVEWEB_URL)) # TODO implement Live Stream # from ..utils import compat_urllib_parse @@ -55,14 +56,6 @@ class ArteTvIE(InfoExtractor): # video_url = u'%s/%s' % (info.get('url'), info.get('path')) def _real_extract(self, url): - mobj = re.match(self._EMISSION_URL, url) - if mobj is not None: - lang = mobj.group('lang') - # This is not a real id, it can be for example AJT for the news - # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal - video_id = mobj.group('id') - return self._extract_emission(url, video_id, lang) - mobj = re.match(self._VIDEOS_URL, url) if mobj is not None: id = mobj.group('id') @@ -80,59 +73,6 @@ class ArteTvIE(InfoExtractor): # self.extractLiveStream(url) # return - def _extract_emission(self, url, video_id, lang): - """Extract from www.arte.tv/guide""" - webpage = self._download_webpage(url, video_id) - json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url') - - json_info = self._download_webpage(json_url, video_id, 'Downloading info json') - self.report_extraction(video_id) - info = json.loads(json_info) - player_info = info['videoJsonPlayer'] - - info_dict = {'id': player_info['VID'], - 'title': player_info['VTI'], - 'description': player_info.get('VDE'), - 'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]), - 'thumbnail': player_info['programImage'], - 'ext': 'flv', - } - - formats = player_info['VSR'].values() - def _match_lang(f): - # Return true if that format is in the language of the url - if lang == 'fr': - l = 'F' - elif lang == 'de': - l = 'A' - regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l] - return any(re.match(r, f['versionCode']) for r in regexes) - # Some formats may not be in the same language as the url - formats = filter(_match_lang, formats) - # Some formats use the m3u8 protocol - formats = filter(lambda f: f['videoFormat'] != 'M3U8', formats) - # We order the formats by quality - formats = sorted(formats, key=lambda f: int(f['height'])) - # Prefer videos without subtitles in the same language - formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f['versionCode']) is None) - # Pick the best quality - def _format(format_info): - info = {'ext': 'flv', - 'width': format_info.get('width'), - 'height': format_info.get('height'), - } - if format_info['mediaType'] == u'rtmp': - info['url'] = format_info['streamer'] - info['play_path'] = 'mp4:' + format_info['url'] - else: - info_dict['url'] = format_info['url'] - return info - info_dict['formats'] = [_format(f) for f in formats] - # TODO: Remove when #980 has been merged - info_dict.update(info_dict['formats'][-1]) - - return info_dict - def _extract_video(self, url, video_id, lang): """Extract from videos.arte.tv""" ref_xml_url = url.replace('/videos/', '/do_delegate/videos/') @@ -182,3 +122,110 @@ class ArteTvIE(InfoExtractor): 'ext': 'flv', 'thumbnail': self._og_search_thumbnail(webpage), } + + +class ArteTVPlus7IE(InfoExtractor): + IE_NAME = u'arte.tv:+7' + _VALID_URL = r'https?://www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?' + + @classmethod + def _extract_url_info(cls, url): + mobj = re.match(cls._VALID_URL, url) + lang = mobj.group('lang') + # This is not a real id, it can be for example AJT for the news + # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal + video_id = mobj.group('id') + return video_id, lang + + def _real_extract(self, url): + video_id, lang = self._extract_url_info(url) + webpage = self._download_webpage(url, video_id) + return self._extract_from_webpage(webpage, video_id, lang) + + def _extract_from_webpage(self, webpage, video_id, lang): + json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url') + + json_info = self._download_webpage(json_url, video_id, 'Downloading info json') + self.report_extraction(video_id) + info = json.loads(json_info) + player_info = info['videoJsonPlayer'] + + info_dict = { + 'id': player_info['VID'], + 'title': player_info['VTI'], + 'description': player_info.get('VDE'), + 'upload_date': unified_strdate(player_info.get('VDA', '').split(' ')[0]), + 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), + } + + formats = player_info['VSR'].values() + def _match_lang(f): + if f.get('versionCode') is None: + return True + # Return true if that format is in the language of the url + if lang == 'fr': + l = 'F' + elif lang == 'de': + l = 'A' + regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l] + return any(re.match(r, f['versionCode']) for r in regexes) + # Some formats may not be in the same language as the url + formats = filter(_match_lang, formats) + # Some formats use the m3u8 protocol + formats = filter(lambda f: f.get('videoFormat') != 'M3U8', formats) + # We order the formats by quality + formats = sorted(formats, key=lambda f: int(f.get('height',-1))) + # Prefer videos without subtitles in the same language + formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f.get('versionCode', '')) is None) + # Pick the best quality + def _format(format_info): + info = { + 'width': format_info.get('width'), + 'height': format_info.get('height'), + } + if format_info['mediaType'] == u'rtmp': + info['url'] = format_info['streamer'] + info['play_path'] = 'mp4:' + format_info['url'] + info['ext'] = 'flv' + else: + info['url'] = format_info['url'] + info['ext'] = determine_ext(info['url']) + return info + info_dict['formats'] = [_format(f) for f in formats] + # TODO: Remove when #980 has been merged + info_dict.update(info_dict['formats'][-1]) + + return info_dict + + +# It also uses the arte_vp_url url from the webpage to extract the information +class ArteTVCreativeIE(ArteTVPlus7IE): + IE_NAME = u'arte.tv:creative' + _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de)/magazine?/(?P<id>.+)' + + _TEST = { + u'url': u'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design', + u'file': u'050489-002.mp4', + u'info_dict': { + u'title': u'Agentur Amateur #2 - Corporate Design', + }, + } + + +class ArteTVFutureIE(ArteTVPlus7IE): + IE_NAME = u'arte.tv:future' + _VALID_URL = r'https?://future\.arte\.tv/(?P<lang>fr|de)/(thema|sujet)/.*?#article-anchor-(?P<id>\d+)' + + _TEST = { + u'url': u'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081', + u'file': u'050940-003.mp4', + u'info_dict': { + u'title': u'Les champignons au secours de la planète', + }, + } + + def _real_extract(self, url): + anchor_id, lang = self._extract_url_info(url) + webpage = self._download_webpage(url, anchor_id) + row = get_element_by_id(anchor_id, webpage) + return self._extract_from_webpage(row, anchor_id, lang) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 745212f2f..1392f382a 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -53,6 +53,8 @@ class BrightcoveIE(InfoExtractor): # Fix up some stupid HTML, see https://github.com/rg3/youtube-dl/issues/1553 object_str = re.sub(r'(<param name="[^"]+" value="[^"]+")>', lambda m: m.group(1) + '/>', object_str) + # Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608 + object_str = object_str.replace(u'<--', u'<!--') object_doc = xml.etree.ElementTree.fromstring(object_str) assert u'BrightcoveExperience' in object_doc.attrib['class'] @@ -96,7 +98,10 @@ class BrightcoveIE(InfoExtractor): playlist_info = self._download_webpage(self._PLAYLIST_URL_TEMPLATE % player_key, player_key, u'Downloading playlist information') - playlist_info = json.loads(playlist_info)['videoList'] + json_data = json.loads(playlist_info) + if 'videoList' not in json_data: + raise ExtractorError(u'Empty playlist') + playlist_info = json_data['videoList'] videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']] return self.playlist_result(videos, playlist_id=playlist_info['id'], diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py new file mode 100644 index 000000000..6925b96c2 --- /dev/null +++ b/youtube_dl/extractor/cinemassacre.py @@ -0,0 +1,91 @@ +# encoding: utf-8 +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, +) + + +class CinemassacreIE(InfoExtractor): + _VALID_URL = r'(?:http://)?(?:www\.)?(?P<url>cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/.+?)(?:[/?].*)?' + _TESTS = [{ + u'url': u'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', + u'file': u'19911.flv', + u'info_dict': { + u'upload_date': u'20121110', + u'title': u'“Angry Video Game Nerd: The Movie” – Trailer', + u'description': u'md5:fb87405fcb42a331742a0dce2708560b', + }, + u'params': { + # rtmp download + u'skip_download': True, + }, + }, + { + u'url': u'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', + u'file': u'521be8ef82b16.flv', + u'info_dict': { + u'upload_date': u'20131002', + u'title': u'The Mummy’s Hand (1940)', + }, + u'params': { + # rtmp download + u'skip_download': True, + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + webpage_url = u'http://' + mobj.group('url') + webpage = self._download_webpage(webpage_url, None) # Don't know video id yet + video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d') + mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/(?:embed|player)\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage) + if not mobj: + raise ExtractorError(u'Can\'t extract embed url and video id') + playerdata_url = mobj.group(u'embed_url') + video_id = mobj.group(u'video_id') + + video_title = self._html_search_regex(r'<title>(?P<title>.+?)\|', + webpage, u'title') + video_description = self._html_search_regex(r'<div class="entry-content">(?P<description>.+?)</div>', + webpage, u'description', flags=re.DOTALL, fatal=False) + if len(video_description) == 0: + video_description = None + + playerdata = self._download_webpage(playerdata_url, video_id) + base_url = self._html_search_regex(r'\'streamer\': \'(?P<base_url>rtmp://.*?)/(?:vod|Cinemassacre)\'', + playerdata, u'base_url') + base_url += '/Cinemassacre/' + # Important: The file names in playerdata are not used by the player and even wrong for some videos + sd_file = 'Cinemassacre-%s_high.mp4' % video_id + hd_file = 'Cinemassacre-%s.mp4' % video_id + video_thumbnail = 'http://image.screenwavemedia.com/Cinemassacre/Cinemassacre-%s_thumb_640x360.jpg' % video_id + + formats = [ + { + 'url': base_url + sd_file, + 'ext': 'flv', + 'format': 'sd', + 'format_id': 'sd', + }, + { + 'url': base_url + hd_file, + 'ext': 'flv', + 'format': 'hd', + 'format_id': 'hd', + }, + ] + + info = { + 'id': video_id, + 'title': video_title, + 'formats': formats, + 'description': video_description, + 'upload_date': video_date, + 'thumbnail': video_thumbnail, + } + # TODO: Remove when #980 has been merged + info.update(formats[-1]) + return info diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 5edbf678a..098768361 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -1,56 +1,59 @@ import re -import xml.etree.ElementTree +import json from .common import InfoExtractor from ..utils import ( - unified_strdate, compat_urllib_parse, + compat_urlparse, + unescapeHTML, + get_meta_content, ) + class GameSpotIE(InfoExtractor): - _WORKING = False _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<page_id>\d+)/?' _TEST = { u"url": u"http://www.gamespot.com/arma-iii/videos/arma-iii-community-guide-sitrep-i-6410818/", - u"file": u"6410818.mp4", + u"file": u"gs-2300-6410818.mp4", u"md5": u"b2a30deaa8654fcccd43713a6b6a4825", u"info_dict": { u"title": u"Arma 3 - Community Guide: SITREP I", - u"upload_date": u"20130627", + u'description': u'Check out this video where some of the basics of Arma 3 is explained.', } } - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - page_id = mobj.group('page_id') + page_id = video_id = mobj.group('page_id') webpage = self._download_webpage(url, page_id) - video_id = self._html_search_regex([r'"og:video" content=".*?\?id=(\d+)"', - r'http://www\.gamespot\.com/videoembed/(\d+)'], - webpage, 'video id') - data = compat_urllib_parse.urlencode({'id': video_id, 'newplayer': '1'}) - info_url = 'http://www.gamespot.com/pages/video_player/xml.php?' + data - info_xml = self._download_webpage(info_url, video_id) - doc = xml.etree.ElementTree.fromstring(info_xml) - clip_el = doc.find('./playList/clip') + data_video_json = self._search_regex(r'data-video=\'(.*?)\'', webpage, u'data video') + data_video = json.loads(unescapeHTML(data_video_json)) - http_urls = [{'url': node.find('filePath').text, - 'rate': int(node.find('rate').text)} - for node in clip_el.find('./httpURI')] - best_quality = sorted(http_urls, key=lambda f: f['rate'])[-1] - video_url = best_quality['url'] - title = clip_el.find('./title').text - ext = video_url.rpartition('.')[2] - thumbnail_url = clip_el.find('./screenGrabURI').text - view_count = int(clip_el.find('./views').text) - upload_date = unified_strdate(clip_el.find('./postDate').text) + # Transform the manifest url to a link to the mp4 files + # they are used in mobile devices. + f4m_url = data_video['videoStreams']['f4m_stream'] + f4m_path = compat_urlparse.urlparse(f4m_url).path + QUALITIES_RE = r'((,\d+)+,?)' + qualities = self._search_regex(QUALITIES_RE, f4m_path, u'qualities').strip(',').split(',') + http_path = f4m_path[1:].split('/', 1)[1] + http_template = re.sub(QUALITIES_RE, r'%s', http_path) + http_template = http_template.replace('.csmil/manifest.f4m', '') + http_template = compat_urlparse.urljoin('http://video.gamespotcdn.com/', http_template) + formats = [] + for q in qualities: + formats.append({ + 'url': http_template % q, + 'ext': 'mp4', + 'format_id': q, + }) - return [{ - 'id' : video_id, - 'url' : video_url, - 'ext' : ext, - 'title' : title, - 'thumbnail' : thumbnail_url, - 'upload_date' : upload_date, - 'view_count' : view_count, - }] + info = { + 'id': data_video['guid'], + 'title': compat_urllib_parse.unquote(data_video['title']), + 'formats': formats, + 'description': get_meta_content('description', webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + } + # TODO: Remove when #980 has been merged + info.update(formats[-1]) + return info diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 7060c6f92..89805250c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -11,6 +11,8 @@ from ..utils import ( compat_urlparse, ExtractorError, + smuggle_url, + unescapeHTML, ) from .brightcove import BrightcoveIE @@ -29,6 +31,17 @@ class GenericIE(InfoExtractor): u"title": u"R\u00e9gis plante sa Jeep" } }, + # embedded vimeo video + { + u'url': u'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references', + u'file': u'22444065.mp4', + u'md5': u'2903896e23df39722c33f015af0666e2', + u'info_dict': { + u'title': u'ACCU 2011: Move Semantics,Perfect Forwarding, and Rvalue references- Scott Meyers- 13/04/2011', + u"uploader_id": u"skillsmatter", + u"uploader": u"Skills Matter", + } + } ] def report_download_webpage(self, video_id): @@ -121,12 +134,20 @@ class GenericIE(InfoExtractor): self.report_extraction(video_id) # Look for BrightCove: - m_brightcove = re.search(r'<object.+?class=([\'"]).*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL) + m_brightcove = re.search(r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL) if m_brightcove is not None: self.to_screen(u'Brightcove video detected.') bc_url = BrightcoveIE._build_brighcove_url(m_brightcove.group()) return self.url_result(bc_url, 'Brightcove') + # Look for embedded Vimeo player + mobj = re.search( + r'<iframe\s+src="(https?://player.vimeo.com/video/.*?)"', webpage) + if mobj: + player_url = unescapeHTML(mobj.group(1)) + surl = smuggle_url(player_url, {'Referer': url}) + return self.url_result(surl, 'Vimeo') + # Start with something easy: JW Player in SWFObject mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) if mobj is None: diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py new file mode 100644 index 000000000..5986459d6 --- /dev/null +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -0,0 +1,87 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + compat_urlparse, + compat_urllib_parse, + xpath_with_ns, + determine_ext, +) + + +class InternetVideoArchiveIE(InfoExtractor): + _VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?' + + _TEST = { + u'url': u'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247', + u'file': u'452693.mp4', + u'info_dict': { + u'title': u'SKYFALL', + u'description': u'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.', + u'duration': 156, + }, + } + + @staticmethod + def _build_url(query): + return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query + + @staticmethod + def _clean_query(query): + NEEDED_ARGS = ['publishedid', 'customerid'] + query_dic = compat_urlparse.parse_qs(query) + cleaned_dic = dict((k,v[0]) for (k,v) in query_dic.items() if k in NEEDED_ARGS) + # Other player ids return m3u8 urls + cleaned_dic['playerid'] = '247' + cleaned_dic['videokbrate'] = '100000' + return compat_urllib_parse.urlencode(cleaned_dic) + + def _real_extract(self, url): + query = compat_urlparse.urlparse(url).query + query_dic = compat_urlparse.parse_qs(query) + video_id = query_dic['publishedid'][0] + url = self._build_url(query) + + flashconfiguration_xml = self._download_webpage(url, video_id, + u'Downloading flash configuration') + flashconfiguration = xml.etree.ElementTree.fromstring(flashconfiguration_xml.encode('utf-8')) + file_url = flashconfiguration.find('file').text + file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx') + # Replace some of the parameters in the query to get the best quality + # and http links (no m3u8 manifests) + file_url = re.sub(r'(?<=\?)(.+)$', + lambda m: self._clean_query(m.group()), + file_url) + info_xml = self._download_webpage(file_url, video_id, + u'Downloading video info') + info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) + item = info.find('channel/item') + + def _bp(p): + return xpath_with_ns(p, + {'media': 'http://search.yahoo.com/mrss/', + 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'}) + formats = [] + for content in item.findall(_bp('media:group/media:content')): + attr = content.attrib + f_url = attr['url'] + formats.append({ + 'url': f_url, + 'ext': determine_ext(f_url), + 'width': int(attr['width']), + 'bitrate': int(attr['bitrate']), + }) + formats = sorted(formats, key=lambda f: f['bitrate']) + + info = { + 'id': video_id, + 'title': item.find('title').text, + 'formats': formats, + 'thumbnail': item.find(_bp('media:thumbnail')).attrib['url'], + 'description': item.find('description').text, + 'duration': int(attr['duration']), + } + # TODO: Remove when #980 has been merged + info.update(formats[-1]) + return info diff --git a/youtube_dl/extractor/nowvideo.py b/youtube_dl/extractor/nowvideo.py new file mode 100644 index 000000000..ab52ad401 --- /dev/null +++ b/youtube_dl/extractor/nowvideo.py @@ -0,0 +1,43 @@ +import re + +from .common import InfoExtractor +from ..utils import compat_urlparse + + +class NowVideoIE(InfoExtractor): + _VALID_URL = r'(?:https?://)?(?:www\.)?nowvideo\.ch/video/(?P<id>\w+)' + _TEST = { + u'url': u'http://www.nowvideo.ch/video/0mw0yow7b6dxa', + u'file': u'0mw0yow7b6dxa.flv', + u'md5': u'f8fbbc8add72bd95b7850c6a02fc8817', + u'info_dict': { + u"title": u"youtubedl test video _BaW_jenozKc.mp4" + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + webpage_url = 'http://www.nowvideo.ch/video/' + video_id + webpage = self._download_webpage(webpage_url, video_id) + + self.report_extraction(video_id) + + video_title = self._html_search_regex(r'<h4>(.*)</h4>', + webpage, u'video title') + + video_key = self._search_regex(r'var fkzd="(.*)";', + webpage, u'video key') + + api_call = "http://www.nowvideo.ch/api/player.api.php?file={0}&numOfErrors=0&cid=1&key={1}".format(video_id, video_key) + api_response = self._download_webpage(api_call, video_id, + u'Downloading API page') + video_url = compat_urlparse.parse_qs(api_response)[u'url'][0] + + return [{ + 'id': video_id, + 'url': video_url, + 'ext': 'flv', + 'title': video_title, + }] diff --git a/youtube_dl/extractor/rottentomatoes.py b/youtube_dl/extractor/rottentomatoes.py new file mode 100644 index 000000000..c79c39413 --- /dev/null +++ b/youtube_dl/extractor/rottentomatoes.py @@ -0,0 +1,16 @@ +from .videodetective import VideoDetectiveIE + + +# It just uses the same method as videodetective.com, +# the internetvideoarchive.com is extracted from the og:video property +class RottenTomatoesIE(VideoDetectiveIE): + _VALID_URL = r'https?://www\.rottentomatoes\.com/m/[^/]+/trailers/(?P<id>\d+)' + + _TEST = { + u'url': u'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/', + u'file': '613340.mp4', + u'info_dict': { + u'title': u'TOY STORY 3', + u'description': u'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.', + }, + } diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py new file mode 100644 index 000000000..a18034fe2 --- /dev/null +++ b/youtube_dl/extractor/rutube.py @@ -0,0 +1,58 @@ +# encoding: utf-8 +import re +import json + +from .common import InfoExtractor +from ..utils import ( + compat_urlparse, + compat_str, + ExtractorError, +) + + +class RutubeIE(InfoExtractor): + _VALID_URL = r'https?://rutube.ru/video/(?P<long_id>\w+)' + + _TEST = { + u'url': u'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', + u'file': u'3eac3b4561676c17df9132a9a1e62e3e.mp4', + u'info_dict': { + u'title': u'Раненный кенгуру забежал в аптеку', + u'uploader': u'NTDRussian', + u'uploader_id': u'29790', + }, + u'params': { + # It requires ffmpeg (m3u8 download) + u'skip_download': True, + }, + } + + def _get_api_response(self, short_id, subpath): + api_url = 'http://rutube.ru/api/play/%s/%s/?format=json' % (subpath, short_id) + response_json = self._download_webpage(api_url, short_id, + u'Downloading %s json' % subpath) + return json.loads(response_json) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + long_id = mobj.group('long_id') + webpage = self._download_webpage(url, long_id) + og_video = self._og_search_video_url(webpage) + short_id = compat_urlparse.urlparse(og_video).path[1:] + options = self._get_api_response(short_id, 'options') + trackinfo = self._get_api_response(short_id, 'trackinfo') + # Some videos don't have the author field + author = trackinfo.get('author') or {} + m3u8_url = trackinfo['video_balancer'].get('m3u8') + if m3u8_url is None: + raise ExtractorError(u'Couldn\'t find m3u8 manifest url') + + return { + 'id': trackinfo['id'], + 'title': trackinfo['title'], + 'url': m3u8_url, + 'ext': 'mp4', + 'thumbnail': options['thumbnail_url'], + 'uploader': author.get('name'), + 'uploader_id': compat_str(author['id']) if author else None, + } diff --git a/youtube_dl/extractor/sztvhu.py b/youtube_dl/extractor/sztvhu.py new file mode 100644 index 000000000..81fa35c4b --- /dev/null +++ b/youtube_dl/extractor/sztvhu.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- + +import re + +from .common import InfoExtractor +from ..utils import determine_ext + + +class SztvHuIE(InfoExtractor): + _VALID_URL = r'(?:http://)?(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)' + _TEST = { + u'url': u'http://sztv.hu/hirek/cserkeszek-nepszerusitettek-a-kornyezettudatos-eletmodot-a-savaria-teren-20130909', + u'file': u'20130909.mp4', + u'md5': u'a6df607b11fb07d0e9f2ad94613375cb', + u'info_dict': { + u"title": u"Cserkészek népszerűsítették a környezettudatos életmódot a Savaria téren", + u"description": u'A zöld nap játékos ismeretterjesztő programjait a Magyar Cserkész Szövetség szervezte, akik az ország nyolc városában adják át tudásukat az érdeklődőknek. A PET...', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + video_file = self._search_regex( + r'file: "...:(.*?)",', webpage, 'video file') + title = self._html_search_regex( + r'<meta name="title" content="([^"]*?) - [^-]*? - [^-]*?"', + webpage, 'video title') + description = self._html_search_regex( + r'<meta name="description" content="([^"]*)"/>', + webpage, 'video description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) + + video_url = 'http://media.sztv.hu/vod/' + video_file + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'ext': determine_ext(video_url), + 'description': description, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/techtalks.py b/youtube_dl/extractor/techtalks.py new file mode 100644 index 000000000..a55f236cb --- /dev/null +++ b/youtube_dl/extractor/techtalks.py @@ -0,0 +1,65 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + get_element_by_attribute, + clean_html, +) + + +class TechTalksIE(InfoExtractor): + _VALID_URL = r'https?://techtalks\.tv/talks/[^/]*/(?P<id>\d+)/' + + _TEST = { + u'url': u'http://techtalks.tv/talks/learning-topic-models-going-beyond-svd/57758/', + u'playlist': [ + { + u'file': u'57758.flv', + u'info_dict': { + u'title': u'Learning Topic Models --- Going beyond SVD', + }, + }, + { + u'file': u'57758-slides.flv', + u'info_dict': { + u'title': u'Learning Topic Models --- Going beyond SVD', + }, + }, + ], + u'params': { + # rtmp download + u'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + talk_id = mobj.group('id') + webpage = self._download_webpage(url, talk_id) + rtmp_url = self._search_regex(r'netConnectionUrl: \'(.*?)\'', webpage, + u'rtmp url') + play_path = self._search_regex(r'href=\'(.*?)\' [^>]*id="flowplayer_presenter"', + webpage, u'presenter play path') + title = clean_html(get_element_by_attribute('class', 'title', webpage)) + video_info = { + 'id': talk_id, + 'title': title, + 'url': rtmp_url, + 'play_path': play_path, + 'ext': 'flv', + } + m_slides = re.search(r'<a class="slides" href=\'(.*?)\'', webpage) + if m_slides is None: + return video_info + else: + return [ + video_info, + # The slides video + { + 'id': talk_id + '-slides', + 'title': title, + 'url': rtmp_url, + 'play_path': m_slides.group(1), + 'ext': 'flv', + }, + ] diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index 1405b73f7..79679a14a 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -7,15 +7,25 @@ from .common import InfoExtractor class TudouIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs)/(?:view|(.+?))/(?:([^/]+)|([^/]+))(?:\.html)?' - _TEST = { + _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs|albumplay)/(?:view|(.+?))/(?:([^/]+)|([^/]+))(?:\.html)?' + _TESTS = [{ u'url': u'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html', u'file': u'159448201.f4v', u'md5': u'140a49ed444bd22f93330985d8475fcb', u'info_dict': { u"title": u"卡马乔国足开大脚长传冲吊集锦" } - } + }, + { + u'url': u'http://www.tudou.com/albumplay/TenTw_JgiPM/PzsAs5usU9A.html', + u'file': u'todo.mp4', + u'md5': u'todo.mp4', + u'info_dict': { + u'title': u'todo.mp4', + }, + u'add_ie': [u'Youku'], + u'skip': u'Only works from China' + }] def _url_for_id(self, id, quality = None): info_url = "http://v2.tudou.com/f?id="+str(id) @@ -29,14 +39,18 @@ class TudouIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(2) webpage = self._download_webpage(url, video_id) - title = re.search(",kw:\"(.+)\"",webpage) - if title is None: - title = re.search(",kw: \'(.+)\'",webpage) - title = title.group(1) - thumbnail_url = re.search(",pic: \'(.+?)\'",webpage) - if thumbnail_url is None: - thumbnail_url = re.search(",pic:\"(.+?)\"",webpage) - thumbnail_url = thumbnail_url.group(1) + + m = re.search(r'vcode:\s*[\'"](.+?)[\'"]', webpage) + if m and m.group(1): + return { + '_type': 'url', + 'url': u'youku:' + m.group(1), + 'ie_key': 'Youku' + } + + title = self._search_regex(r",kw:['\"](.+?)[\"']", webpage, u'title') + thumbnail_url = self._search_regex( + r",pic:\s*[\"'](.+?)[\"']", webpage, u'thumbnail URL', fatal=False) segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments') segments = json.loads(segs_json) diff --git a/youtube_dl/extractor/videodetective.py b/youtube_dl/extractor/videodetective.py new file mode 100644 index 000000000..d89f84094 --- /dev/null +++ b/youtube_dl/extractor/videodetective.py @@ -0,0 +1,30 @@ +import re + +from .common import InfoExtractor +from .internetvideoarchive import InternetVideoArchiveIE +from ..utils import ( + compat_urlparse, +) + + +class VideoDetectiveIE(InfoExtractor): + _VALID_URL = r'https?://www\.videodetective\.com/[^/]+/[^/]+/(?P<id>\d+)' + + _TEST = { + u'url': u'http://www.videodetective.com/movies/kick-ass-2/194487', + u'file': u'194487.mp4', + u'info_dict': { + u'title': u'KICK-ASS 2', + u'description': u'md5:65ba37ad619165afac7d432eaded6013', + u'duration': 138, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + og_video = self._og_search_video_url(webpage) + query = compat_urlparse.urlparse(og_video).query + return self.url_result(InternetVideoArchiveIE._build_url(query), + ie=InternetVideoArchiveIE.ie_key()) diff --git a/youtube_dl/extractor/videopremium.py b/youtube_dl/extractor/videopremium.py new file mode 100644 index 000000000..65f39b982 --- /dev/null +++ b/youtube_dl/extractor/videopremium.py @@ -0,0 +1,40 @@ +import re +import random + +from .common import InfoExtractor + + +class VideoPremiumIE(InfoExtractor): + _VALID_URL = r'(?:https?://)?(?:www\.)?videopremium\.tv/(?P<id>\w+)(?:/.*)?' + _TEST = { + u'url': u'http://videopremium.tv/4w7oadjsf156', + u'file': u'4w7oadjsf156.f4v', + u'info_dict': { + u"title": u"youtube-dl_test_video____a_________-BaW_jenozKc.mp4.mp4" + }, + u'params': { + u'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + webpage_url = 'http://videopremium.tv/' + video_id + webpage = self._download_webpage(webpage_url, video_id) + + self.report_extraction(video_id) + + video_title = self._html_search_regex(r'<h2(?:.*?)>\s*(.+?)\s*<', + webpage, u'video title') + + return [{ + 'id': video_id, + 'url': "rtmp://e%d.md.iplay.md/play" % random.randint(1, 16), + 'play_path': "mp4:%s.f4v" % video_id, + 'page_url': "http://videopremium.tv/" + video_id, + 'player_url': "http://videopremium.tv/uplayer/uppod.swf", + 'ext': 'f4v', + 'title': video_title, + }] diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index cea29f035..2de56ac81 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -11,6 +11,7 @@ from ..utils import ( get_element_by_attribute, ExtractorError, std_headers, + unsmuggle_url, ) class VimeoIE(InfoExtractor): @@ -53,7 +54,7 @@ class VimeoIE(InfoExtractor): u'title': u'Kathy Sierra: Building the minimum Badass User, Business of Software', u'uploader': u'The BLN & Business of Software', }, - }, + } ] def _login(self): @@ -98,6 +99,12 @@ class VimeoIE(InfoExtractor): self._login() def _real_extract(self, url, new_video=True): + url, data = unsmuggle_url(url) + headers = std_headers + if data is not None: + headers = headers.copy() + headers.update(data) + # Extract ID from URL mobj = re.match(self._VALID_URL, url) if mobj is None: @@ -112,7 +119,7 @@ class VimeoIE(InfoExtractor): url = 'https://vimeo.com/' + video_id # Retrieve video webpage to extract further information - request = compat_urllib_request.Request(url, None, std_headers) + request = compat_urllib_request.Request(url, None, headers) webpage = self._download_webpage(request, video_id) # Now we begin extracting as much information as we can from what we diff --git a/youtube_dl/extractor/websurg.py b/youtube_dl/extractor/websurg.py new file mode 100644 index 000000000..43953bfdd --- /dev/null +++ b/youtube_dl/extractor/websurg.py @@ -0,0 +1,59 @@ +# coding: utf-8 + +import re + +from ..utils import ( + compat_urllib_request, + compat_urllib_parse +) + +from .common import InfoExtractor + +class WeBSurgIE(InfoExtractor): + IE_NAME = u'websurg.com' + _VALID_URL = r'http://.*?\.websurg\.com/MEDIA/\?noheader=1&doi=(.*)' + + _TEST = { + u'url': u'http://www.websurg.com/MEDIA/?noheader=1&doi=vd01en4012', + u'file': u'vd01en4012.mp4', + u'params': { + u'skip_download': True, + }, + u'skip': u'Requires login information', + } + + _LOGIN_URL = 'http://www.websurg.com/inc/login/login_div.ajax.php?login=1' + + def _real_initialize(self): + + login_form = { + 'username': self._downloader.params['username'], + 'password': self._downloader.params['password'], + 'Submit': 1 + } + + request = compat_urllib_request.Request( + self._LOGIN_URL, compat_urllib_parse.urlencode(login_form)) + request.add_header( + 'Content-Type', 'application/x-www-form-urlencoded;charset=utf-8') + compat_urllib_request.urlopen(request).info() + webpage = self._download_webpage(self._LOGIN_URL, '', 'Logging in') + + if webpage != 'OK': + self._downloader.report_error( + u'Unable to log in: bad username/password') + + def _real_extract(self, url): + video_id = re.match(self._VALID_URL, url).group(1) + + webpage = self._download_webpage(url, video_id) + + url_info = re.search(r'streamer="(.*?)" src="(.*?)"', webpage) + + return {'id': video_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'ext' : 'mp4', + 'url' : url_info.group(1) + '/' + url_info.group(2), + 'thumbnail': self._og_search_thumbnail(webpage) + } diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 00fa2ccb5..9d88c17f5 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -13,7 +13,7 @@ from ..utils import ( class YoukuIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(v|player)\.youku\.com/(v_show/id_|player\.php/sid/)(?P<ID>[A-Za-z0-9]+)(\.html|/v.swf)' + _VALID_URL = r'(?:(?:http://)?(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|youku:)(?P<ID>[A-Za-z0-9]+)(?:\.html|/v\.swf|)' _TEST = { u"url": u"http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html", u"file": u"XNDgyMDQ2NTQw_part00.flv", diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8222a880f..fb7c42830 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1150,7 +1150,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): list_page = self._download_webpage(list_url, video_id) caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8')) original_lang_node = caption_list.find('track') - if original_lang_node.attrib.get('kind') != 'asr' : + if not original_lang_node or original_lang_node.attrib.get('kind') != 'asr' : self._downloader.report_warning(u'Video doesn\'t have automatic captions') return {} original_lang = original_lang_node.attrib['lang_code'] @@ -1250,6 +1250,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): url_map[itag] = format_url return url_map + def _extract_annotations(self, video_id): + url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id + return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.') + def _real_extract(self, url): # Extract original video URL from URL with redirection, like age verification, using next_url parameter mobj = re.search(self._NEXT_URL_RE, url) @@ -1382,6 +1386,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): else: video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) + # annotations + video_annotations = None + if self._downloader.params.get('writeannotations', False): + video_annotations = self._extract_annotations(video_id) + # Decide which formats to download try: @@ -1495,6 +1504,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'subtitles': video_subtitles, 'duration': video_duration, 'age_limit': 18 if age_gate else 0, + 'annotations': video_annotations }) return results diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 82a1daeb9..833f981f2 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -9,6 +9,7 @@ import io import json import locale import os +import pipes import platform import re import socket @@ -229,6 +230,19 @@ else: return f return None +# On python2.6 the xml.etree.ElementTree.Element methods don't support +# the namespace parameter +def xpath_with_ns(path, ns_map): + components = [c.split(':') for c in path.split('/')] + replaced = [] + for c in components: + if len(c) == 1: + replaced.append(c[0]) + else: + ns, tag = c + replaced.append('{%s}%s' % (ns_map[ns], tag)) + return '/'.join(replaced) + def htmlentity_transform(matchobj): """Transforms an HTML entity to a character. @@ -927,3 +941,24 @@ class locked_file(object): def read(self, *args): return self.f.read(*args) + + +def shell_quote(args): + return ' '.join(map(pipes.quote, args)) + + +def smuggle_url(url, data): + """ Pass additional data in a URL for internal use. """ + + sdata = compat_urllib_parse.urlencode( + {u'__youtubedl_smuggle': json.dumps(data)}) + return url + u'#' + sdata + + +def unsmuggle_url(smug_url): + if not '#__youtubedl_smuggle' in smug_url: + return smug_url, None + url, _, sdata = smug_url.rpartition(u'#') + jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0] + data = json.loads(jsond) + return url, data diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 1004af116..22a51ffe6 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.10.09' +__version__ = '2013.10.17' |