diff options
Diffstat (limited to 'youtube_dl')
54 files changed, 1924 insertions, 576 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 42cbcf699..62ccad20c 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -409,6 +409,13 @@ class YoutubeDL(object): template_dict['autonumber'] = autonumber_templ % self._num_downloads if template_dict.get('playlist_index') is not None: template_dict['playlist_index'] = '%05d' % template_dict['playlist_index'] + if template_dict.get('resolution') is None: + if template_dict.get('width') and template_dict.get('height'): + template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height']) + elif template_dict.get('height'): + res = '%sp' % template_dict['height'] + elif template_dict.get('width'): + res = '?x%d' % template_dict['width'] sanitize = lambda k, v: sanitize_filename( compat_str(v), @@ -675,6 +682,9 @@ class YoutubeDL(object): info_dict['playlist'] = None info_dict['playlist_index'] = None + if 'display_id' not in info_dict and 'id' in info_dict: + info_dict['display_id'] = info_dict['id'] + # This extractors handle format selection themselves if info_dict['extractor'] in ['Youku']: if download: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 70608066c..d39eb830f 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -46,12 +46,17 @@ __authors__ = ( 'Andreas Schmitz', 'Michael Kaiser', 'Niklas Laxström', + 'David Triendl', + 'Anthony Weems', + 'David Wagner', + 'Juan C. Olivares', ) __license__ = 'Public Domain' import codecs import getpass +import io import locale import optparse import os @@ -70,6 +75,7 @@ from .utils import ( get_cachedir, MaxDownloadsReached, preferredencoding, + read_batch_urls, SameFileError, setproctitle, std_headers, @@ -424,6 +430,8 @@ def parseOpts(overrideArguments=None): '%(extractor)s for the provider (youtube, metacafe, etc), ' '%(id)s for the video id, %(playlist)s for the playlist the video is in, ' '%(playlist_index)s for the position in the playlist and %% for a literal percent. ' + '%(height)s and %(width)s for the width and height of the video format. ' + '%(resolution)s for a textual description of the resolution of the video format. ' 'Use - to output to stdout. Can also be used to download to a different directory, ' 'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .')) filesystem.add_option('--autonumber-size', @@ -551,21 +559,19 @@ def _real_main(argv=None): sys.exit(0) # Batch file verification - batchurls = [] + batch_urls = [] if opts.batchfile is not None: try: if opts.batchfile == '-': batchfd = sys.stdin else: - batchfd = open(opts.batchfile, 'r') - batchurls = batchfd.readlines() - batchurls = [x.strip() for x in batchurls] - batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)] + batchfd = io.open(opts.batchfile, 'r', encoding='utf-8', errors='ignore') + batch_urls = read_batch_urls(batchfd) if opts.verbose: - write_string(u'[debug] Batch file urls: ' + repr(batchurls) + u'\n') + write_string(u'[debug] Batch file urls: ' + repr(batch_urls) + u'\n') except IOError: sys.exit(u'ERROR: batch file could not be read') - all_urls = batchurls + args + all_urls = batch_urls + args all_urls = [url.strip() for url in all_urls] _enc = preferredencoding() all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls] diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 2a870a758..4e6abfe10 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -12,7 +12,6 @@ from .http import HttpFD from ..utils import ( struct_pack, struct_unpack, - compat_urllib_request, compat_urlparse, format_bytes, encodeFilename, @@ -117,8 +116,8 @@ class FlvReader(io.BytesIO): self.read_unsigned_char() # flags self.read(3) - # BootstrapinfoVersion - bootstrap_info_version = self.read_unsigned_int() + + self.read_unsigned_int() # BootstrapinfoVersion # Profile,Live,Update,Reserved self.read(1) # time scale @@ -127,15 +126,15 @@ class FlvReader(io.BytesIO): self.read_unsigned_long_long() # SmpteTimeCodeOffset self.read_unsigned_long_long() - # MovieIdentifier - movie_identifier = self.read_string() + + self.read_string() # MovieIdentifier server_count = self.read_unsigned_char() # ServerEntryTable for i in range(server_count): self.read_string() quality_count = self.read_unsigned_char() # QualityEntryTable - for i in range(server_count): + for i in range(quality_count): self.read_string() # DrmData self.read_string() diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index 748f9f3ad..240ecb606 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -85,6 +85,7 @@ class HttpFD(FileDownloader): else: # The length does not match, we start the download over self.report_unable_to_resume() + resume_len = 0 open_mode = 'wb' break # Retry diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 229bdc595..f35ee4941 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -19,12 +19,15 @@ from .bbccouk import BBCCoUkIE from .blinkx import BlinkxIE from .bliptv import BlipTVIE, BlipTVUserIE from .bloomberg import BloombergIE +from .br import BRIE from .breakcom import BreakIE from .brightcove import BrightcoveIE from .c56 import C56IE +from .canal13cl import Canal13clIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .cbs import CBSIE +from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE from .chilloutzone import ChilloutzoneIE from .cinemassacre import CinemassacreIE @@ -88,6 +91,7 @@ from .funnyordie import FunnyOrDieIE from .gamekings import GamekingsIE from .gamespot import GameSpotIE from .gametrailers import GametrailersIE +from .gdcvault import GDCVaultIE from .generic import GenericIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE @@ -132,11 +136,12 @@ from .lynda import ( ) from .m6 import M6IE from .macgamestore import MacGameStoreIE +from .mailru import MailRuIE from .malemotion import MalemotionIE from .mdr import MDRIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE -from .mit import TechTVMITIE, MITIE +from .mit import TechTVMITIE, MITIE, OCWMITIE from .mixcloud import MixcloudIE from .mpora import MporaIE from .mofosex import MofosexIE @@ -151,7 +156,10 @@ from .myspass import MySpassIE from .myvideo import MyVideoIE from .naver import NaverIE from .nba import NBAIE -from .nbc import NBCNewsIE +from .nbc import ( + NBCIE, + NBCNewsIE, +) from .ndr import NDRIE from .ndtv import NDTVIE from .newgrounds import NewgroundsIE @@ -160,7 +168,7 @@ from .nhl import NHLIE, NHLVideocenterIE from .niconico import NiconicoIE from .ninegag import NineGagIE from .normalboots import NormalbootsIE -from .novamov import NovamovIE +from .novamov import NovaMovIE from .nowness import NownessIE from .nowvideo import NowVideoIE from .ooyala import OoyalaIE @@ -171,6 +179,7 @@ from .podomatic import PodomaticIE from .pornhd import PornHdIE from .pornhub import PornHubIE from .pornotube import PornotubeIE +from .prosiebensat1 import ProSiebenSat1IE from .pyvideo import PyvideoIE from .radiofrance import RadioFranceIE from .rbmaradio import RBMARadioIE @@ -225,10 +234,12 @@ from .tinypic import TinyPicIE from .toutv import TouTvIE from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE +from .trutube import TruTubeIE from .tube8 import Tube8IE from .tudou import TudouIE from .tumblr import TumblrIE from .tutv import TutvIE +from .tvigle import TvigleIE from .tvp import TvpIE from .unistra import UnistraIE from .ustream import UstreamIE, UstreamChannelIE @@ -239,6 +250,7 @@ from .vesti import VestiIE from .vevo import VevoIE from .vice import ViceIE from .viddler import ViddlerIE +from .videobam import VideoBamIE from .videodetective import VideoDetectiveIE from .videofyme import VideofyMeIE from .videopremium import VideoPremiumIE @@ -273,19 +285,20 @@ from .youku import YoukuIE from .youporn import YouPornIE from .youtube import ( YoutubeIE, + YoutubeChannelIE, + YoutubeFavouritesIE, + YoutubeHistoryIE, YoutubePlaylistIE, - YoutubeSearchIE, + YoutubeRecommendedIE, YoutubeSearchDateIE, - YoutubeUserIE, - YoutubeChannelIE, + YoutubeSearchIE, + YoutubeSearchURLIE, YoutubeShowIE, YoutubeSubscriptionsIE, - YoutubeRecommendedIE, + YoutubeTopListIE, YoutubeTruncatedURLIE, + YoutubeUserIE, YoutubeWatchLaterIE, - YoutubeFavouritesIE, - YoutubeHistoryIE, - YoutubeTopListIE, ) from .zdf import ZDFIE diff --git a/youtube_dl/extractor/academicearth.py b/youtube_dl/extractor/academicearth.py index 72f81d01a..59d3bbba4 100644 --- a/youtube_dl/extractor/academicearth.py +++ b/youtube_dl/extractor/academicearth.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class AcademicEarthCourseIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/(?:courses|playlists)/(?P<id>[^?#/]+)' + _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/playlists/(?P<id>[^?#/]+)' IE_NAME = 'AcademicEarth:Course' def _real_extract(self, url): @@ -14,12 +14,12 @@ class AcademicEarthCourseIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) title = self._html_search_regex( - r'<h1 class="playlist-name">(.*?)</h1>', webpage, u'title') + r'<h1 class="playlist-name"[^>]*?>(.*?)</h1>', webpage, u'title') description = self._html_search_regex( - r'<p class="excerpt">(.*?)</p>', + r'<p class="excerpt"[^>]*?>(.*?)</p>', webpage, u'description', fatal=False) urls = re.findall( - r'<h3 class="lecture-title"><a target="_blank" href="([^"]+)">', + r'<li class="lecture-preview">\s*?<a target="_blank" href="([^"]+)">', webpage) entries = [self.url_result(u) for u in urls] diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py new file mode 100644 index 000000000..5fcc1084a --- /dev/null +++ b/youtube_dl/extractor/br.py @@ -0,0 +1,80 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class BRIE(InfoExtractor): + IE_DESC = "Bayerischer Rundfunk Mediathek" + _VALID_URL = r"^https?://(?:www\.)?br\.de/mediathek/video/(?:sendungen/)?(?P<id>[a-z0-9\-]+)\.html$" + _BASE_URL = "http://www.br.de" + + _TEST = { + "url": "http://www.br.de/mediathek/video/anselm-gruen-114.html", + "md5": "c4f83cf0f023ba5875aba0bf46860df2", + "info_dict": { + "id": "2c8d81c5-6fb7-4a74-88d4-e768e5856532", + "ext": "mp4", + "title": "Feiern und Verzichten", + "description": "Anselm Grün: Feiern und Verzichten", + "uploader": "BR/Birgit Baier", + "upload_date": "20140301" + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') + page = self._download_webpage(url, display_id) + xml_url = self._search_regex( + r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/mediathek/video/[a-z0-9/~_.-]+)'}\)\);", page, "XMLURL") + xml = self._download_xml(self._BASE_URL + xml_url, None) + + videos = [{ + "id": xml_video.get("externalId"), + "title": xml_video.find("title").text, + "formats": self._extract_formats(xml_video.find("assets")), + "thumbnails": self._extract_thumbnails(xml_video.find("teaserImage/variants")), + "description": " ".join(xml_video.find("shareTitle").text.splitlines()), + "uploader": xml_video.find("author").text, + "upload_date": "".join(reversed(xml_video.find("broadcastDate").text.split("."))), + "webpage_url": xml_video.find("permalink").text, + } for xml_video in xml.findall("video")] + + if len(videos) > 1: + self._downloader.report_warning( + 'found multiple videos; please ' + 'report this with the video URL to http://yt-dl.org/bug') + if not videos: + raise ExtractorError('No video entries found') + return videos[0] + + def _extract_formats(self, assets): + formats = [{ + "url": asset.find("downloadUrl").text, + "ext": asset.find("mediaType").text, + "format_id": asset.get("type"), + "width": int(asset.find("frameWidth").text), + "height": int(asset.find("frameHeight").text), + "tbr": int(asset.find("bitrateVideo").text), + "abr": int(asset.find("bitrateAudio").text), + "vcodec": asset.find("codecVideo").text, + "container": asset.find("mediaType").text, + "filesize": int(asset.find("size").text), + } for asset in assets.findall("asset") + if asset.find("downloadUrl") is not None] + + self._sort_formats(formats) + return formats + + def _extract_thumbnails(self, variants): + thumbnails = [{ + "url": self._BASE_URL + variant.find("url").text, + "width": int(variant.find("width").text), + "height": int(variant.find("height").text), + } for variant in variants.findall("variant")] + thumbnails.sort(key=lambda x: x["width"] * x["height"], reverse=True) + return thumbnails diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 8ec6dda49..85635d1cc 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -23,8 +23,8 @@ class BreakIE(InfoExtractor): video_id = mobj.group(1).split("-")[-1] embed_url = 'http://www.break.com/embed/%s' % video_id webpage = self._download_webpage(embed_url, video_id) - info_json = self._search_regex(r'var embedVars = ({.*?});', webpage, - 'info json', flags=re.DOTALL) + info_json = self._search_regex(r'var embedVars = ({.*})\s*?</script>', + webpage, 'info json', flags=re.DOTALL) info = json.loads(info_json) video_url = info['videoUri'] m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', video_url) diff --git a/youtube_dl/extractor/canal13cl.py b/youtube_dl/extractor/canal13cl.py new file mode 100644 index 000000000..93241fefe --- /dev/null +++ b/youtube_dl/extractor/canal13cl.py @@ -0,0 +1,48 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class Canal13clIE(InfoExtractor): + _VALID_URL = r'^http://(?:www\.)?13\.cl/(?:[^/?#]+/)*(?P<id>[^/?#]+)' + _TEST = { + 'url': 'http://www.13.cl/t13/nacional/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', + 'md5': '4cb1fa38adcad8fea88487a078831755', + 'info_dict': { + 'id': '1403022125', + 'display_id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', + 'ext': 'mp4', + 'title': 'El "círculo de hierro" de Michelle Bachelet en su regreso a La Moneda', + 'description': '(Foto: Agencia Uno) En nueve días más, Michelle Bachelet va a asumir por segunda vez como presidenta de la República. Entre aquellos que la acompañarán hay caras que se repiten y otras que se consolidan en su entorno de colaboradores más cercanos.', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') + + webpage = self._download_webpage(url, display_id) + + title = self._html_search_meta( + 'twitter:title', webpage, 'title', fatal=True) + description = self._html_search_meta( + 'twitter:description', webpage, 'description') + url = self._html_search_regex( + r'articuloVideo = \"(.*?)\"', webpage, 'url') + real_id = self._search_regex( + r'[^0-9]([0-9]{7,})[^0-9]', url, 'id', default=display_id) + thumbnail = self._html_search_regex( + r'articuloImagen = \"(.*?)\"', webpage, 'thumbnail') + + return { + 'id': real_id, + 'display_id': display_id, + 'url': url, + 'title': title, + 'description': description, + 'ext': 'mp4', + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py new file mode 100644 index 000000000..90a3dddb9 --- /dev/null +++ b/youtube_dl/extractor/ceskatelevize.py @@ -0,0 +1,126 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_request, + compat_urllib_parse, + compat_urllib_parse_urlparse, + ExtractorError, +) + + +class CeskaTelevizeIE(InfoExtractor): + _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P<id>[^?#]+)' + + _TESTS = [ + { + 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/213512120230004-spanelska-chripka', + 'info_dict': { + 'id': '213512120230004', + 'ext': 'flv', + 'title': 'První republika: Španělská chřipka', + 'duration': 3107.4, + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + 'skip': 'Works only from Czech Republic.', + }, + { + 'url': 'http://www.ceskatelevize.cz/ivysilani/1030584952-tsatsiki-maminka-a-policajt', + 'info_dict': { + 'id': '20138143440', + 'ext': 'flv', + 'title': 'Tsatsiki, maminka a policajt', + 'duration': 6754.1, + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + 'skip': 'Works only from Czech Republic.', + }, + { + 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina', + 'info_dict': { + 'id': '14716', + 'ext': 'flv', + 'title': 'První republika: Zpěvačka z Dupárny Bobina', + 'duration': 90, + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + }, + ] + + def _real_extract(self, url): + url = url.replace('/porady/', '/ivysilani/').replace('/video/', '') + + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.' + if '%s</p>' % NOT_AVAILABLE_STRING in webpage: + raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) + + typ = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', webpage, 'type') + episode_id = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', webpage, 'episode_id') + + data = { + 'playlist[0][type]': typ, + 'playlist[0][id]': episode_id, + 'requestUrl': compat_urllib_parse_urlparse(url).path, + 'requestSource': 'iVysilani', + } + + req = compat_urllib_request.Request('http://www.ceskatelevize.cz/ivysilani/ajax/get-playlist-url', + data=compat_urllib_parse.urlencode(data)) + + req.add_header('Content-type', 'application/x-www-form-urlencoded') + req.add_header('x-addr', '127.0.0.1') + req.add_header('X-Requested-With', 'XMLHttpRequest') + req.add_header('Referer', url) + + playlistpage = self._download_json(req, video_id) + + req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlistpage['url'])) + req.add_header('Referer', url) + + playlist = self._download_xml(req, video_id) + + formats = [] + for i in playlist.find('smilRoot/body'): + if 'AD' not in i.attrib['id']: + base_url = i.attrib['base'] + parsedurl = compat_urllib_parse_urlparse(base_url) + duration = i.attrib['duration'] + + for video in i.findall('video'): + if video.attrib['label'] != 'AD': + format_id = video.attrib['label'] + play_path = video.attrib['src'] + vbr = int(video.attrib['system-bitrate']) + + formats.append({ + 'format_id': format_id, + 'url': base_url, + 'vbr': vbr, + 'play_path': play_path, + 'app': parsedurl.path[1:] + '?' + parsedurl.query, + 'rtmp_live': True, + 'ext': 'flv', + }) + + self._sort_formats(formats) + + return { + 'id': episode_id, + 'title': self._html_search_regex(r'<title>(.+?) — iVysílání — Česká televize</title>', webpage, 'title'), + 'duration': float(duration), + 'formats': formats, + } diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index f0d08cebf..bfbffefdc 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -1,4 +1,5 @@ # encoding: utf-8 +from __future__ import unicode_literals import re from .common import InfoExtractor @@ -8,73 +9,63 @@ from ..utils import ( class CinemassacreIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?(?P<url>cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/.+?)(?:[/?].*)?' - _TESTS = [{ - u'url': u'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', - u'file': u'19911.flv', - u'info_dict': { - u'upload_date': u'20121110', - u'title': u'“Angry Video Game Nerd: The Movie” – Trailer', - u'description': u'md5:fb87405fcb42a331742a0dce2708560b', - }, - u'params': { - # rtmp download - u'skip_download': True, - }, - }, - { - u'url': u'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', - u'file': u'521be8ef82b16.flv', - u'info_dict': { - u'upload_date': u'20131002', - u'title': u'The Mummy’s Hand (1940)', - }, - u'params': { - # rtmp download - u'skip_download': True, + _VALID_URL = r'http://(?:www\.)?cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/.+?' + _TESTS = [ + { + 'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', + 'file': '19911.mp4', + 'md5': 'fde81fbafaee331785f58cd6c0d46190', + 'info_dict': { + 'upload_date': '20121110', + 'title': '“Angry Video Game Nerd: The Movie” – Trailer', + 'description': 'md5:fb87405fcb42a331742a0dce2708560b', + }, }, - }] + { + 'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', + 'file': '521be8ef82b16.mp4', + 'md5': 'd72f10cd39eac4215048f62ab477a511', + 'info_dict': { + 'upload_date': '20131002', + 'title': 'The Mummy’s Hand (1940)', + }, + } + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - webpage_url = u'http://' + mobj.group('url') - webpage = self._download_webpage(webpage_url, None) # Don't know video id yet + webpage = self._download_webpage(url, None) # Don't know video id yet video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d') mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage) if not mobj: - raise ExtractorError(u'Can\'t extract embed url and video id') - playerdata_url = mobj.group(u'embed_url') - video_id = mobj.group(u'video_id') + raise ExtractorError('Can\'t extract embed url and video id') + playerdata_url = mobj.group('embed_url') + video_id = mobj.group('video_id') video_title = self._html_search_regex(r'<title>(?P<title>.+?)\|', - webpage, u'title') + webpage, 'title') video_description = self._html_search_regex(r'<div class="entry-content">(?P<description>.+?)</div>', - webpage, u'description', flags=re.DOTALL, fatal=False) + webpage, 'description', flags=re.DOTALL, fatal=False) if len(video_description) == 0: video_description = None playerdata = self._download_webpage(playerdata_url, video_id) - url = self._html_search_regex(r'\'streamer\': \'(?P<url>[^\']+)\'', playerdata, u'url') - sd_file = self._html_search_regex(r'\'file\': \'(?P<sd_file>[^\']+)\'', playerdata, u'sd_file') - hd_file = self._html_search_regex(r'\'?file\'?: "(?P<hd_file>[^"]+)"', playerdata, u'hd_file') - video_thumbnail = self._html_search_regex(r'\'image\': \'(?P<thumbnail>[^\']+)\'', playerdata, u'thumbnail', fatal=False) + sd_url = self._html_search_regex(r'file: \'(?P<sd_file>[^\']+)\', label: \'SD\'', playerdata, 'sd_file') + hd_url = self._html_search_regex(r'file: \'(?P<hd_file>[^\']+)\', label: \'HD\'', playerdata, 'hd_file') + video_thumbnail = self._html_search_regex(r'image: \'(?P<thumbnail>[^\']+)\'', playerdata, 'thumbnail', fatal=False) formats = [ { - 'url': url, - 'play_path': 'mp4:' + sd_file, - 'rtmp_live': True, # workaround - 'ext': 'flv', + 'url': sd_url, + 'ext': 'mp4', 'format': 'sd', 'format_id': 'sd', }, { - 'url': url, - 'play_path': 'mp4:' + hd_file, - 'rtmp_live': True, # workaround - 'ext': 'flv', + 'url': hd_url, + 'ext': 'mp4', 'format': 'hd', 'format_id': 'hd', }, diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py index 10c925dfe..6b9fa4209 100644 --- a/youtube_dl/extractor/collegehumor.py +++ b/youtube_dl/extractor/collegehumor.py @@ -40,7 +40,7 @@ class CollegeHumorIE(InfoExtractor): 'id': 'W5gMp3ZjYg4', 'ext': 'mp4', 'title': 'Funny Dogs Protecting Babies Compilation 2014 [NEW HD]', - 'uploader': 'Funnyplox TV', + 'uploader': 'FunnyPlox TV', 'uploader_id': 'funnyploxtv', 'description': 'md5:7ded37421526d54afdf005e25bc2b7a3', 'upload_date': '20140128', diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 84fca8ba0..7ee95fe39 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -88,6 +88,10 @@ class InfoExtractor(object): The following fields are optional: + display_id An alternative identifier for the video, not necessarily + unique, but available before title. Typically, id is + something like "4234987", title "Dancing naked mole rats", + and display_id "dancing-naked-mole-rats" thumbnails: A list of dictionaries (with the entries "resolution" and "url") for the varying thumbnails thumbnail: Full URL to a video thumbnail image. @@ -432,14 +436,14 @@ class InfoExtractor(object): if secure: regexes = self._og_regexes('video:secure_url') + regexes return self._html_search_regex(regexes, html, name, **kargs) - def _html_search_meta(self, name, html, display_name=None): + def _html_search_meta(self, name, html, display_name=None, fatal=False): if display_name is None: display_name = name return self._html_search_regex( r'''(?ix)<meta (?=[^>]+(?:itemprop|name|property)=["\']%s["\']) [^>]+content=["\']([^"\']+)["\']''' % re.escape(name), - html, display_name, fatal=False) + html, display_name, fatal=fatal) def _dc_search_uploader(self, html): return self._html_search_meta('dc.creator', html, 'uploader') diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 920728e01..026a9177e 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -1,7 +1,11 @@ # encoding: utf-8 from __future__ import unicode_literals -import re, base64, zlib +import re +import json +import base64 +import zlib + from hashlib import sha1 from math import pow, sqrt, floor from .common import InfoExtractor @@ -19,13 +23,15 @@ from ..aes import ( inc, ) + class CrunchyrollIE(InfoExtractor): - _VALID_URL = r'(?:https?://)?(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)' - _TESTS = [{ + _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)' + _TEST = { 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', - 'file': '645513.flv', #'md5': 'b1639fd6ddfaa43788c85f6d1dddd412', 'info_dict': { + 'id': '645513', + 'ext': 'flv', 'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!', 'description': 'md5:2d17137920c64f2f49981a7797d275ef', 'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg', @@ -36,7 +42,7 @@ class CrunchyrollIE(InfoExtractor): # rtmp 'skip_download': True, }, - }] + } _FORMAT_IDS = { '360': ('60', '106'), @@ -68,7 +74,7 @@ class CrunchyrollIE(InfoExtractor): shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode('ascii')).digest()) # Extend 160 Bit hash to 256 Bit return shaHash + [0] * 12 - + key = obfuscate_key(id) class Counter: __value = iv @@ -80,9 +86,8 @@ class CrunchyrollIE(InfoExtractor): return zlib.decompress(decrypted_data) def _convert_subtitles_to_srt(self, subtitles): - i=1 output = '' - for start, end, text in re.findall(r'<event [^>]*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles): + for i, (start, end, text) in enumerate(re.findall(r'<event [^>]*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles), 1): start = start.replace('.', ',') end = end.replace('.', ',') text = clean_html(text) @@ -90,7 +95,6 @@ class CrunchyrollIE(InfoExtractor): if not text: continue output += '%d\n%s --> %s\n%s\n\n' % (i, start, end, text) - i+=1 return output def _real_extract(self,url): @@ -108,6 +112,12 @@ class CrunchyrollIE(InfoExtractor): if note_m: raise ExtractorError(note_m) + mobj = re.search(r'Page\.messaging_box_controller\.addItems\(\[(?P<msg>{.+?})\]\)', webpage) + if mobj: + msg = json.loads(mobj.group('msg')) + if msg.get('type') == 'error': + raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True) + video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, 'video_title', flags=re.DOTALL) video_title = re.sub(r' {2,}', ' ', video_title) video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='') @@ -123,7 +133,7 @@ class CrunchyrollIE(InfoExtractor): playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url}) playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') playerdata = self._download_webpage(playerdata_req, video_id, note='Downloading media info') - + stream_id = self._search_regex(r'<media_id>([^<]+)', playerdata, 'stream_id') video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, 'thumbnail', fatal=False) @@ -161,7 +171,7 @@ class CrunchyrollIE(InfoExtractor): data = base64.b64decode(data) subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8') - lang_code = self._search_regex(r'lang_code=\'([^\']+)', subtitle, 'subtitle_lang_code', fatal=False) + lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False) if not lang_code: continue subtitles[lang_code] = self._convert_subtitles_to_srt(subtitle) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 6685c94a3..10b97d8ca 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -12,6 +12,7 @@ from ..utils import ( get_element_by_id, orderedSet, str_to_int, + int_or_none, ExtractorError, ) @@ -124,7 +125,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): if video_url is not None: m_size = re.search(r'H264-(\d+)x(\d+)', video_url) if m_size is not None: - width, height = m_size.group(1), m_size.group(2) + width, height = map(int_or_none, (m_size.group(1), m_size.group(2))) else: width, height = None, None formats.append({ diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 8f9154c0e..f6b5f589a 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import json import re import socket @@ -26,20 +28,21 @@ class FacebookIE(InfoExtractor): _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1' _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1' _NETRC_MACHINE = 'facebook' - IE_NAME = u'facebook' + IE_NAME = 'facebook' _TEST = { - u'url': u'https://www.facebook.com/photo.php?v=120708114770723', - u'file': u'120708114770723.mp4', - u'md5': u'48975a41ccc4b7a581abd68651c1a5a8', - u'info_dict': { - u"duration": 279, - u"title": u"PEOPLE ARE AWESOME 2013" + 'url': 'https://www.facebook.com/photo.php?v=120708114770723', + 'md5': '48975a41ccc4b7a581abd68651c1a5a8', + 'info_dict': { + 'id': '120708114770723', + 'ext': 'mp4', + 'duration': 279, + 'title': 'PEOPLE ARE AWESOME 2013' } } def report_login(self): """Report attempt to log in.""" - self.to_screen(u'Logging in') + self.to_screen('Logging in') def _login(self): (useremail, password) = self._get_login_info() @@ -50,9 +53,11 @@ class FacebookIE(InfoExtractor): login_page_req.add_header('Cookie', 'locale=en_US') self.report_login() login_page = self._download_webpage(login_page_req, None, note=False, - errnote=u'Unable to download login page') - lsd = self._search_regex(r'"lsd":"(\w*?)"', login_page, u'lsd') - lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, u'lgnrnd') + errnote='Unable to download login page') + lsd = self._search_regex( + r'<input type="hidden" name="lsd" value="([^"]*)"', + login_page, 'lsd') + lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, 'lgnrnd') login_form = { 'email': useremail, @@ -70,22 +75,22 @@ class FacebookIE(InfoExtractor): try: login_results = compat_urllib_request.urlopen(request).read() if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None: - self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.') + self._downloader.report_warning('unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.') return check_form = { - 'fb_dtsg': self._search_regex(r'"fb_dtsg":"(.*?)"', login_results, u'fb_dtsg'), - 'nh': self._search_regex(r'name="nh" value="(\w*?)"', login_results, u'nh'), + 'fb_dtsg': self._search_regex(r'"fb_dtsg":"(.*?)"', login_results, 'fb_dtsg'), + 'nh': self._search_regex(r'name="nh" value="(\w*?)"', login_results, 'nh'), 'name_action_selected': 'dont_save', - 'submit[Continue]': self._search_regex(r'<input value="(.*?)" name="submit\[Continue\]"', login_results, u'continue'), + 'submit[Continue]': self._search_regex(r'<input value="(.*?)" name="submit\[Continue\]"', login_results, 'continue'), } check_req = compat_urllib_request.Request(self._CHECKPOINT_URL, compat_urllib_parse.urlencode(check_form)) check_req.add_header('Content-Type', 'application/x-www-form-urlencoded') check_response = compat_urllib_request.urlopen(check_req).read() if re.search(r'id="checkpointSubmitButton"', check_response) is not None: - self._downloader.report_warning(u'Unable to confirm login, you have to login in your brower and authorize the login.') + self._downloader.report_warning('Unable to confirm login, you have to login in your brower and authorize the login.') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) + self._downloader.report_warning('unable to log in: %s' % compat_str(err)) return def _real_initialize(self): @@ -94,7 +99,7 @@ class FacebookIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) + raise ExtractorError('Invalid URL: %s' % url) video_id = mobj.group('id') url = 'https://www.facebook.com/video/video.php?v=%s' % video_id @@ -107,10 +112,10 @@ class FacebookIE(InfoExtractor): m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage) if m_msg is not None: raise ExtractorError( - u'The video is not available, Facebook said: "%s"' % m_msg.group(1), + 'The video is not available, Facebook said: "%s"' % m_msg.group(1), expected=True) else: - raise ExtractorError(u'Cannot parse data') + raise ExtractorError('Cannot parse data') data = dict(json.loads(m.group(1))) params_raw = compat_urllib_parse.unquote(data['params']) params = json.loads(params_raw) @@ -119,12 +124,12 @@ class FacebookIE(InfoExtractor): if not video_url: video_url = video_data['sd_src'] if not video_url: - raise ExtractorError(u'Cannot find video URL') + raise ExtractorError('Cannot find video URL') video_duration = int(video_data['video_duration']) thumbnail = video_data['thumbnail_src'] video_title = self._html_search_regex( - r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, u'title') + r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title') info = { 'id': video_id, diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py index 8db7fc6cb..7d56b9be9 100644 --- a/youtube_dl/extractor/fourtube.py +++ b/youtube_dl/extractor/fourtube.py @@ -8,8 +8,8 @@ from ..utils import ( unified_strdate, str_to_int, parse_duration, + clean_html, ) -from youtube_dl.utils import clean_html class FourTubeIE(InfoExtractor): diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py new file mode 100644 index 000000000..89d5994ee --- /dev/null +++ b/youtube_dl/extractor/gdcvault.py @@ -0,0 +1,134 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + compat_urllib_request, +) + +class GDCVaultIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)/(?P<name>(\w|-)+)' + _TESTS = [ + { + 'url': 'http://www.gdcvault.com/play/1019721/Doki-Doki-Universe-Sweet-Simple', + 'md5': '7ce8388f544c88b7ac11c7ab1b593704', + 'info_dict': { + 'id': '1019721', + 'ext': 'mp4', + 'title': 'Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)' + } + }, + { + 'url': 'http://www.gdcvault.com/play/1015683/Embracing-the-Dark-Art-of', + 'info_dict': { + 'id': '1015683', + 'ext': 'flv', + 'title': 'Embracing the Dark Art of Mathematical Modeling in AI' + }, + 'params': { + 'skip_download': True, # Requires rtmpdump + } + }, + ] + + def _parse_mp4(self, xml_description): + video_formats = [] + mp4_video = xml_description.find('./metadata/mp4video') + if mp4_video is None: + return None + + mobj = re.match(r'(?P<root>https?://.*?/).*', mp4_video.text) + video_root = mobj.group('root') + formats = xml_description.findall('./metadata/MBRVideos/MBRVideo') + for format in formats: + mobj = re.match(r'mp4\:(?P<path>.*)', format.find('streamName').text) + url = video_root + mobj.group('path') + vbr = format.find('bitrate').text + video_formats.append({ + 'url': url, + 'vbr': int(vbr), + }) + return video_formats + + def _parse_flv(self, xml_description): + video_formats = [] + akami_url = xml_description.find('./metadata/akamaiHost').text + slide_video_path = xml_description.find('./metadata/slideVideo').text + video_formats.append({ + 'url': 'rtmp://' + akami_url + '/' + slide_video_path, + 'format_note': 'slide deck video', + 'quality': -2, + 'preference': -2, + 'format_id': 'slides', + }) + speaker_video_path = xml_description.find('./metadata/speakerVideo').text + video_formats.append({ + 'url': 'rtmp://' + akami_url + '/' + speaker_video_path, + 'format_note': 'speaker video', + 'quality': -1, + 'preference': -1, + 'format_id': 'speaker', + }) + return video_formats + + def _login(self, webpage_url, video_id): + (username, password) = self._get_login_info() + if username is None or password is None: + self.report_warning('It looks like ' + webpage_url + ' requires a login. Try specifying a username and password and try again.') + return None + + mobj = re.match(r'(?P<root_url>https?://.*?/).*', webpage_url) + login_url = mobj.group('root_url') + 'api/login.php' + logout_url = mobj.group('root_url') + 'logout' + + login_form = { + 'email': username, + 'password': password, + } + + request = compat_urllib_request.Request(login_url, compat_urllib_parse.urlencode(login_form)) + request.add_header('Content-Type', 'application/x-www-form-urlencoded') + self._download_webpage(request, video_id, 'Logging in') + start_page = self._download_webpage(webpage_url, video_id, 'Getting authenticated video page') + self._download_webpage(logout_url, video_id, 'Logging out') + + return start_page + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + webpage_url = 'http://www.gdcvault.com/play/' + video_id + start_page = self._download_webpage(webpage_url, video_id) + + xml_root = self._html_search_regex(r'<iframe src="(?P<xml_root>.*?)player.html.*?".*?</iframe>', start_page, 'xml root', None, False) + + if xml_root is None: + # Probably need to authenticate + start_page = self._login(webpage_url, video_id) + if start_page is None: + self.report_warning('Could not login.') + else: + # Grab the url from the authenticated page + xml_root = self._html_search_regex(r'<iframe src="(?P<xml_root>.*?)player.html.*?".*?</iframe>', start_page, 'xml root') + + xml_name = self._html_search_regex(r'<iframe src=".*?\?xml=(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename', None, False) + if xml_name is None: + # Fallback to the older format + xml_name = self._html_search_regex(r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename') + + xml_decription_url = xml_root + 'xml/' + xml_name + xml_description = self._download_xml(xml_decription_url, video_id) + + video_title = xml_description.find('./metadata/title').text + video_formats = self._parse_mp4(xml_description) + if video_formats is None: + video_formats = self._parse_flv(xml_description) + + return { + 'id': video_id, + 'title': video_title, + 'formats': video_formats, + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index e84c022a5..dd60bc418 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -83,10 +83,10 @@ class GenericIE(InfoExtractor): # Direct link to a video { 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', - 'file': 'trailer.mp4', 'md5': '67d406c2bcb6af27fa886f31aa934bbe', 'info_dict': { 'id': 'trailer', + 'ext': 'mp4', 'title': 'trailer', 'upload_date': '20100513', } @@ -94,7 +94,6 @@ class GenericIE(InfoExtractor): # ooyala video { 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', - 'file': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ.mp4', 'md5': '5644c6ca5d5782c1d0d350dad9bd840c', 'info_dict': { 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', @@ -102,6 +101,22 @@ class GenericIE(InfoExtractor): 'title': '2cc213299525360.mov', # that's what we get }, }, + # google redirect + { + 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', + 'info_dict': { + 'id': 'cmQHVoWB5FY', + 'ext': 'mp4', + 'upload_date': '20130224', + 'uploader_id': 'TheVerge', + 'description': 'Chris Ziegler takes a look at the Alcatel OneTouch Fire and the ZTE Open; two of the first Firefox OS handsets to be officially announced.', + 'uploader': 'The Verge', + 'title': 'First Firefox OS phones side-by-side', + }, + 'params': { + 'skip_download': False, + } + }, # embed.ly video { 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/', @@ -375,11 +390,17 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group(1), 'Mpora') - # Look for embedded Novamov player + # Look for embedded NovaMov player mobj = re.search( r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:(?:embed|www)\.)?novamov\.com/embed\.php.+?)\1', webpage) if mobj is not None: - return self.url_result(mobj.group('url'), 'Novamov') + return self.url_result(mobj.group('url'), 'NovaMov') + + # Look for embedded NowVideo player + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:(?:embed|www)\.)?nowvideo\.(?:ch|sx|eu)/embed\.php.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'NowVideo') # Look for embedded Facebook player mobj = re.search( @@ -387,6 +408,11 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url'), 'Facebook') + # Look for embedded VK player + mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'VK') + # Look for embedded Huffington Post player mobj = re.search( r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage) @@ -426,6 +452,18 @@ class GenericIE(InfoExtractor): # HTML5 video mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL) if mobj is None: + mobj = re.search( + r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")' + r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'([^\']+)\'"', + webpage) + if mobj: + new_url = mobj.group(1) + self.report_following_redirect(new_url) + return { + '_type': 'url', + 'url': new_url, + } + if mobj is None: raise ExtractorError('Unsupported URL: %s' % url) # It's possible that one of the regexes diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index dde482998..7956e7624 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -10,7 +10,7 @@ from ..utils import compat_urllib_request class IPrimaIE(InfoExtractor): - _VALID_URL = r'https?://play\.iprima\.cz/(?P<videogroup>.+)/(?P<videoid>.+)' + _VALID_URL = r'https?://play\.iprima\.cz/[^?#]+/(?P<id>[^?#]+)' _TESTS = [{ 'url': 'http://play.iprima.cz/particka/particka-92', @@ -22,20 +22,32 @@ class IPrimaIE(InfoExtractor): 'thumbnail': 'http://play.iprima.cz/sites/default/files/image_crops/image_620x349/3/491483_particka-92_image_620x349.jpg', }, 'params': { - 'skip_download': True, + 'skip_download': True, # requires rtmpdump }, - }, - ] + }, { + 'url': 'http://play.iprima.cz/particka/tchibo-particka-jarni-moda', + 'info_dict': { + 'id': '9718337', + 'ext': 'flv', + 'title': 'Tchibo Partička - Jarní móda', + 'description': 'md5:589f8f59f414220621ff8882eb3ce7be', + 'thumbnail': 're:^http:.*\.jpg$', + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('videoid') + video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - player_url = 'http://embed.livebox.cz/iprimaplay/player-embed-v2.js?__tok%s__=%s' % ( - floor(random()*1073741824), - floor(random()*1073741824)) + player_url = ( + 'http://embed.livebox.cz/iprimaplay/player-embed-v2.js?__tok%s__=%s' % + (floor(random()*1073741824), floor(random()*1073741824)) + ) req = compat_urllib_request.Request(player_url) req.add_header('Referer', url) @@ -44,18 +56,20 @@ class IPrimaIE(InfoExtractor): base_url = ''.join(re.findall(r"embed\['stream'\] = '(.+?)'.+'(\?auth=)'.+'(.+?)';", playerpage)[1]) zoneGEO = self._html_search_regex(r'"zoneGEO":(.+?),', webpage, 'zoneGEO') - if zoneGEO != '0': - base_url = base_url.replace('token', 'token_'+zoneGEO) + base_url = base_url.replace('token', 'token_' + zoneGEO) formats = [] for format_id in ['lq', 'hq', 'hd']: - filename = self._html_search_regex(r'"%s_id":(.+?),' % format_id, webpage, 'filename') + filename = self._html_search_regex( + r'"%s_id":(.+?),' % format_id, webpage, 'filename') if filename == 'null': continue - real_id = self._search_regex(r'Prima-[0-9]{10}-([0-9]+)_', filename, 'real video id') + real_id = self._search_regex( + r'Prima-(?:[0-9]{10}|WEB)-([0-9]+)[-_]', + filename, 'real video id') if format_id == 'lq': quality = 0 @@ -63,13 +77,13 @@ class IPrimaIE(InfoExtractor): quality = 1 elif format_id == 'hd': quality = 2 - filename = 'hq/'+filename + filename = 'hq/' + filename formats.append({ 'format_id': format_id, 'url': base_url, 'quality': quality, - 'play_path': 'mp4:'+filename.replace('"', '')[:-4], + 'play_path': 'mp4:' + filename.replace('"', '')[:-4], 'rtmp_live': True, 'ext': 'flv', }) diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py index 7b7185f9a..7a431a274 100644 --- a/youtube_dl/extractor/lifenews.py +++ b/youtube_dl/extractor/lifenews.py @@ -6,7 +6,8 @@ import re from .common import InfoExtractor from ..utils import ( int_or_none, - unified_strdate + unified_strdate, + ExtractorError, ) @@ -32,13 +33,11 @@ class LifeNewsIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - webpage = self._download_webpage('http://lifenews.ru/mobile/news/%s' % video_id, video_id, 'Downloading page') + webpage = self._download_webpage('http://lifenews.ru/news/%s' % video_id, video_id, 'Downloading page') - video_url = self._html_search_regex( - r'<video.*?src="([^"]+)".*?></video>', webpage, 'video URL') - - thumbnail = self._html_search_regex( - r'<video.*?poster="([^"]+)".*?"></video>', webpage, 'video thumbnail') + videos = re.findall(r'<video.*?poster="(?P<poster>[^"]+)".*?src="(?P<video>[^"]+)".*?></video>', webpage) + if not videos: + raise ExtractorError('No media links available for %s' % video_id) title = self._og_search_title(webpage) TITLE_SUFFIX = ' - Первый по срочным новостям — LIFE | NEWS' @@ -50,20 +49,26 @@ class LifeNewsIE(InfoExtractor): view_count = self._html_search_regex( r'<div class=\'views\'>(\d+)</div>', webpage, 'view count', fatal=False) comment_count = self._html_search_regex( - r'<div class=\'comments\'>(\d+)</div>', webpage, 'comment count', fatal=False) + r'<div class=\'comments\'>\s*<span class=\'counter\'>(\d+)</span>', webpage, 'comment count', fatal=False) upload_date = self._html_search_regex( r'<time datetime=\'([^\']+)\'>', webpage, 'upload date',fatal=False) if upload_date is not None: upload_date = unified_strdate(upload_date) - return { - 'id': video_id, - 'url': video_url, - 'thumbnail': thumbnail, - 'title': title, - 'description': description, - 'view_count': int_or_none(view_count), - 'comment_count': int_or_none(comment_count), - 'upload_date': upload_date, - }
\ No newline at end of file + def make_entry(video_id, media, video_number=None): + return { + 'id': video_id, + 'url': media[1], + 'thumbnail': media[0], + 'title': title if video_number is None else '%s-video%s' % (title, video_number), + 'description': description, + 'view_count': int_or_none(view_count), + 'comment_count': int_or_none(comment_count), + 'upload_date': upload_date, + } + + if len(videos) == 1: + return make_entry(video_id, videos[0]) + else: + return [make_entry(video_id, media, video_number+1) for video_number, media in enumerate(videos)]
\ No newline at end of file diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py new file mode 100644 index 000000000..f819c09b3 --- /dev/null +++ b/youtube_dl/extractor/mailru.py @@ -0,0 +1,66 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re +import datetime + +from .common import InfoExtractor + + +class MailRuIE(InfoExtractor): + IE_NAME = 'mailru' + IE_DESC = 'Видео@Mail.Ru' + _VALID_URL = r'http://(?:www\.)?my\.mail\.ru/video/.*#video=/?(?P<id>[^/]+/[^/]+/[^/]+/\d+)' + + _TEST = { + 'url': 'http://my.mail.ru/video/top#video=/mail/sonypicturesrus/75/76', + 'md5': 'dea205f03120046894db4ebb6159879a', + 'info_dict': { + 'id': '46301138', + 'ext': 'mp4', + 'title': 'Новый Человек-Паук. Высокое напряжение. Восстание Электро', + 'upload_date': '20140224', + 'uploader': 'sonypicturesrus', + 'uploader_id': 'sonypicturesrus@mail.ru', + 'duration': 184, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + video_data = self._download_json( + 'http://videoapi.my.mail.ru/videos/%s.json?new=1' % video_id, video_id, 'Downloading video JSON') + + author = video_data['author'] + uploader = author['name'] + uploader_id = author['id'] + + movie = video_data['movie'] + content_id = str(movie['contentId']) + title = movie['title'] + thumbnail = movie['poster'] + duration = movie['duration'] + + upload_date = datetime.datetime.fromtimestamp(video_data['timestamp']).strftime('%Y%m%d') + view_count = video_data['views_count'] + + formats = [ + { + 'url': video['url'], + 'format_id': video['name'], + } for video in video_data['videos'] + ] + + return { + 'id': content_id, + 'title': title, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + }
\ No newline at end of file diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index 99d3c83a5..301031197 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -166,6 +166,7 @@ class MetacafeIE(InfoExtractor): video_title = self._html_search_regex(r'(?im)<title>(.*) - Video</title>', webpage, u'title') description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) video_uploader = self._html_search_regex( r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);', webpage, u'uploader nickname', fatal=False) @@ -183,6 +184,7 @@ class MetacafeIE(InfoExtractor): 'uploader': video_uploader, 'upload_date': None, 'title': video_title, + 'thumbnail':thumbnail, 'ext': video_ext, 'age_limit': age_limit, } diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index 76b717fe5..807b1dc89 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -1,24 +1,30 @@ +from __future__ import unicode_literals + import re import json from .common import InfoExtractor +from .youtube import YoutubeIE from ..utils import ( + compat_urlparse, clean_html, + ExtractorError, get_element_by_id, ) class TechTVMITIE(InfoExtractor): - IE_NAME = u'techtv.mit.edu' + IE_NAME = 'techtv.mit.edu' _VALID_URL = r'https?://techtv\.mit\.edu/(videos|embeds)/(?P<id>\d+)' _TEST = { - u'url': u'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set', - u'file': u'25418.mp4', - u'md5': u'1f8cb3e170d41fd74add04d3c9330e5f', - u'info_dict': { - u'title': u'MIT DNA Learning Center Set', - u'description': u'md5:82313335e8a8a3f243351ba55bc1b474', + 'url': 'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set', + 'md5': '1f8cb3e170d41fd74add04d3c9330e5f', + 'info_dict': { + 'id': '25418', + 'ext': 'mp4', + 'title': 'MIT DNA Learning Center Set', + 'description': 'md5:82313335e8a8a3f243351ba55bc1b474', }, } @@ -27,12 +33,12 @@ class TechTVMITIE(InfoExtractor): video_id = mobj.group('id') raw_page = self._download_webpage( 'http://techtv.mit.edu/videos/%s' % video_id, video_id) - clean_page = re.compile(u'<!--.*?-->', re.S).sub(u'', raw_page) + clean_page = re.compile(r'<!--.*?-->', re.S).sub('', raw_page) - base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)', - raw_page, u'base url') - formats_json = self._search_regex(r'bitrates: (\[.+?\])', raw_page, - u'video formats') + base_url = self._search_regex( + r'ipadUrl: \'(.+?cloudfront.net/)', raw_page, 'base url') + formats_json = self._search_regex( + r'bitrates: (\[.+?\])', raw_page, 'video formats') formats_mit = json.loads(formats_json) formats = [ { @@ -48,28 +54,31 @@ class TechTVMITIE(InfoExtractor): title = get_element_by_id('edit-title', clean_page) description = clean_html(get_element_by_id('edit-description', clean_page)) - thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'', - raw_page, u'thumbnail', flags=re.DOTALL) + thumbnail = self._search_regex( + r'playlist:.*?url: \'(.+?)\'', + raw_page, 'thumbnail', flags=re.DOTALL) - return {'id': video_id, - 'title': title, - 'formats': formats, - 'description': description, - 'thumbnail': thumbnail, - } + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': description, + 'thumbnail': thumbnail, + } class MITIE(TechTVMITIE): - IE_NAME = u'video.mit.edu' + IE_NAME = 'video.mit.edu' _VALID_URL = r'https?://video\.mit\.edu/watch/(?P<title>[^/]+)' _TEST = { - u'url': u'http://video.mit.edu/watch/the-government-is-profiling-you-13222/', - u'file': u'21783.mp4', - u'md5': u'7db01d5ccc1895fc5010e9c9e13648da', - u'info_dict': { - u'title': u'The Government is Profiling You', - u'description': u'md5:ad5795fe1e1623b73620dbfd47df9afd', + 'url': 'http://video.mit.edu/watch/the-government-is-profiling-you-13222/', + 'md5': '7db01d5ccc1895fc5010e9c9e13648da', + 'info_dict': { + 'id': '21783', + 'ext': 'mp4', + 'title': 'The Government is Profiling You', + 'description': 'md5:ad5795fe1e1623b73620dbfd47df9afd', }, } @@ -77,7 +86,73 @@ class MITIE(TechTVMITIE): mobj = re.match(self._VALID_URL, url) page_title = mobj.group('title') webpage = self._download_webpage(url, page_title) - self.to_screen('%s: Extracting %s url' % (page_title, TechTVMITIE.IE_NAME)) - embed_url = self._search_regex(r'<iframe .*?src="(.+?)"', webpage, - u'embed url') + embed_url = self._search_regex( + r'<iframe .*?src="(.+?)"', webpage, 'embed url') return self.url_result(embed_url, ie='TechTVMIT') + + +class OCWMITIE(InfoExtractor): + IE_NAME = 'ocw.mit.edu' + _VALID_URL = r'^http://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)' + _BASE_URL = 'http://ocw.mit.edu/' + + _TESTS = [ + { + 'url': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/', + 'info_dict': { + 'id': 'EObHWIEKGjA', + 'ext': 'mp4', + 'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence', + 'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.', + #'subtitles': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/MIT6_041F11_lec07_300k.mp4.srt' + } + }, + { + 'url': 'http://ocw.mit.edu/courses/mathematics/18-01sc-single-variable-calculus-fall-2010/1.-differentiation/part-a-definition-and-basic-rules/session-1-introduction-to-derivatives/', + 'info_dict': { + 'id': '7K1sB05pE0A', + 'ext': 'mp4', + 'title': 'Session 1: Introduction to Derivatives', + 'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.', + #'subtitles': 'http://ocw.mit.edu//courses/mathematics/18-01sc-single-variable-calculus-fall-2010/ocw-18.01-f07-lec01_300k.SRT' + } + } + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + topic = mobj.group('topic') + + webpage = self._download_webpage(url, topic) + title = self._html_search_meta('WT.cg_s', webpage) + description = self._html_search_meta('Description', webpage) + + # search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, start, stop, captions_file) + embed_chapter_media = re.search(r'ocw_embed_chapter_media\((.+?)\)', webpage) + if embed_chapter_media: + metadata = re.sub(r'[\'"]', '', embed_chapter_media.group(1)) + metadata = re.split(r', ?', metadata) + yt = metadata[1] + subs = compat_urlparse.urljoin(self._BASE_URL, metadata[7]) + else: + # search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, captions_file) + embed_media = re.search(r'ocw_embed_media\((.+?)\)', webpage) + if embed_media: + metadata = re.sub(r'[\'"]', '', embed_media.group(1)) + metadata = re.split(r', ?', metadata) + yt = metadata[1] + subs = compat_urlparse.urljoin(self._BASE_URL, metadata[5]) + else: + raise ExtractorError('Unable to find embedded YouTube video.') + video_id = YoutubeIE.extract_id(yt) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'title': title, + 'description': description, + 'url': yt, + 'url_transparent' + 'subtitles': subs, + 'ie_key': 'Youtube', + } diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index f3356db50..c4bd53fe7 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -5,18 +5,20 @@ import re from .common import InfoExtractor from ..utils import ( unified_strdate, + compat_urllib_parse, ExtractorError, ) class MixcloudIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)' + _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/([^/]+)' IE_NAME = 'mixcloud' _TEST = { 'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/', - 'file': 'dholbach-cryptkeeper.mp3', 'info_dict': { + 'id': 'dholbach-cryptkeeper', + 'ext': 'mp3', 'title': 'Cryptkeeper', 'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', 'uploader': 'Daniel Holbach', @@ -45,7 +47,7 @@ class MixcloudIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) uploader = mobj.group(1) cloudcast_name = mobj.group(2) - track_id = '-'.join((uploader, cloudcast_name)) + track_id = compat_urllib_parse.unquote('-'.join((uploader, cloudcast_name))) webpage = self._download_webpage(url, track_id) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index e8bbfff7b..1a63ab56a 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -1,19 +1,46 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor from ..utils import find_xpath_attr, compat_str +class NBCIE(InfoExtractor): + _VALID_URL = r'http://www\.nbc\.com/[^/]+/video/[^/]+/(?P<id>n?\d+)' + + _TEST = { + 'url': 'http://www.nbc.com/chicago-fire/video/i-am-a-firefighter/2734188', + 'md5': '54d0fbc33e0b853a65d7b4de5c06d64e', + 'info_dict': { + 'id': 'u1RInQZRN7QJ', + 'ext': 'flv', + 'title': 'I Am a Firefighter', + 'description': 'An emergency puts Dawson\'sf irefighter skills to the ultimate test in this four-part digital series.', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + theplatform_url = self._search_regex('class="video-player video-player-full" data-mpx-url="(.*?)"', webpage, 'theplatform url') + if theplatform_url.startswith('//'): + theplatform_url = 'http:' + theplatform_url + return self.url_result(theplatform_url) + + class NBCNewsIE(InfoExtractor): _VALID_URL = r'https?://www\.nbcnews\.com/video/.+?/(?P<id>\d+)' _TEST = { - u'url': u'http://www.nbcnews.com/video/nbc-news/52753292', - u'file': u'52753292.flv', - u'md5': u'47abaac93c6eaf9ad37ee6c4463a5179', - u'info_dict': { - u'title': u'Crew emerges after four-month Mars food study', - u'description': u'md5:24e632ffac72b35f8b67a12d1b6ddfc1', + 'url': 'http://www.nbcnews.com/video/nbc-news/52753292', + 'md5': '47abaac93c6eaf9ad37ee6c4463a5179', + 'info_dict': { + 'id': '52753292', + 'ext': 'flv', + 'title': 'Crew emerges after four-month Mars food study', + 'description': 'md5:24e632ffac72b35f8b67a12d1b6ddfc1', }, } @@ -23,10 +50,11 @@ class NBCNewsIE(InfoExtractor): all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id) info = all_info.find('video') - return {'id': video_id, - 'title': info.find('headline').text, - 'ext': 'flv', - 'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text, - 'description': compat_str(info.find('caption').text), - 'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text, - } + return { + 'id': video_id, + 'title': info.find('headline').text, + 'ext': 'flv', + 'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text, + 'description': compat_str(info.find('caption').text), + 'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text, + } diff --git a/youtube_dl/extractor/normalboots.py b/youtube_dl/extractor/normalboots.py index 81b7855b0..25e71a56e 100644 --- a/youtube_dl/extractor/normalboots.py +++ b/youtube_dl/extractor/normalboots.py @@ -1,61 +1,51 @@ +# encoding: utf-8 +from __future__ import unicode_literals + import re from .common import InfoExtractor from ..utils import ( - ExtractorError, unified_strdate, ) + class NormalbootsIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?normalboots\.com/video/(?P<videoid>[0-9a-z-]*)/?$' + _VALID_URL = r'http://(?:www\.)?normalboots\.com/video/(?P<videoid>[0-9a-z-]*)/?$' _TEST = { - u'url': u'http://normalboots.com/video/home-alone-games-jontron/', - u'file': u'home-alone-games-jontron.mp4', - u'md5': u'8bf6de238915dd501105b44ef5f1e0f6', - u'info_dict': { - u'title': u'Home Alone Games - JonTron - NormalBoots', - u'description': u'Jon is late for Christmas. Typical. Thanks to: Paul Ritchey for Co-Writing/Filming: http://www.youtube.com/user/ContinueShow Michael Azzi for Christmas Intro Animation: http://michafrar.tumblr.com/ Jerrod Waters for Christmas Intro Music: http://www.youtube.com/user/xXJerryTerryXx Casey Ormond for \u2018Tense Battle Theme\u2019:\xa0http://www.youtube.com/Kiamet/', - u'uploader': u'JonTron', - u'upload_date': u'20140125', + 'url': 'http://normalboots.com/video/home-alone-games-jontron/', + 'md5': '8bf6de238915dd501105b44ef5f1e0f6', + 'info_dict': { + 'id': 'home-alone-games-jontron', + 'ext': 'mp4', + 'title': 'Home Alone Games - JonTron - NormalBoots', + 'description': 'Jon is late for Christmas. Typical. Thanks to: Paul Ritchey for Co-Writing/Filming: http://www.youtube.com/user/ContinueShow Michael Azzi for Christmas Intro Animation: http://michafrar.tumblr.com/ Jerrod Waters for Christmas Intro Music: http://www.youtube.com/user/xXJerryTerryXx Casey Ormond for ‘Tense Battle Theme’:\xa0http://www.youtube.com/Kiamet/', + 'uploader': 'JonTron', + 'upload_date': '20140125', } } - + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('videoid') - - info = { - 'id': video_id, - 'uploader': None, - 'upload_date': None, - } - - if url[:4] != 'http': - url = 'http://' + url - + webpage = self._download_webpage(url, video_id) - video_title = self._og_search_title(webpage) - video_description = self._og_search_description(webpage) - video_thumbnail = self._og_search_thumbnail(webpage) video_uploader = self._html_search_regex(r'Posted\sby\s<a\shref="[A-Za-z0-9/]*">(?P<uploader>[A-Za-z]*)\s</a>', webpage, 'uploader') - raw_upload_date = self._html_search_regex('<span style="text-transform:uppercase; font-size:inherit;">[A-Za-z]+, (?P<date>.*)</span>', + raw_upload_date = self._html_search_regex('<span style="text-transform:uppercase; font-size:inherit;">[A-Za-z]+, (?P<date>.*)</span>', webpage, 'date') video_upload_date = unified_strdate(raw_upload_date) - video_upload_date = unified_strdate(raw_upload_date) - + player_url = self._html_search_regex(r'<iframe\swidth="[0-9]+"\sheight="[0-9]+"\ssrc="(?P<url>[\S]+)"', webpage, 'url') player_page = self._download_webpage(player_url, video_id) - video_url = u'http://player.screenwavemedia.com/' + self._html_search_regex(r"'file':\s'(?P<file>[0-9A-Za-z-_\.]+)'", player_page, 'file') - - info['url'] = video_url - info['title'] = video_title - info['description'] = video_description - info['thumbnail'] = video_thumbnail - info['uploader'] = video_uploader - info['upload_date'] = video_upload_date - - return info + video_url = self._html_search_regex(r"file:\s'(?P<file>[^']+\.mp4)'", player_page, 'file') + + return { + 'id': video_id, + 'url': video_url, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'uploader': video_uploader, + 'upload_date': video_upload_date, + } diff --git a/youtube_dl/extractor/novamov.py b/youtube_dl/extractor/novamov.py index 6af8d934c..fd310e219 100644 --- a/youtube_dl/extractor/novamov.py +++ b/youtube_dl/extractor/novamov.py @@ -9,14 +9,25 @@ from ..utils import ( ) -class NovamovIE(InfoExtractor): - _VALID_URL = r'http://(?:(?:www\.)?novamov\.com/video/|(?:(?:embed|www)\.)novamov\.com/embed\.php\?v=)(?P<videoid>[a-z\d]{13})' +class NovaMovIE(InfoExtractor): + IE_NAME = 'novamov' + IE_DESC = 'NovaMov' + + _VALID_URL = r'http://(?:(?:www\.)?%(host)s/video/|(?:(?:embed|www)\.)%(host)s/embed\.php\?(?:.*?&)?v=)(?P<videoid>[a-z\d]{13})' % {'host': 'novamov\.com'} + + _HOST = 'www.novamov.com' + + _FILE_DELETED_REGEX = r'This file no longer exists on our servers!</h2>' + _FILEKEY_REGEX = r'flashvars\.filekey="(?P<filekey>[^"]+)";' + _TITLE_REGEX = r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>([^<]+)</h3>' + _DESCRIPTION_REGEX = r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>[^<]+</h3><p>([^<]+)</p>' _TEST = { 'url': 'http://www.novamov.com/video/4rurhn9x446jj', - 'file': '4rurhn9x446jj.flv', 'md5': '7205f346a52bbeba427603ba10d4b935', 'info_dict': { + 'id': '4rurhn9x446jj', + 'ext': 'flv', 'title': 'search engine optimization', 'description': 'search engine optimization is used to rank the web page in the google search engine' }, @@ -27,31 +38,26 @@ class NovamovIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') - page = self._download_webpage('http://www.novamov.com/video/%s' % video_id, - video_id, 'Downloading video page') + page = self._download_webpage( + 'http://%s/video/%s' % (self._HOST, video_id), video_id, 'Downloading video page') - if re.search(r'This file no longer exists on our servers!</h2>', page) is not None: + if re.search(self._FILE_DELETED_REGEX, page) is not None: raise ExtractorError(u'Video %s does not exist' % video_id, expected=True) - filekey = self._search_regex( - r'flashvars\.filekey="(?P<filekey>[^"]+)";', page, 'filekey') + filekey = self._search_regex(self._FILEKEY_REGEX, page, 'filekey') - title = self._html_search_regex( - r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>([^<]+)</h3>', - page, 'title', fatal=False) + title = self._html_search_regex(self._TITLE_REGEX, page, 'title', fatal=False) - description = self._html_search_regex( - r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>[^<]+</h3><p>([^<]+)</p>', - page, 'description', fatal=False) + description = self._html_search_regex(self._DESCRIPTION_REGEX, page, 'description', default='', fatal=False) api_response = self._download_webpage( - 'http://www.novamov.com/api/player.api.php?key=%s&file=%s' % (filekey, video_id), - video_id, 'Downloading video api response') + 'http://%s/api/player.api.php?key=%s&file=%s' % (self._HOST, filekey, video_id), video_id, + 'Downloading video api response') response = compat_urlparse.parse_qs(api_response) if 'error_msg' in response: - raise ExtractorError('novamov returned error: %s' % response['error_msg'][0], expected=True) + raise ExtractorError('%s returned error: %s' % (self.IE_NAME, response['error_msg'][0]), expected=True) video_url = response['url'][0] @@ -60,4 +66,4 @@ class NovamovIE(InfoExtractor): 'url': video_url, 'title': title, 'description': description - } + }
\ No newline at end of file diff --git a/youtube_dl/extractor/nowvideo.py b/youtube_dl/extractor/nowvideo.py index 168ca8b9f..dd665874d 100644 --- a/youtube_dl/extractor/nowvideo.py +++ b/youtube_dl/extractor/nowvideo.py @@ -1,46 +1,28 @@ -import re +from __future__ import unicode_literals -from .common import InfoExtractor -from ..utils import compat_urlparse +from .novamov import NovaMovIE -class NowVideoIE(InfoExtractor): - _VALID_URL = r'(?:https?://)?(?:www\.)?nowvideo\.(?:ch|sx)/video/(?P<id>\w+)' - _TEST = { - u'url': u'http://www.nowvideo.ch/video/0mw0yow7b6dxa', - u'file': u'0mw0yow7b6dxa.flv', - u'md5': u'f8fbbc8add72bd95b7850c6a02fc8817', - u'info_dict': { - u"title": u"youtubedl test video _BaW_jenozKc.mp4" - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - video_id = mobj.group('id') - webpage_url = 'http://www.nowvideo.ch/video/' + video_id - embed_url = 'http://embed.nowvideo.ch/embed.php?v=' + video_id - webpage = self._download_webpage(webpage_url, video_id) - embed_page = self._download_webpage(embed_url, video_id, - u'Downloading embed page') +class NowVideoIE(NovaMovIE): + IE_NAME = 'nowvideo' + IE_DESC = 'NowVideo' - self.report_extraction(video_id) + _VALID_URL = r'http://(?:(?:www\.)?%(host)s/video/|(?:(?:embed|www)\.)%(host)s/embed\.php\?(?:.*?&)?v=)(?P<videoid>[a-z\d]{13})' % {'host': 'nowvideo\.(?:ch|sx|eu)'} - video_title = self._html_search_regex(r'<h4>(.*)</h4>', - webpage, u'video title') + _HOST = 'www.nowvideo.ch' - video_key = self._search_regex(r'var fkzd="(.*)";', - embed_page, u'video key') + _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' + _FILEKEY_REGEX = r'var fkzd="([^"]+)";' + _TITLE_REGEX = r'<h4>([^<]+)</h4>' + _DESCRIPTION_REGEX = r'</h4>\s*<p>([^<]+)</p>' - api_call = "http://www.nowvideo.ch/api/player.api.php?file={0}&numOfErrors=0&cid=1&key={1}".format(video_id, video_key) - api_response = self._download_webpage(api_call, video_id, - u'Downloading API page') - video_url = compat_urlparse.parse_qs(api_response)[u'url'][0] - - return [{ - 'id': video_id, - 'url': video_url, - 'ext': 'flv', - 'title': video_title, - }] + _TEST = { + 'url': 'http://www.nowvideo.ch/video/0mw0yow7b6dxa', + 'md5': 'f8fbbc8add72bd95b7850c6a02fc8817', + 'info_dict': { + 'id': '0mw0yow7b6dxa', + 'ext': 'flv', + 'title': 'youtubedl test video _BaW_jenozKc.mp4', + 'description': 'Description', + } + }
\ No newline at end of file diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 5f5694393..03421d1d5 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -8,6 +8,7 @@ from .common import InfoExtractor from ..utils import ( HEADRequest, unified_strdate, + ExtractorError, ) @@ -35,7 +36,15 @@ class ORFIE(InfoExtractor): data_json = self._search_regex( r'initializeAdworx\((.+?)\);\n', webpage, 'video info') all_data = json.loads(data_json) - sdata = all_data[0]['values']['segments'] + + def get_segments(all_data): + for data in all_data: + if data['name'] == 'Tracker::EPISODE_DETAIL_PAGE_OVER_PROGRAM': + return data['values']['segments'] + + sdata = get_segments(all_data) + if not sdata: + raise ExtractorError('Unable to extract segments') def quality_to_int(s): m = re.search('([0-9]+)', s) diff --git a/youtube_dl/extractor/podomatic.py b/youtube_dl/extractor/podomatic.py index 58200971b..19ad45c98 100644 --- a/youtube_dl/extractor/podomatic.py +++ b/youtube_dl/extractor/podomatic.py @@ -1,7 +1,10 @@ +from __future__ import unicode_literals + import json import re from .common import InfoExtractor +from ..utils import int_or_none class PodomaticIE(InfoExtractor): @@ -9,14 +12,14 @@ class PodomaticIE(InfoExtractor): _VALID_URL = r'^(?P<proto>https?)://(?P<channel>[^.]+)\.podomatic\.com/entry/(?P<id>[^?]+)' _TEST = { - u"url": u"http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00", - u"file": u"2009-01-02T16_03_35-08_00.mp3", - u"md5": u"84bb855fcf3429e6bf72460e1eed782d", - u"info_dict": { - u"uploader": u"Science Teaching Tips", - u"uploader_id": u"scienceteachingtips", - u"title": u"64. When the Moon Hits Your Eye", - u"duration": 446, + "url": "http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00", + "file": "2009-01-02T16_03_35-08_00.mp3", + "md5": "84bb855fcf3429e6bf72460e1eed782d", + "info_dict": { + "uploader": "Science Teaching Tips", + "uploader_id": "scienceteachingtips", + "title": "64. When the Moon Hits Your Eye", + "duration": 446, } } @@ -36,7 +39,7 @@ class PodomaticIE(InfoExtractor): uploader = data['podcast'] title = data['title'] thumbnail = data['imageLocation'] - duration = int(data['length'] / 1000.0) + duration = int_or_none(data.get('length'), 1000) return { 'id': video_id, diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py new file mode 100644 index 000000000..3f585bebf --- /dev/null +++ b/youtube_dl/extractor/prosiebensat1.py @@ -0,0 +1,297 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from hashlib import sha1 +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + unified_strdate, + clean_html, + RegexNotFoundError, +) + + +class ProSiebenSat1IE(InfoExtractor): + IE_NAME = 'prosiebensat1' + IE_DESC = 'ProSiebenSat.1 Digital' + _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|ran|the-voice-of-germany)\.de|fem\.com)/(?P<id>.+)' + + _TESTS = [ + { + 'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge', + 'info_dict': { + 'id': '2104602', + 'ext': 'mp4', + 'title': 'Staffel 2, Episode 18 - Jahresrückblick', + 'description': 'md5:8733c81b702ea472e069bc48bb658fc1', + 'upload_date': '20131231', + 'duration': 5845.04, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.prosieben.de/videokatalog/Gesellschaft/Leben/Trends/video-Lady-Umstyling-f%C3%BCr-Audrina-Rebekka-Audrina-Fergen-billig-aussehen-Battal-Modica-700544.html', + 'info_dict': { + 'id': '2570327', + 'ext': 'mp4', + 'title': 'Lady-Umstyling für Audrina', + 'description': 'md5:4c16d0c17a3461a0d43ea4084e96319d', + 'upload_date': '20131014', + 'duration': 606.76, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Seems to be broken', + }, + { + 'url': 'http://www.prosiebenmaxx.de/tv/experience/video/144-countdown-fuer-die-autowerkstatt-ganze-folge', + 'info_dict': { + 'id': '2429369', + 'ext': 'mp4', + 'title': 'Countdown für die Autowerkstatt', + 'description': 'md5:809fc051a457b5d8666013bc40698817', + 'upload_date': '20140223', + 'duration': 2595.04, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.sixx.de/stars-style/video/sexy-laufen-in-ugg-boots-clip', + 'info_dict': { + 'id': '2904997', + 'ext': 'mp4', + 'title': 'Sexy laufen in Ugg Boots', + 'description': 'md5:edf42b8bd5bc4e5da4db4222c5acb7d6', + 'upload_date': '20140122', + 'duration': 245.32, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.sat1.de/film/der-ruecktritt/video/im-interview-kai-wiesinger-clip', + 'info_dict': { + 'id': '2906572', + 'ext': 'mp4', + 'title': 'Im Interview: Kai Wiesinger', + 'description': 'md5:e4e5370652ec63b95023e914190b4eb9', + 'upload_date': '20140225', + 'duration': 522.56, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.kabeleins.de/tv/rosins-restaurants/videos/jagd-auf-fertigkost-im-elsthal-teil-2-ganze-folge', + 'info_dict': { + 'id': '2992323', + 'ext': 'mp4', + 'title': 'Jagd auf Fertigkost im Elsthal - Teil 2', + 'description': 'md5:2669cde3febe9bce13904f701e774eb6', + 'upload_date': '20140225', + 'duration': 2410.44, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.ran.de/fussball/bundesliga/video/schalke-toennies-moechte-raul-zurueck-ganze-folge', + 'info_dict': { + 'id': '3004256', + 'ext': 'mp4', + 'title': 'Schalke: Tönnies möchte Raul zurück', + 'description': 'md5:4b5b271d9bcde223b54390754c8ece3f', + 'upload_date': '20140226', + 'duration': 228.96, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip', + 'info_dict': { + 'id': '2572814', + 'ext': 'mp4', + 'title': 'Andreas Kümmert: Rocket Man', + 'description': 'md5:6ddb02b0781c6adf778afea606652e38', + 'upload_date': '20131017', + 'duration': 469.88, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.fem.com/wellness/videos/wellness-video-clip-kurztripps-zum-valentinstag.html', + 'info_dict': { + 'id': '2156342', + 'ext': 'mp4', + 'title': 'Kurztrips zum Valentinstag', + 'description': 'md5:8ba6301e70351ae0bedf8da00f7ba528', + 'upload_date': '20130206', + 'duration': 307.24, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + ] + + _CLIPID_REGEXES = [ + r'"clip_id"\s*:\s+"(\d+)"', + r'clipid: "(\d+)"', + ] + _TITLE_REGEXES = [ + r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>', + r'<header class="clearfix">\s*<h3>(.+?)</h3>', + r'<!-- start video -->\s*<h1>(.+?)</h1>', + r'<div class="ep-femvideos-pi4-video-txt">\s*<h2>(.+?)</h2>', + ] + _DESCRIPTION_REGEXES = [ + r'<p itemprop="description">\s*(.+?)</p>', + r'<div class="videoDecription">\s*<p><strong>Beschreibung</strong>: (.+?)</p>', + r'<div class="g-plusone" data-size="medium"></div>\s*</div>\s*</header>\s*(.+?)\s*<footer>', + r'<p>(.+?)</p>\s*<div class="ep-femvideos-pi4-video-footer">', + ] + _UPLOAD_DATE_REGEXES = [ + r'<meta property="og:published_time" content="(.+?)">', + r'<span>\s*(\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}) \|\s*<span itemprop="duration"', + r'<footer>\s*(\d{2}\.\d{2}\.\d{4}) \d{2}:\d{2} Uhr', + r'<span style="padding-left: 4px;line-height:20px; color:#404040">(\d{2}\.\d{2}\.\d{4})</span>', + r'(\d{2}\.\d{2}\.\d{4}) \| \d{2}:\d{2} Min<br/>', + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + page = self._download_webpage(url, video_id, 'Downloading page') + + def extract(patterns, name, page, fatal=False): + for pattern in patterns: + mobj = re.search(pattern, page) + if mobj: + return clean_html(mobj.group(1)) + if fatal: + raise RegexNotFoundError(u'Unable to extract %s' % name) + return None + + clip_id = extract(self._CLIPID_REGEXES, 'clip id', page, fatal=True) + + access_token = 'testclient' + client_name = 'kolibri-1.2.5' + client_location = url + + videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse.urlencode({ + 'access_token': access_token, + 'client_location': client_location, + 'client_name': client_name, + 'ids': clip_id, + }) + + videos = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON') + + duration = float(videos[0]['duration']) + source_ids = [source['id'] for source in videos[0]['sources']] + source_ids_str = ','.join(map(str, source_ids)) + + g = '01!8d8F_)r9]4s[qeuXfP%' + + client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name]) + .encode('utf-8')).hexdigest() + + sources_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources?%s' % (clip_id, compat_urllib_parse.urlencode({ + 'access_token': access_token, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': client_name, + })) + + sources = self._download_json(sources_api_url, clip_id, 'Downloading sources JSON') + server_id = sources['server_id'] + + client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id, + client_location, source_ids_str, g, client_name]) + .encode('utf-8')).hexdigest() + + url_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url?%s' % (clip_id, compat_urllib_parse.urlencode({ + 'access_token': access_token, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': client_name, + 'server_id': server_id, + 'source_ids': source_ids_str, + })) + + urls = self._download_json(url_api_url, clip_id, 'Downloading urls JSON') + + title = extract(self._TITLE_REGEXES, 'title', page, fatal=True) + description = extract(self._DESCRIPTION_REGEXES, 'description', page) + thumbnail = self._og_search_thumbnail(page) + + upload_date = extract(self._UPLOAD_DATE_REGEXES, 'upload date', page) + if upload_date: + upload_date = unified_strdate(upload_date) + + formats = [] + + urls_sources = urls['sources'] + if isinstance(urls_sources, dict): + urls_sources = urls_sources.values() + + def fix_bitrate(bitrate): + return bitrate / 1000 if bitrate % 1000 == 0 else bitrate + + for source in urls_sources: + protocol = source['protocol'] + if protocol == 'rtmp' or protocol == 'rtmpe': + mobj = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', source['url']) + if not mobj: + continue + formats.append({ + 'url': mobj.group('url'), + 'app': mobj.group('app'), + 'play_path': mobj.group('playpath'), + 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', + 'page_url': 'http://www.prosieben.de', + 'vbr': fix_bitrate(source['bitrate']), + 'ext': 'mp4', + 'format_id': '%s_%s' % (source['cdn'], source['bitrate']), + }) + else: + formats.append({ + 'url': source['url'], + 'vbr': fix_bitrate(source['bitrate']), + }) + + self._sort_formats(formats) + + return { + 'id': clip_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + 'formats': formats, + }
\ No newline at end of file diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py index cd50f708d..4835ec5ec 100644 --- a/youtube_dl/extractor/rtlnow.py +++ b/youtube_dl/extractor/rtlnow.py @@ -1,148 +1,165 @@ # encoding: utf-8 - from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( - clean_html, ExtractorError, + clean_html, + unified_strdate, + int_or_none, ) class RTLnowIE(InfoExtractor): """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW""" - _VALID_URL = r'(?:http://)?(?P<url>(?P<domain>rtl-now\.rtl\.de|rtl2now\.rtl2\.de|(?:www\.)?voxnow\.de|(?:www\.)?rtlnitronow\.de|(?:www\.)?superrtlnow\.de|(?:www\.)?n-tvnow\.de)/+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' - _TESTS = [{ - 'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1', - 'file': '90419.flv', - 'info_dict': { - 'upload_date': '20070416', - 'title': 'Ahornallee - Folge 1 - Der Einzug', - 'description': 'Folge 1 - Der Einzug', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Only works from Germany', - }, - { - 'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5', - 'file': '69756.flv', - 'info_dict': { - 'upload_date': '20120519', - 'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit...', - 'description': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.', - 'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Only works from Germany', - }, - { - 'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17', - 'file': '13883.flv', - 'info_dict': { - 'upload_date': '20090627', - 'title': 'Voxtours - Südafrika-Reporter II', - 'description': 'Südafrika-Reporter II', - }, - 'params': { - 'skip_download': True, + _VALID_URL = r'''(?x) + (?:https?://)? + (?P<url> + (?P<domain> + rtl-now\.rtl\.de| + rtl2now\.rtl2\.de| + (?:www\.)?voxnow\.de| + (?:www\.)?rtlnitronow\.de| + (?:www\.)?superrtlnow\.de| + (?:www\.)?n-tvnow\.de) + /+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\? + (?:container_id|film_id)=(?P<video_id>[0-9]+)& + player=1(?:&season=[0-9]+)?(?:&.*)? + )''' + + _TESTS = [ + { + 'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1', + 'info_dict': { + 'id': '90419', + 'ext': 'flv', + 'title': 'Ahornallee - Folge 1 - Der Einzug', + 'description': 'md5:ce843b6b5901d9a7f7d04d1bbcdb12de', + 'upload_date': '20070416', + 'duration': 1685, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Only works from Germany', }, - }, - { - 'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1', - 'file': '99205.flv', - 'info_dict': { - 'upload_date': '20080928', - 'title': 'Medicopter 117 - Angst!', - 'description': 'Angst!', - 'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg' + { + 'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5', + 'info_dict': { + 'id': '69756', + 'ext': 'flv', + 'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.', + 'description': 'md5:3fb247005ed21a935ffc82b7dfa70cf0', + 'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg', + 'upload_date': '20120519', + 'duration': 1245, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Only works from Germany', }, - 'params': { - 'skip_download': True, + { + 'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17', + 'info_dict': { + 'id': '13883', + 'ext': 'flv', + 'title': 'Voxtours - Südafrika-Reporter II', + 'description': 'md5:de7f8d56be6fd4fed10f10f57786db00', + 'upload_date': '20090627', + 'duration': 1800, + }, + 'params': { + 'skip_download': True, + }, }, - }, - { - 'url': 'http://www.n-tvnow.de/top-gear/episode-1-2013-01-01-00-00-00.php?film_id=124903&player=1&season=10', - 'file': '124903.flv', - 'info_dict': { - 'upload_date': '20130101', - 'title': 'Top Gear vom 01.01.2013', - 'description': 'Episode 1', + { + 'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1', + 'info_dict': { + 'id': '99205', + 'ext': 'flv', + 'title': 'Medicopter 117 - Angst!', + 'description': 'md5:895b1df01639b5f61a04fc305a5cb94d', + 'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg', + 'upload_date': '20080928', + 'duration': 2691, + }, + 'params': { + 'skip_download': True, + }, }, - 'params': { - 'skip_download': True, + { + 'url': 'http://www.n-tvnow.de/deluxe-alles-was-spass-macht/thema-ua-luxushotel-fuer-vierbeiner.php?container_id=153819&player=1&season=0', + 'info_dict': { + 'id': '153819', + 'ext': 'flv', + 'title': 'Deluxe - Alles was Spaß macht - Thema u.a.: Luxushotel für Vierbeiner', + 'description': 'md5:c3705e1bb32e1a5b2bcd634fc065c631', + 'thumbnail': 'http://autoimg.static-fra.de/ntvnow/383157/1500x1500/image2.jpg', + 'upload_date': '20140221', + 'duration': 2429, + }, + 'skip': 'Only works from Germany', }, - 'skip': 'Only works from Germany', - }] + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - - webpage_url = 'http://' + mobj.group('url') - video_page_url = 'http://' + mobj.group('domain') + '/' + video_page_url = 'http://%s/' % mobj.group('domain') video_id = mobj.group('video_id') - webpage = self._download_webpage(webpage_url, video_id) + webpage = self._download_webpage('http://' + mobj.group('url'), video_id) - note_m = re.search(r'''(?sx) - <div[ ]style="margin-left:[ ]20px;[ ]font-size:[ ]13px;">(.*?) - <div[ ]id="playerteaser">''', webpage) - if note_m: - msg = clean_html(note_m.group(1)) - raise ExtractorError(msg) + mobj = re.search(r'(?s)<div style="margin-left: 20px; font-size: 13px;">(.*?)<div id="playerteaser">', webpage) + if mobj: + raise ExtractorError(clean_html(mobj.group(1)), expected=True) - video_title = self._html_search_regex( - r'<title>(?P<title>[^<]+?)( \| [^<]*)?</title>', - webpage, 'title') - playerdata_url = self._html_search_regex( - r'\'playerdata\': \'(?P<playerdata_url>[^\']+)\'', - webpage, 'playerdata_url') + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage, default=None) - playerdata = self._download_webpage(playerdata_url, video_id) - mobj = re.search(r'<title><!\[CDATA\[(?P<description>.+?)(?:\s+- (?:Sendung )?vom (?P<upload_date_d>[0-9]{2})\.(?P<upload_date_m>[0-9]{2})\.(?:(?P<upload_date_Y>[0-9]{4})|(?P<upload_date_y>[0-9]{2})) [0-9]{2}:[0-9]{2} Uhr)?\]\]></title>', playerdata) - if mobj: - video_description = mobj.group('description') - if mobj.group('upload_date_Y'): - video_upload_date = mobj.group('upload_date_Y') - elif mobj.group('upload_date_y'): - video_upload_date = '20' + mobj.group('upload_date_y') - else: - video_upload_date = None - if video_upload_date: - video_upload_date += mobj.group('upload_date_m') + mobj.group('upload_date_d') - else: - video_description = None - video_upload_date = None - self._downloader.report_warning('Unable to extract description and upload date') + upload_date = unified_strdate(self._html_search_meta('uploadDate', webpage, 'upload date')) - # Thumbnail: not every video has an thumbnail - mobj = re.search(r'<meta property="og:image" content="(?P<thumbnail>[^"]+)">', webpage) - if mobj: - video_thumbnail = mobj.group('thumbnail') - else: - video_thumbnail = None + mobj = re.search(r'<meta itemprop="duration" content="PT(?P<seconds>\d+)S" />', webpage) + duration = int(mobj.group('seconds')) if mobj else None - mobj = re.search(r'<filename [^>]+><!\[CDATA\[(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>[^\]]+)\]\]></filename>', playerdata) - if mobj is None: - raise ExtractorError('Unable to extract media URL') - video_url = mobj.group('url') - video_play_path = 'mp4:' + mobj.group('play_path') - video_player_url = video_page_url + 'includes/vodplayer.swf' + playerdata_url = self._html_search_regex( + r"'playerdata': '(?P<playerdata_url>[^']+)'", webpage, 'playerdata_url') + + playerdata = self._download_xml(playerdata_url, video_id, 'Downloading player data XML') + + videoinfo = playerdata.find('./playlist/videoinfo') + + formats = [] + for filename in videoinfo.findall('filename'): + mobj = re.search(r'(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>.+)', filename.text) + if mobj: + fmt = { + 'url': mobj.group('url'), + 'play_path': 'mp4:' + mobj.group('play_path'), + 'page_url': video_page_url, + 'player_url': video_page_url + 'includes/vodplayer.swf', + } + else: + fmt = { + 'url': filename.text, + } + fmt.update({ + 'width': int_or_none(filename.get('width')), + 'height': int_or_none(filename.get('height')), + 'vbr': int_or_none(filename.get('bitrate')), + 'ext': 'flv', + }) + formats.append(fmt) return { 'id': video_id, - 'url': video_url, - 'play_path': video_play_path, - 'page_url': video_page_url, - 'player_url': video_player_url, - 'ext': 'flv', - 'title': video_title, - 'description': video_description, - 'upload_date': video_upload_date, - 'thumbnail': video_thumbnail, - } + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + 'formats': formats, + }
\ No newline at end of file diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 393b5f17c..1cc0dcb15 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -217,7 +217,7 @@ class SoundcloudIE(InfoExtractor): return self._extract_info_dict(info, full_title, secret_token=token) class SoundcloudSetIE(SoundcloudIE): - _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$' + _VALID_URL = r'https?://(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)' IE_NAME = 'soundcloud:set' # it's in tests/test_playlists.py _TESTS = [] diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 8b31caa92..b9e65447f 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -6,115 +6,111 @@ import re from .subtitles import SubtitlesInfoExtractor from ..utils import ( + compat_str, RegexNotFoundError, ) class TEDIE(SubtitlesInfoExtractor): - _VALID_URL=r'''http://www\.ted\.com/ - ( - ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist - | - ((?P<type_talk>talks)) # We have a simple talk - ) - (/lang/(.*?))? # The url may contain the language - /(?P<name>\w+) # Here goes the name and then ".html" - ''' + _VALID_URL = r'''(?x)http://www\.ted\.com/ + ( + (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist + | + ((?P<type_talk>talks)) # We have a simple talk + ) + (/lang/(.*?))? # The url may contain the language + /(?P<name>\w+) # Here goes the name and then ".html" + ''' _TEST = { 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html', 'file': '102.mp4', 'md5': '4ea1dada91e4174b53dac2bb8ace429d', 'info_dict': { - "description": "md5:c6fa72e6eedbd938c9caf6b2702f5922", - "title": "Dan Dennett: The illusion of consciousness" + 'title': 'The illusion of consciousness', + 'description': ('Philosopher Dan Dennett makes a compelling ' + 'argument that not only don\'t we understand our own ' + 'consciousness, but that half the time our brains are ' + 'actively fooling us.'), + 'uploader': 'Dan Dennett', } } - @classmethod - def suitable(cls, url): - """Receives a URL and returns True if suitable for this IE.""" - return re.match(cls._VALID_URL, url, re.VERBOSE) is not None + _FORMATS_PREFERENCE = { + 'low': 1, + 'medium': 2, + 'high': 3, + } + + def _extract_info(self, webpage): + info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>', + webpage, 'info json') + return json.loads(info_json) def _real_extract(self, url): - m=re.match(self._VALID_URL, url, re.VERBOSE) + m = re.match(self._VALID_URL, url, re.VERBOSE) + name = m.group('name') if m.group('type_talk'): - return self._talk_info(url) - else : - playlist_id=m.group('playlist_id') - name=m.group('name') - self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name)) - return [self._playlist_videos_info(url,name,playlist_id)] + return self._talk_info(url, name) + else: + return self._playlist_videos_info(url, name) - - def _playlist_videos_info(self, url, name, playlist_id): + def _playlist_videos_info(self, url, name): '''Returns the videos of the playlist''' - webpage = self._download_webpage( - url, playlist_id, 'Downloading playlist webpage') - matches = re.finditer( - r'<p\s+class="talk-title[^"]*"><a\s+href="(?P<talk_url>/talks/[^"]+\.html)">[^<]*</a></p>', - webpage) - - playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>', - webpage, 'playlist title') + webpage = self._download_webpage(url, name, + 'Downloading playlist webpage') + info = self._extract_info(webpage) + playlist_info = info['playlist'] playlist_entries = [ - self.url_result(u'http://www.ted.com' + m.group('talk_url'), 'TED') - for m in matches + self.url_result(u'http://www.ted.com/talks/' + talk['slug'], self.ie_key()) + for talk in info['talks'] ] return self.playlist_result( - playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title) + playlist_entries, + playlist_id=compat_str(playlist_info['id']), + playlist_title=playlist_info['title']) - def _talk_info(self, url, video_id=0): - """Return the video for the talk in the url""" - m = re.match(self._VALID_URL, url,re.VERBOSE) - video_name = m.group('name') - webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name) + def _talk_info(self, url, video_name): + webpage = self._download_webpage(url, video_name) self.report_extraction(video_name) - # If the url includes the language we get the title translated - title = self._html_search_regex(r'<span .*?id="altHeadline".+?>(?P<title>.*)</span>', - webpage, 'title') - json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>', - webpage, 'json data') - info = json.loads(json_data) - desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>', - webpage, 'description', flags = re.DOTALL) - - thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"', - webpage, 'thumbnail') - formats = [{ - 'ext': 'mp4', - 'url': stream['file'], - 'format': stream['id'] - } for stream in info['htmlStreams']] - video_id = info['id'] + talk_info = self._extract_info(webpage)['talks'][0] + formats = [{ + 'ext': 'mp4', + 'url': format_url, + 'format_id': format_id, + 'format': format_id, + 'preference': self._FORMATS_PREFERENCE.get(format_id, -1), + } for (format_id, format_url) in talk_info['nativeDownloads'].items()] + self._sort_formats(formats) + + video_id = talk_info['id'] # subtitles - video_subtitles = self.extract_subtitles(video_id, webpage) + video_subtitles = self.extract_subtitles(video_id, talk_info) if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, webpage) + self._list_available_subtitles(video_id, talk_info) return return { 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'description': desc, + 'title': talk_info['title'], + 'uploader': talk_info['speaker'], + 'thumbnail': talk_info['thumb'], + 'description': self._og_search_description(webpage), 'subtitles': video_subtitles, 'formats': formats, } - def _get_available_subtitles(self, video_id, webpage): - try: - options = self._search_regex(r'(?:<select name="subtitles_language_select" id="subtitles_language_select">)(.*?)(?:</select>)', webpage, 'subtitles_language_select', flags=re.DOTALL) - languages = re.findall(r'(?:<option value=")(\S+)"', options) - if languages: - sub_lang_list = {} - for l in languages: - url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l) - sub_lang_list[l] = url - return sub_lang_list - except RegexNotFoundError: + def _get_available_subtitles(self, video_id, talk_info): + languages = [lang['languageCode'] for lang in talk_info.get('languages', [])] + if languages: + sub_lang_list = {} + for l in languages: + url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l) + sub_lang_list[l] = url + return sub_lang_list + else: self._downloader.report_warning(u'video doesn\'t have subtitles') - return {} + return {} diff --git a/youtube_dl/extractor/testurl.py b/youtube_dl/extractor/testurl.py index bdc6e2064..c7d559315 100644 --- a/youtube_dl/extractor/testurl.py +++ b/youtube_dl/extractor/testurl.py @@ -39,6 +39,8 @@ class TestURLIE(InfoExtractor): ('Found multiple matching extractors: %s' % ' '.join(ie.IE_NAME for ie in matching_extractors)), expected=True) + else: + extractor = matching_extractors[0] num_str = mobj.group('num') num = int(num_str) if num_str else 0 diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index d60702325..91f2453eb 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -13,7 +13,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language class ThePlatformIE(InfoExtractor): _VALID_URL = r'''(?x) (?:https?://(?:link|player)\.theplatform\.com/[sp]/[^/]+/ - (?P<config>[^/\?]+/(?:swf|config)/select/)? + (?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/)? |theplatform:)(?P<id>[^/\?&]+)''' _TEST = { @@ -54,10 +54,15 @@ class ThePlatformIE(InfoExtractor): f4m_node = body.find(_x('smil:seq/smil:video')) if f4m_node is not None: + f4m_url = f4m_node.attrib['src'] + if 'manifest.f4m?' not in f4m_url: + f4m_url += '?' + # the parameters are from syfy.com, other sites may use others, + # they also work for nbc.com + f4m_url += '&g=UXWGVKRWHFSP&hdcore=3.0.3' formats = [{ 'ext': 'flv', - # the parameters are from syfy.com, other sites may use others - 'url': f4m_node.attrib['src'] + '?g=UXWGVKRWHFSP&hdcore=3.0.3', + 'url': f4m_url, }] else: base_url = head.find(_x('smil:meta')).attrib['base'] @@ -95,9 +100,10 @@ class ThePlatformIE(InfoExtractor): if mobj.group('config'): config_url = url+ '&form=json' config_url = config_url.replace('swf/', 'config/') + config_url = config_url.replace('onsite/', 'onsite/config/') config_json = self._download_webpage(config_url, video_id, u'Downloading config') config = json.loads(config_json) - smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4' + smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4&manifest=f4m' else: smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?' 'format=smil&mbr=true'.format(video_id)) diff --git a/youtube_dl/extractor/tinypic.py b/youtube_dl/extractor/tinypic.py index 2246d27b2..a4aa25f66 100644 --- a/youtube_dl/extractor/tinypic.py +++ b/youtube_dl/extractor/tinypic.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from youtube_dl.utils import ExtractorError +from ..utils import ExtractorError class TinyPicIE(InfoExtractor): diff --git a/youtube_dl/extractor/trutube.py b/youtube_dl/extractor/trutube.py new file mode 100644 index 000000000..57f956683 --- /dev/null +++ b/youtube_dl/extractor/trutube.py @@ -0,0 +1,44 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class TruTubeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?trutube\.tv/video/(?P<id>[0-9]+)/.*' + _TEST = { + 'url': 'http://trutube.tv/video/14880/Ramses-II-Proven-To-Be-A-Red-Headed-Caucasoid-', + 'md5': 'c5b6e301b0a2040b074746cbeaa26ca1', + 'info_dict': { + 'id': '14880', + 'ext': 'flv', + 'title': 'Ramses II - Proven To Be A Red Headed Caucasoid', + 'thumbnail': 're:^http:.*\.jpg$', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + video_title = self._og_search_title(webpage).strip() + thumbnail = self._search_regex( + r"var splash_img = '([^']+)';", webpage, 'thumbnail', fatal=False) + + all_formats = re.finditer( + r"var (?P<key>[a-z]+)_video_file\s*=\s*'(?P<url>[^']+)';", webpage) + formats = [{ + 'format_id': m.group('key'), + 'quality': -i, + 'url': m.group('url'), + } for i, m in enumerate(all_formats)] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_title, + 'formats': formats, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py new file mode 100644 index 000000000..0921cc5f8 --- /dev/null +++ b/youtube_dl/extractor/tvigle.py @@ -0,0 +1,84 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + unified_strdate, + clean_html, + int_or_none, +) + + +class TvigleIE(InfoExtractor): + IE_NAME = 'tvigle' + IE_DESC = 'Интернет-телевидение Tvigle.ru' + _VALID_URL = r'http://(?:www\.)?tvigle\.ru/category/.+?[\?&]v(?:ideo)?=(?P<id>\d+)' + + _TESTS = [ + { + 'url': 'http://www.tvigle.ru/category/cinema/1608/?video=503081', + 'md5': '09afba4616666249f087efc6dcf83cb3', + 'info_dict': { + 'id': '503081', + 'ext': 'flv', + 'title': 'Брат 2 ', + 'description': 'md5:f5a42970f50648cee3d7ad740f3ae769', + 'upload_date': '20110919', + }, + }, + { + 'url': 'http://www.tvigle.ru/category/men/vysotskiy_vospominaniya02/?flt=196&v=676433', + 'md5': 'e7efe5350dd5011d0de6550b53c3ba7b', + 'info_dict': { + 'id': '676433', + 'ext': 'flv', + 'title': 'Ведущий телепрограммы «60 минут» (США) о Владимире Высоцком', + 'description': 'md5:027f7dc872948f14c96d19b4178428a4', + 'upload_date': '20121218', + }, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + video_data = self._download_xml( + 'http://www.tvigle.ru/xml/single.php?obj=%s' % video_id, video_id, 'Downloading video XML') + + video = video_data.find('./video') + + title = video.get('name') + description = video.get('anons') + if description: + description = clean_html(description) + thumbnail = video_data.get('img') + upload_date = unified_strdate(video.get('date')) + like_count = int_or_none(video.get('vtp')) + + formats = [] + for num, (format_id, format_note) in enumerate([['low_file', 'SQ'], ['file', 'HQ'], ['hd', 'HD 720']]): + video_url = video.get(format_id) + if not video_url: + continue + formats.append({ + 'url': video_url, + 'format_id': format_id, + 'format_note': format_note, + 'quality': num, + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'like_count': like_count, + 'age_limit': 18, + 'formats': formats, + }
\ No newline at end of file diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py index baa57f343..c90feefd2 100644 --- a/youtube_dl/extractor/veoh.py +++ b/youtube_dl/extractor/veoh.py @@ -4,6 +4,7 @@ import re import json from .common import InfoExtractor +from ..utils import compat_urllib_request class VeohIE(InfoExtractor): @@ -24,6 +25,13 @@ class VeohIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) + age_limit = 0 + if 'class="adultwarning-container"' in webpage: + self.report_age_confirmation() + age_limit = 18 + request = compat_urllib_request.Request(url) + request.add_header('Cookie', 'confirmedAdult=true') + webpage = self._download_webpage(request, video_id) m_youtube = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|")', webpage) if m_youtube is not None: @@ -44,4 +52,5 @@ class VeohIE(InfoExtractor): 'thumbnail': info.get('highResImage') or info.get('medResImage'), 'description': info['description'], 'view_count': info['views'], + 'age_limit': age_limit, } diff --git a/youtube_dl/extractor/vesti.py b/youtube_dl/extractor/vesti.py index f51d4dcfa..417282129 100644 --- a/youtube_dl/extractor/vesti.py +++ b/youtube_dl/extractor/vesti.py @@ -113,8 +113,8 @@ class VestiIE(InfoExtractor): priority_transport = playlist['priority_transport'] thumbnail = media['picture'] - width = media['width'] - height = media['height'] + width = int_or_none(media['width']) + height = int_or_none(media['height']) description = media['anons'] title = media['title'] duration = int_or_none(media.get('duration')) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index e458ac961..fa147a575 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -24,9 +24,10 @@ class VevoIE(InfoExtractor): (?P<id>[^&?#]+)''' _TESTS = [{ 'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280', - 'file': 'GB1101300280.mp4', "md5": "06bea460acb744eab74a9d7dcb4bfd61", 'info_dict': { + 'id': 'GB1101300280', + 'ext': 'mp4', "upload_date": "20130624", "uploader": "Hurts", "title": "Somebody to Die For", @@ -34,6 +35,33 @@ class VevoIE(InfoExtractor): "width": 1920, "height": 1080, } + }, { + 'note': 'v3 SMIL format', + 'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923', + 'md5': '893ec0e0d4426a1d96c01de8f2bdff58', + 'info_dict': { + 'id': 'USUV71302923', + 'ext': 'mp4', + 'upload_date': '20140219', + 'uploader': 'Cassadee Pope', + 'title': 'I Wish I Could Break Your Heart', + 'duration': 226.101, + 'age_limit': 0, + } + }, { + 'note': 'Age-limited video', + 'url': 'https://www.vevo.com/watch/justin-timberlake/tunnel-vision-explicit/USRV81300282', + 'info_dict': { + 'id': 'USRV81300282', + 'ext': 'mp4', + 'age_limit': 18, + 'title': 'Tunnel Vision (Explicit)', + 'uploader': 'Justin Timberlake', + 'upload_date': '20130704', + }, + 'params': { + 'skip_download': 'true', + } }] _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/' @@ -105,9 +133,31 @@ class VevoIE(InfoExtractor): video_info = self._download_json(json_url, video_id)['video'] formats = self._formats_from_json(video_info) + + is_explicit = video_info.get('isExplicit') + if is_explicit is True: + age_limit = 18 + elif is_explicit is False: + age_limit = 0 + else: + age_limit = None + + # Download SMIL + smil_blocks = sorted(( + f for f in video_info['videoVersions'] + if f['sourceType'] == 13), + key=lambda f: f['version']) + + smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % ( + self._SMIL_BASE_URL, video_id, video_id.lower()) + if smil_blocks: + smil_url_m = self._search_regex( + r'url="([^"]+)"', smil_blocks[-1]['data'], 'SMIL URL', + fatal=False) + if smil_url_m is not None: + smil_url = smil_url_m + try: - smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % ( - self._SMIL_BASE_URL, video_id, video_id.lower()) smil_xml = self._download_webpage(smil_url, video_id, 'Downloading SMIL info') formats.extend(self._formats_from_smil(smil_xml)) @@ -128,4 +178,5 @@ class VevoIE(InfoExtractor): 'upload_date': upload_date.strftime('%Y%m%d'), 'uploader': video_info['mainArtists'][0]['artistName'], 'duration': video_info['duration'], + 'age_limit': age_limit, } diff --git a/youtube_dl/extractor/videobam.py b/youtube_dl/extractor/videobam.py new file mode 100644 index 000000000..cdfff05ae --- /dev/null +++ b/youtube_dl/extractor/videobam.py @@ -0,0 +1,80 @@ +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..utils import int_or_none + + +class VideoBamIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?videobam\.com/(?:videos/download/)?(?P<id>[a-zA-Z]+)' + + _TESTS = [ + { + 'url': 'http://videobam.com/OiJQM', + 'md5': 'db471f27763a531f10416a0c58b5a1e0', + 'info_dict': { + 'id': 'OiJQM', + 'ext': 'mp4', + 'title': 'Is Alcohol Worse Than Ecstasy?', + 'description': 'md5:d25b96151515c91debc42bfbb3eb2683', + 'uploader': 'frihetsvinge', + }, + }, + { + 'url': 'http://videobam.com/pqLvq', + 'md5': 'd9a565b5379a99126ef94e1d7f9a383e', + 'note': 'HD video', + 'info_dict': { + 'id': 'pqLvq', + 'ext': 'mp4', + } + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + page = self._download_webpage('http://videobam.com/%s' % video_id, video_id, 'Downloading page') + + formats = [] + + for preference, format_id in enumerate(['low', 'high']): + mobj = re.search(r"%s: '(?P<url>[^']+)'" % format_id, page) + if not mobj: + continue + formats.append({ + 'url': mobj.group('url'), + 'ext': 'mp4', + 'format_id': format_id, + 'preference': preference, + }) + + if not formats: + player_config = json.loads(self._html_search_regex(r'var player_config = ({.+?});', page, 'player config')) + formats = [{ + 'url': item['url'], + 'ext': 'mp4', + } for item in player_config['playlist'] if 'autoPlay' in item] + + self._sort_formats(formats) + + title = self._og_search_title(page, default='VideoBam', fatal=False) + description = self._og_search_description(page, default=None) + thumbnail = self._og_search_thumbnail(page) + uploader = self._html_search_regex(r'Upload by ([^<]+)</a>', page, 'uploader', fatal=False, default=None) + view_count = int_or_none( + self._html_search_regex(r'<strong>Views:</strong> (\d+) ', page, 'view count', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'view_count': view_count, + 'formats': formats, + 'age_limit': 18, + }
\ No newline at end of file diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index c5ee84807..10c3d992d 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -221,7 +221,9 @@ class VimeoIE(SubtitlesInfoExtractor): # Extract video thumbnail video_thumbnail = config["video"].get("thumbnail") if video_thumbnail is None: - _, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in config["video"]["thumbs"].items())[-1] + video_thumbs = config["video"].get("thumbs") + if video_thumbs and isinstance(video_thumbs, dict): + _, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in video_thumbs.items())[-1] # Extract video description video_description = None diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index e14ff91d4..5bbc8ba88 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -1,8 +1,10 @@ from __future__ import unicode_literals import re +import json from .common import InfoExtractor +from ..utils import unified_strdate class VineIE(InfoExtractor): @@ -13,31 +15,46 @@ class VineIE(InfoExtractor): 'info_dict': { 'id': 'b9KOOWX7HUx', 'ext': 'mp4', - 'uploader': 'Jack Dorsey', 'title': 'Chicken.', + 'description': 'Chicken.', + 'upload_date': '20130519', + 'uploader': 'Jack Dorsey', + 'uploader_id': '76', }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - webpage_url = 'https://vine.co/v/' + video_id - webpage = self._download_webpage(webpage_url, video_id) - self.report_extraction(video_id) + webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id) - video_url = self._html_search_meta('twitter:player:stream', webpage, - 'video URL') + data = json.loads(self._html_search_regex( + r'window\.POST_DATA = { %s: ({.+?}) }' % video_id, webpage, 'vine data')) - uploader = self._html_search_regex(r'<p class="username">(.*?)</p>', - webpage, 'uploader', fatal=False, flags=re.DOTALL) + formats = [ + { + 'url': data['videoLowURL'], + 'ext': 'mp4', + 'format_id': 'low', + }, + { + 'url': data['videoUrl'], + 'ext': 'mp4', + 'format_id': 'standard', + } + ] return { 'id': video_id, - 'url': video_url, - 'ext': 'mp4', 'title': self._og_search_title(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'uploader': uploader, - } + 'description': data['description'], + 'thumbnail': data['thumbnailUrl'], + 'upload_date': unified_strdate(data['created']), + 'uploader': data['username'], + 'uploader_id': data['userIdStr'], + 'like_count': data['likes']['count'], + 'comment_count': data['comments']['count'], + 'repost_count': data['reposts']['count'], + 'formats': formats, + }
\ No newline at end of file diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index a293b8875..3b3bec92f 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -16,7 +16,7 @@ from ..utils import ( class VKIE(InfoExtractor): IE_NAME = 'vk.com' - _VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P<id>.*?)(?:\?|%2F|$)' + _VALID_URL = r'https?://vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>\d+).*?\bid=(?P<id>\d+)|(?:videos.*?\?.*?z=)?video(?P<videoid>.*?)(?:\?|%2F|$))' _NETRC_MACHINE = 'vk' _TESTS = [ @@ -43,6 +43,18 @@ class VKIE(InfoExtractor): } }, { + 'note': 'Embedded video', + 'url': 'http://vk.com/video_ext.php?oid=32194266&id=162925554&hash=7d8c2e0d5e05aeaa&hd=1', + 'md5': 'c7ce8f1f87bec05b3de07fdeafe21a0a', + 'info_dict': { + 'id': '162925554', + 'ext': 'mp4', + 'uploader': 'Vladimir Gavrin', + 'title': 'Lin Dan', + 'duration': 101, + } + }, + { 'url': 'http://vk.com/video-8871596_164049491', 'md5': 'a590bcaf3d543576c9bd162812387666', 'note': 'Only available for registered users', @@ -54,7 +66,7 @@ class VKIE(InfoExtractor): 'duration': 8352, }, 'skip': 'Requires vk account credentials', - } + }, ] def _login(self): @@ -82,7 +94,10 @@ class VKIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = mobj.group('videoid') + + if not video_id: + video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id')) info_url = 'http://vk.com/al_video.php?act=show&al=1&video=%s' % video_id info_page = self._download_webpage(info_url, video_id) diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py index 3237596a3..fc9237a3f 100644 --- a/youtube_dl/extractor/worldstarhiphop.py +++ b/youtube_dl/extractor/worldstarhiphop.py @@ -22,8 +22,8 @@ class WorldStarHipHopIE(InfoExtractor): webpage_src = self._download_webpage(url, video_id) m_vevo_id = re.search(r'videoId=(.*?)&?', - webpage_src) - + webpage_src) + if m_vevo_id is not None: self.to_screen(u'Vevo video detected:') return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo') diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index a75e1380d..5374495f9 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -103,6 +103,7 @@ class XHamsterIE(InfoExtractor): }] if not hd: + mrss_url = self._search_regex(r'<link rel="canonical" href="([^"]+)', webpage, 'mrss_url') webpage = self._download_webpage(mrss_url + '?hd', video_id, note='Downloading HD webpage') if is_hd(webpage): video_url = extract_video_url(webpage) diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 982619922..d3eefd086 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -7,19 +7,24 @@ from .common import InfoExtractor from ..utils import ( compat_urllib_parse_urlparse, compat_urllib_request, + parse_duration, + str_to_int, ) + class XTubeIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))' _TEST = { 'url': 'http://www.xtube.com/watch.php?v=kVTUy_G222_', - 'file': 'kVTUy_G222_.mp4', 'md5': '092fbdd3cbe292c920ef6fc6a8a9cdab', 'info_dict': { - "title": "strange erotica", - "description": "surreal gay themed erotica...almost an ET kind of thing", - "uploader": "greenshowers", - "age_limit": 18, + 'id': 'kVTUy_G222_', + 'ext': 'mp4', + 'title': 'strange erotica', + 'description': 'surreal gay themed erotica...almost an ET kind of thing', + 'uploader': 'greenshowers', + 'duration': 450, + 'age_limit': 18, } } @@ -32,10 +37,23 @@ class XTubeIE(InfoExtractor): req.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(req, video_id) - video_title = self._html_search_regex(r'<div class="p_5px[^>]*>([^<]+)', webpage, 'title') - video_uploader = self._html_search_regex(r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, 'uploader', fatal=False) - video_description = self._html_search_regex(r'<p class="video_description">([^<]+)', webpage, 'description', fatal=False) - video_url= self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, 'video_url').replace('\\/', '/') + video_title = self._html_search_regex(r'<p class="title">([^<]+)', webpage, 'title') + video_uploader = self._html_search_regex( + r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, 'uploader', fatal=False) + video_description = self._html_search_regex( + r'<p class="fieldsDesc">([^<]+)', webpage, 'description', fatal=False) + video_url = self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, 'video_url').replace('\\/', '/') + duration = parse_duration(self._html_search_regex( + r'<span class="bold">Runtime:</span> ([^<]+)</p>', webpage, 'duration', fatal=False)) + view_count = self._html_search_regex( + r'<span class="bold">Views:</span> ([\d,\.]+)</p>', webpage, 'view count', fatal=False) + if view_count: + view_count = str_to_int(view_count) + comment_count = self._html_search_regex( + r'<div id="commentBar">([\d,\.]+) Comments</div>', webpage, 'comment count', fatal=False) + if comment_count: + comment_count = str_to_int(comment_count) + path = compat_urllib_parse_urlparse(video_url).path extension = os.path.splitext(path)[1][1:] format = path.split('/')[5].split('_')[:2] @@ -48,6 +66,9 @@ class XTubeIE(InfoExtractor): 'title': video_title, 'uploader': video_uploader, 'description': video_description, + 'duration': duration, + 'view_count': view_count, + 'comment_count': comment_count, 'url': video_url, 'ext': extension, 'format': format, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e1ef90e38..166a0cf70 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -29,7 +29,6 @@ from ..utils import ( ExtractorError, int_or_none, PagedList, - RegexNotFoundError, unescapeHTML, unified_strdate, orderedSet, @@ -200,9 +199,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40}, '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40}, '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40}, - '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40}, + '138': {'ext': 'mp4', 'height': 2160, 'resolution': '2160p', 'format_note': 'DASH video', 'preference': -40}, '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40}, - '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40}, + '264': {'ext': 'mp4', 'height': 1440, 'resolution': '1440p', 'format_note': 'DASH video', 'preference': -40}, # Dash mp4 audio '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50}, @@ -1489,11 +1488,15 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): # the id of the playlist is just 'RD' + video_id url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id) webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix') - title_span = (get_element_by_attribute('class', 'title long-title', webpage) or - get_element_by_attribute('class', 'title ', webpage)) + search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage) + title_span = (search_title('playlist-title') or + search_title('title long-title') or search_title('title')) title = clean_html(title_span) - video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s' % re.escape(playlist_id) - ids = orderedSet(re.findall(video_re, webpage)) + video_re = r'''(?x)data-video-username="(.*?)".*? + href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id) + matches = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL)) + # Some of the videos may have been deleted, their username field is empty + ids = [video_id for (username, video_id) in matches if username] url_results = self._ids_to_results(ids) return self.playlist_result(url_results, playlist_id, title) @@ -1642,7 +1645,7 @@ class YoutubeChannelIE(InfoExtractor): class YoutubeUserIE(InfoExtractor): IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)' + _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)' _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s' _GDATA_PAGE_SIZE = 50 _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json' @@ -1741,12 +1744,50 @@ class YoutubeSearchIE(SearchInfoExtractor): for video_id in video_ids] return self.playlist_result(videos, query) + class YoutubeSearchDateIE(YoutubeSearchIE): IE_NAME = YoutubeSearchIE.IE_NAME + ':date' _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published' _SEARCH_KEY = 'ytsearchdate' IE_DESC = u'YouTube.com searches, newest videos first' + +class YoutubeSearchURLIE(InfoExtractor): + IE_DESC = u'YouTube.com search URLs' + IE_NAME = u'youtube:search_url' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + query = compat_urllib_parse.unquote_plus(mobj.group('query')) + + webpage = self._download_webpage(url, query) + result_code = self._search_regex( + r'(?s)<ol id="search-results"(.*?)</ol>', webpage, u'result HTML') + + part_codes = re.findall( + r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code) + entries = [] + for part_code in part_codes: + part_title = self._html_search_regex( + r'(?s)title="([^"]+)"', part_code, 'item title', fatal=False) + part_url_snippet = self._html_search_regex( + r'(?s)href="([^"]+)"', part_code, 'item URL') + part_url = compat_urlparse.urljoin( + 'https://www.youtube.com/', part_url_snippet) + entries.append({ + '_type': 'url', + 'url': part_url, + 'title': part_title, + }) + + return { + '_type': 'playlist', + 'entries': entries, + 'title': query, + } + + class YoutubeShowIE(InfoExtractor): IE_DESC = u'YouTube.com (multi-season) shows' _VALID_URL = r'https?://www\.youtube\.com/show/(.*)' diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 829f002cf..3b1ac4e9f 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -1,4 +1,5 @@ # coding: utf-8 +from __future__ import unicode_literals import re @@ -13,52 +14,42 @@ class ZDFIE(InfoExtractor): _VALID_URL = r'^https?://www\.zdf\.de/ZDFmediathek(?P<hash>#)?/(.*beitrag/(?:video/)?)(?P<video_id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?' _TEST = { - u"url": u"http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt", - u"file": u"2037704.webm", - u"info_dict": { - u"upload_date": u"20131127", - u"description": u"Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. Aber was bedeutet das für die Bürger? Sehen Sie hierzu das ZDFspezial \"Ende des Machtpokers - Große Koalition für Deutschland\".", - u"uploader": u"spezial", - u"title": u"ZDFspezial - Ende des Machtpokers" + 'url': 'http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt', + 'info_dict': { + 'id': '2037704', + 'ext': 'webm', + 'title': 'ZDFspezial - Ende des Machtpokers', + 'description': 'Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. Aber was bedeutet das für die Bürger? Sehen Sie hierzu das ZDFspezial "Ende des Machtpokers - Große Koalition für Deutschland".', + 'duration': 1022, + 'uploader': 'spezial', + 'uploader_id': '225948', + 'upload_date': '20131127', }, - u"skip": u"Videos on ZDF.de are depublicised in short order", + 'skip': 'Videos on ZDF.de are depublicised in short order', } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('video_id') - xml_url = u'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id + xml_url = 'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id doc = self._download_xml( xml_url, video_id, - note=u'Downloading video info', - errnote=u'Failed to download video info') + note='Downloading video info', + errnote='Failed to download video info') title = doc.find('.//information/title').text description = doc.find('.//information/detail').text + duration = int(doc.find('.//details/lengthSec').text) uploader_node = doc.find('.//details/originChannelTitle') uploader = None if uploader_node is None else uploader_node.text - duration_str = doc.find('.//details/length').text - duration_m = re.match(r'''(?x)^ - (?P<hours>[0-9]{2}) - :(?P<minutes>[0-9]{2}) - :(?P<seconds>[0-9]{2}) - (?:\.(?P<ms>[0-9]+)?) - ''', duration_str) - duration = ( - ( - (int(duration_m.group('hours')) * 60 * 60) + - (int(duration_m.group('minutes')) * 60) + - int(duration_m.group('seconds')) - ) - if duration_m - else None - ) + uploader_id_node = doc.find('.//details/originChannelId') + uploader_id = None if uploader_id_node is None else uploader_id_node.text upload_date = unified_strdate(doc.find('.//details/airtime').text) def xml_to_format(fnode): video_url = fnode.find('url').text - is_available = u'http://www.metafilegenerator' not in video_url + is_available = 'http://www.metafilegenerator' not in video_url format_id = fnode.attrib['basetype'] format_m = re.match(r'''(?x) @@ -71,22 +62,28 @@ class ZDFIE(InfoExtractor): quality = fnode.find('./quality').text abr = int(fnode.find('./audioBitrate').text) // 1000 - vbr = int(fnode.find('./videoBitrate').text) // 1000 + vbr_node = fnode.find('./videoBitrate') + vbr = None if vbr_node is None else int(vbr_node.text) // 1000 - format_note = u'' + width_node = fnode.find('./width') + width = None if width_node is None else int_or_none(width_node.text) + height_node = fnode.find('./height') + height = None if height_node is None else int_or_none(height_node.text) + + format_note = '' if not format_note: format_note = None return { - 'format_id': format_id + u'-' + quality, + 'format_id': format_id + '-' + quality, 'url': video_url, 'ext': ext, 'acodec': format_m.group('acodec'), 'vcodec': format_m.group('vcodec'), 'abr': abr, 'vbr': vbr, - 'width': int_or_none(fnode.find('./width').text), - 'height': int_or_none(fnode.find('./height').text), + 'width': width, + 'height': height, 'filesize': int_or_none(fnode.find('./filesize').text), 'format_note': format_note, 'protocol': proto, @@ -103,9 +100,10 @@ class ZDFIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'formats': formats, 'description': description, - 'uploader': uploader, 'duration': duration, + 'uploader': uploader, + 'uploader_id': uploader_id, 'upload_date': upload_date, - } + 'formats': formats, + }
\ No newline at end of file diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 471516b8f..02b8f7c45 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import contextlib import ctypes import datetime import email.utils @@ -771,6 +772,7 @@ def unified_strdate(date_str): '%B %d %Y', '%b %d %Y', '%Y-%m-%d', + '%d.%m.%Y', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%Y-%m-%d %H:%M:%S', @@ -779,6 +781,7 @@ def unified_strdate(date_str): '%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%S.%f0Z', '%Y-%m-%dT%H:%M:%S', + '%Y-%m-%dT%H:%M:%S.%f', '%Y-%m-%dT%H:%M', ] for expression in format_expressions: @@ -1244,3 +1247,19 @@ except TypeError: else: struct_pack = struct.pack struct_unpack = struct.unpack + + +def read_batch_urls(batch_fd): + def fixup(url): + if not isinstance(url, compat_str): + url = url.decode('utf-8', 'replace') + BOM_UTF8 = u'\xef\xbb\xbf' + if url.startswith(BOM_UTF8): + url = url[len(BOM_UTF8):] + url = url.strip() + if url.startswith(('#', ';', ']')): + return False + return url + + with contextlib.closing(batch_fd) as fd: + return [url for url in map(fixup, fd) if url] diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a92faa5a7..30cf4c188 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.02.21.1' +__version__ = '2014.03.04.2' |