diff options
author | Ismael Mejia <iemejia@gmail.com> | 2013-09-06 23:23:23 +0200 |
---|---|---|
committer | Ismael Mejia <iemejia@gmail.com> | 2013-09-06 23:24:41 +0200 |
commit | 72836fcee453386f4f16325c5b8fa4c1ba1bb442 (patch) | |
tree | 58efd36f4a56269a07774969e2ac385aacf8eae6 /youtube_dl/extractor | |
parent | d6e203b3dcef8f291b57021903e629d3e30e1f0b (diff) | |
parent | a7130543fa0368175740f5fa173ef920671db866 (diff) |
Merge branch 'master' into subtitles_rework
Diffstat (limited to 'youtube_dl/extractor')
33 files changed, 1241 insertions, 76 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index b4db8f0bf..fbe0b8cb7 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -1,3 +1,5 @@ +from .appletrailers import AppleTrailersIE +from .addanime import AddAnimeIE from .archiveorg import ArchiveOrgIE from .ard import ARDIE from .arte import ArteTvIE @@ -6,16 +8,21 @@ from .bandcamp import BandcampIE from .bliptv import BlipTVIE, BlipTVUserIE from .breakcom import BreakIE from .brightcove import BrightcoveIE +from .c56 import C56IE from .canalplus import CanalplusIE +from .canalc2 import Canalc2IE +from .cnn import CNNIE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE from .condenast import CondeNastIE from .criterion import CriterionIE from .cspan import CSpanIE from .dailymotion import DailymotionIE, DailymotionPlaylistIE +from .daum import DaumIE from .depositfiles import DepositFilesIE from .dotsub import DotsubIE from .dreisat import DreiSatIE +from .defense import DefenseGouvFrIE from .ehow import EHowIE from .eighttracks import EightTracksIE from .escapist import EscapistIE @@ -29,6 +36,7 @@ from .gametrailers import GametrailersIE from .generic import GenericIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE +from .hark import HarkIE from .hotnewhiphop import HotNewHipHopIE from .howcast import HowcastIE from .hypem import HypemIE @@ -44,23 +52,30 @@ from .keek import KeekIE from .liveleak import LiveLeakIE from .livestream import LivestreamIE from .metacafe import MetacafeIE +from .metacritic import MetacriticIE +from .mit import TechTVMITIE, MITIE from .mixcloud import MixcloudIE from .mtv import MTVIE from .muzu import MuzuTVIE from .myspass import MySpassIE from .myvideo import MyVideoIE +from .naver import NaverIE from .nba import NBAIE +from .nbc import NBCNewsIE from .ooyala import OoyalaIE +from .orf import ORFIE from .pbs import PBSIE from .photobucket import 
PhotobucketIE from .pornotube import PornotubeIE from .rbmaradio import RBMARadioIE from .redtube import RedTubeIE from .ringtv import RingTVIE +from .ro220 import Ro220IE from .roxwel import RoxwelIE from .rtlnow import RTLnowIE from .sina import SinaIE from .slashdot import SlashdotIE +from .sohu import SohuIE from .soundcloud import SoundcloudIE, SoundcloudSetIE from .spiegel import SpiegelIE from .stanfordoc import StanfordOpenClassroomIE @@ -71,18 +86,19 @@ from .ted import TEDIE from .tf1 import TF1IE from .thisav import ThisAVIE from .traileraddict import TrailerAddictIE +from .trilulilu import TriluliluIE from .tudou import TudouIE from .tumblr import TumblrIE from .tutv import TutvIE -from .ustream import UstreamIE from .unistra import UnistraIE +from .ustream import UstreamIE from .vbox7 import Vbox7IE +from .veehd import VeeHDIE from .veoh import VeohIE from .vevo import VevoIE from .videofyme import VideofyMeIE from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE -from .c56 import C56IE from .wat import WatIE from .weibo import WeiboIE from .wimp import WimpIE @@ -116,12 +132,14 @@ _ALL_CLASSES = [ ] _ALL_CLASSES.append(GenericIE) + def gen_extractors(): """ Return a list of an instance of every supported extractor. The order does matter; the first extractor matched is the one handling the URL. 
""" return [klass() for klass in _ALL_CLASSES] + def get_info_extractor(ie_name): """Returns the info extractor class with the given ie_name""" return globals()[ie_name+'IE'] diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py new file mode 100644 index 000000000..82a785a19 --- /dev/null +++ b/youtube_dl/extractor/addanime.py @@ -0,0 +1,75 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + compat_HTTPError, + compat_str, + compat_urllib_parse, + compat_urllib_parse_urlparse, + + ExtractorError, +) + + +class AddAnimeIE(InfoExtractor): + + _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)' + IE_NAME = u'AddAnime' + _TEST = { + u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', + u'file': u'24MR3YO5SAS9.flv', + u'md5': u'1036a0e0cd307b95bd8a8c3a5c8cfaf1', + u'info_dict': { + u"description": u"One Piece 606", + u"title": u"One Piece 606" + } + } + + def _real_extract(self, url): + try: + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + webpage = self._download_webpage(url, video_id) + except ExtractorError as ee: + if not isinstance(ee.cause, compat_HTTPError): + raise + + redir_webpage = ee.cause.read().decode('utf-8') + action = self._search_regex( + r'<form id="challenge-form" action="([^"]+)"', + redir_webpage, u'Redirect form') + vc = self._search_regex( + r'<input type="hidden" name="jschl_vc" value="([^"]+)"/>', + redir_webpage, u'redirect vc value') + av = re.search( + r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);', + redir_webpage) + if av is None: + raise ExtractorError(u'Cannot find redirect math task') + av_res = int(av.group(1)) + int(av.group(2)) * int(av.group(3)) + + parsed_url = compat_urllib_parse_urlparse(url) + av_val = av_res + len(parsed_url.netloc) + confirm_url = ( + parsed_url.scheme + u'://' + parsed_url.netloc + + action + '?' 
+ + compat_urllib_parse.urlencode({ + 'jschl_vc': vc, 'jschl_answer': compat_str(av_val)})) + self._download_webpage( + confirm_url, video_id, + note=u'Confirming after redirect') + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex(r"var normal_video_file = '(.*?)';", + webpage, u'video file URL') + video_title = self._og_search_title(webpage) + video_description = self._og_search_description(webpage) + + return { + '_type': 'video', + 'id': video_id, + 'url': video_url, + 'ext': 'flv', + 'title': video_title, + 'description': video_description + } diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py new file mode 100644 index 000000000..8b191c196 --- /dev/null +++ b/youtube_dl/extractor/appletrailers.py @@ -0,0 +1,166 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + determine_ext, +) + + +class AppleTrailersIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?trailers.apple.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)' + _TEST = { + u"url": u"http://trailers.apple.com/trailers/wb/manofsteel/", + u"playlist": [ + { + u"file": u"manofsteel-trailer4.mov", + u"md5": u"11874af099d480cc09e103b189805d5f", + u"info_dict": { + u"duration": 111, + u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_11624.jpg", + u"title": u"Trailer 4", + u"upload_date": u"20130523", + u"uploader_id": u"wb", + }, + }, + { + u"file": u"manofsteel-trailer3.mov", + u"md5": u"07a0a262aae5afe68120eed61137ab34", + u"info_dict": { + u"duration": 182, + u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_10793.jpg", + u"title": u"Trailer 3", + u"upload_date": u"20130417", + u"uploader_id": u"wb", + }, + }, + { + u"file": u"manofsteel-trailer.mov", + u"md5": u"e401fde0813008e3307e54b6f384cff1", + u"info_dict": { + u"duration": 148, + u"thumbnail": 
u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_8703.jpg", + u"title": u"Trailer", + u"upload_date": u"20121212", + u"uploader_id": u"wb", + }, + }, + { + u"file": u"manofsteel-teaser.mov", + u"md5": u"76b392f2ae9e7c98b22913c10a639c97", + u"info_dict": { + u"duration": 93, + u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_6899.jpg", + u"title": u"Teaser", + u"upload_date": u"20120721", + u"uploader_id": u"wb", + }, + } + ] + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + movie = mobj.group('movie') + uploader_id = mobj.group('company') + + playlist_url = url.partition(u'?')[0] + u'/includes/playlists/web.inc' + playlist_snippet = self._download_webpage(playlist_url, movie) + playlist_cleaned = re.sub(r'(?s)<script>.*?</script>', u'', playlist_snippet) + playlist_html = u'<html>' + playlist_cleaned + u'</html>' + + size_cache = {} + + doc = xml.etree.ElementTree.fromstring(playlist_html) + playlist = [] + for li in doc.findall('./div/ul/li'): + title = li.find('.//h3').text + video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() + thumbnail = li.find('.//img').attrib['src'] + + date_el = li.find('.//p') + upload_date = None + m = re.search(r':\s?(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<year>[0-9]{2})', date_el.text) + if m: + upload_date = u'20' + m.group('year') + m.group('month') + m.group('day') + runtime_el = date_el.find('./br') + m = re.search(r':\s?(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime_el.tail) + duration = None + if m: + duration = 60 * int(m.group('minutes')) + int(m.group('seconds')) + + formats = [] + for formats_el in li.findall('.//a'): + if formats_el.attrib['class'] != 'OverlayPanel': + continue + target = formats_el.attrib['target'] + + format_code = formats_el.text + if 'Automatic' in format_code: + continue + + size_q = formats_el.attrib['href'] + size_id = size_q.rpartition('#videos-')[2] + if size_id not in size_cache: + size_url = 
url + size_q + sizepage_html = self._download_webpage( + size_url, movie, + note=u'Downloading size info %s' % size_id, + errnote=u'Error while downloading size info %s' % size_id, + ) + _doc = xml.etree.ElementTree.fromstring(sizepage_html) + size_cache[size_id] = _doc + + sizepage_doc = size_cache[size_id] + links = sizepage_doc.findall('.//{http://www.w3.org/1999/xhtml}ul/{http://www.w3.org/1999/xhtml}li/{http://www.w3.org/1999/xhtml}a') + for vid_a in links: + href = vid_a.get('href') + if not href.endswith(target): + continue + detail_q = href.partition('#')[0] + detail_url = url + '/' + detail_q + + m = re.match(r'includes/(?P<detail_id>[^/]+)/', detail_q) + detail_id = m.group('detail_id') + + detail_html = self._download_webpage( + detail_url, movie, + note=u'Downloading detail %s %s' % (detail_id, size_id), + errnote=u'Error while downloading detail %s %s' % (detail_id, size_id) + ) + detail_doc = xml.etree.ElementTree.fromstring(detail_html) + movie_link_el = detail_doc.find('.//{http://www.w3.org/1999/xhtml}a') + assert movie_link_el.get('class') == 'movieLink' + movie_link = movie_link_el.get('href').partition('?')[0].replace('_', '_h') + ext = determine_ext(movie_link) + assert ext == 'mov' + + formats.append({ + 'format': format_code, + 'ext': ext, + 'url': movie_link, + }) + + info = { + '_type': 'video', + 'id': video_id, + 'title': title, + 'formats': formats, + 'title': title, + 'duration': duration, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'uploader_id': uploader_id, + 'user_agent': 'QuickTime compatible (youtube-dl)', + } + # TODO: Remove when #980 has been merged + info['url'] = formats[-1]['url'] + info['ext'] = formats[-1]['ext'] + + playlist.append(info) + + return { + '_type': 'playlist', + 'id': movie, + 'entries': playlist, + } diff --git a/youtube_dl/extractor/c56.py b/youtube_dl/extractor/c56.py index 4c8a8af09..dc3a8d47d 100644 --- a/youtube_dl/extractor/c56.py +++ b/youtube_dl/extractor/c56.py @@ -12,8 +12,8 @@ class 
C56IE(InfoExtractor): _TEST ={ u'url': u'http://www.56.com/u39/v_OTM0NDA3MTY.html', - u'file': u'93440716.mp4', - u'md5': u'9dc07b5c8e978112a6441f9e75d2b59e', + u'file': u'93440716.flv', + u'md5': u'e59995ac63d0457783ea05f93f12a866', u'info_dict': { u'title': u'网事知多少 第32期:车怒', }, diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py new file mode 100644 index 000000000..50832217a --- /dev/null +++ b/youtube_dl/extractor/canalc2.py @@ -0,0 +1,35 @@ +# coding: utf-8 +import re + +from .common import InfoExtractor + + +class Canalc2IE(InfoExtractor): + _IE_NAME = 'canalc2.tv' + _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?idVideo=(\d+)&voir=oui' + + _TEST = { + u'url': u'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui', + u'file': u'12163.mp4', + u'md5': u'060158428b650f896c542dfbb3d6487f', + u'info_dict': { + u'title': u'Terrasses du Numérique' + } + } + + def _real_extract(self, url): + video_id = re.match(self._VALID_URL, url).group(1) + webpage = self._download_webpage(url, video_id) + file_name = self._search_regex( + r"so\.addVariable\('file','(.*?)'\);", + webpage, 'file name') + video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name + + title = self._html_search_regex( + r'class="evenement8">(.*?)</a>', webpage, u'title') + + return {'id': video_id, + 'ext': 'mp4', + 'url': video_url, + 'title': title, + } diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 3b1c88876..1f02519a0 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -5,7 +5,7 @@ from .common import InfoExtractor from ..utils import unified_strdate class CanalplusIE(InfoExtractor): - _VALID_URL = r'https?://www\.canalplus\.fr/.*?\?vid=(?P<id>\d+)' + _VALID_URL = r'https?://(www\.canalplus\.fr/.*?\?vid=|player\.canalplus\.fr/#/)(?P<id>\d+)' _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s' IE_NAME = u'canalplus.fr' diff --git 
a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py new file mode 100644 index 000000000..a79f881cd --- /dev/null +++ b/youtube_dl/extractor/cnn.py @@ -0,0 +1,58 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import determine_ext + + +class CNNIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/ + (?P<path>.+?/(?P<title>[^/]+?)(?:\.cnn|(?=&)))''' + + _TESTS = [{ + u'url': u'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', + u'file': u'sports_2013_06_09_nadal-1-on-1.cnn.mp4', + u'md5': u'3e6121ea48df7e2259fe73a0628605c4', + u'info_dict': { + u'title': u'Nadal wins 8th French Open title', + u'description': u'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.', + }, + }, + { + u"url": u"http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29", + u"file": u"us_2013_08_21_sot-student-gives-epic-speech.georgia-institute-of-technology.mp4", + u"md5": u"b5cc60c60a3477d185af8f19a2a26f4e", + u"info_dict": { + u"title": "Student's epic speech stuns new freshmen", + u"description": "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"" + } + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + path = mobj.group('path') + page_title = mobj.group('title') + info_url = u'http://cnn.com/video/data/3.0/%s/index.xml' % path + info_xml = self._download_webpage(info_url, page_title) + info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) + + formats = [] + for f in info.findall('files/file'): + mf = re.match(r'(\d+)x(\d+)(?:_(.*)k)?',f.attrib['bitrate']) + if mf is not None: + formats.append((int(mf.group(1)), int(mf.group(2)), int(mf.group(3) or 0), f.text)) + formats = 
sorted(formats) + (_,_,_, video_path) = formats[-1] + video_url = 'http://ht.cdn.turner.com/cnn/big%s' % video_path + + thumbnails = sorted([((int(t.attrib['height']),int(t.attrib['width'])), t.text) for t in info.findall('images/image')]) + thumbs_dict = [{'resolution': res, 'url': t_url} for (res, t_url) in thumbnails] + + return {'id': info.attrib['id'], + 'title': info.find('headline').text, + 'url': video_url, + 'ext': determine_ext(video_url), + 'thumbnail': thumbnails[-1][1], + 'thumbnails': thumbs_dict, + 'description': info.find('description').text, + } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 52c4483c9..77726ee24 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -114,6 +114,11 @@ class InfoExtractor(object): """Real extraction process. Redefine in subclasses.""" pass + @classmethod + def ie_key(cls): + """A string for getting the InfoExtractor with get_info_extractor""" + return cls.__name__[:-2] + @property def IE_NAME(self): return type(self).__name__[:-2] @@ -129,7 +134,7 @@ class InfoExtractor(object): except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: if errnote is None: errnote = u'Unable to download webpage' - raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2]) + raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err) def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None): """ Returns a tuple (page content as string, URL handle) """ @@ -140,12 +145,17 @@ class InfoExtractor(object): urlh = self._request_webpage(url_or_request, video_id, note, errnote) content_type = urlh.headers.get('Content-Type', '') + webpage_bytes = urlh.read() m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) if m: encoding = m.group(1) else: - encoding = 'utf-8' - webpage_bytes = urlh.read() + m = 
re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]', + webpage_bytes[:1024]) + if m: + encoding = m.group(1).decode('ascii') + else: + encoding = 'utf-8' if self._downloader.params.get('dump_intermediate_pages', False): try: url = url_or_request.get_full_url() diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 003b1d8c3..f7dffd4cc 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -37,14 +37,14 @@ class DailyMotionSubtitlesIE(NoAutoSubtitlesIE): class DailymotionIE(DailyMotionSubtitlesIE, InfoExtractor): """Information Extractor for Dailymotion""" - _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' + _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)' IE_NAME = u'dailymotion' _TEST = { u'url': u'http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech', u'file': u'x33vw9.mp4', u'md5': u'392c4b85a60a90dc4792da41ce3144eb', u'info_dict': { - u"uploader": u"Alex and Van .", + u"uploader": u"Amphora Alex and Van .", u"title": u"Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\"" } } @@ -56,6 +56,7 @@ class DailymotionIE(DailyMotionSubtitlesIE, InfoExtractor): video_id = mobj.group(1).split('_')[0].split('?')[0] video_extension = 'mp4' + url = 'http://www.dailymotion.com/video/%s' % video_id # Retrieve video webpage to extract further information request = compat_urllib_request.Request(url) @@ -78,7 +79,8 @@ class DailymotionIE(DailyMotionSubtitlesIE, InfoExtractor): embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id embed_page = self._download_webpage(embed_url, video_id, u'Downloading embed page') - info = self._search_regex(r'var info = ({.*?}),', embed_page, 'video info') + info = self._search_regex(r'var info = ({.*?}),$', embed_page, + 'video info', flags=re.MULTILINE) info = json.loads(info) # TODO: support choosing qualities diff --git a/youtube_dl/extractor/daum.py 
b/youtube_dl/extractor/daum.py new file mode 100644 index 000000000..a804e83bd --- /dev/null +++ b/youtube_dl/extractor/daum.py @@ -0,0 +1,74 @@ +# encoding: utf-8 +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + determine_ext, +) + + +class DaumIE(InfoExtractor): + _VALID_URL = r'https?://tvpot\.daum\.net/.*?clipid=(?P<id>\d+)' + IE_NAME = u'daum.net' + + _TEST = { + u'url': u'http://tvpot.daum.net/clip/ClipView.do?clipid=52554690', + u'file': u'52554690.mp4', + u'info_dict': { + u'title': u'DOTA 2GETHER 시즌2 6회 - 2부', + u'description': u'DOTA 2GETHER 시즌2 6회 - 2부', + u'upload_date': u'20130831', + u'duration': 3868, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + canonical_url = 'http://tvpot.daum.net/v/%s' % video_id + webpage = self._download_webpage(canonical_url, video_id) + full_id = self._search_regex(r'<link rel="video_src" href=".+?vid=(.+?)"', + webpage, u'full id') + query = compat_urllib_parse.urlencode({'vid': full_id}) + info_xml = self._download_webpage( + 'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id, + u'Downloading video info') + urls_xml = self._download_webpage( + 'http://videofarm.daum.net/controller/api/open/v1_2/MovieData.apixml?' + query, + video_id, u'Downloading video formats info') + info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) + urls = xml.etree.ElementTree.fromstring(urls_xml.encode('utf-8')) + + self.to_screen(u'%s: Getting video urls' % video_id) + formats = [] + for format_el in urls.findall('result/output_list/output_list'): + profile = format_el.attrib['profile'] + format_query = compat_urllib_parse.urlencode({ + 'vid': full_id, + 'profile': profile, + }) + url_xml = self._download_webpage( + 'http://videofarm.daum.net/controller/api/open/v1_2/MovieLocation.apixml?' 
+ format_query, + video_id, note=False) + url_doc = xml.etree.ElementTree.fromstring(url_xml.encode('utf-8')) + format_url = url_doc.find('result/url').text + formats.append({ + 'url': format_url, + 'ext': determine_ext(format_url), + 'format_id': profile, + }) + + info = { + 'id': video_id, + 'title': info.find('TITLE').text, + 'formats': formats, + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': info.find('CONTENTS').text, + 'duration': int(info.find('DURATION').text), + 'upload_date': info.find('REGDTTM').text[:8], + } + # TODO: Remove when #980 has been merged + info.update(formats[-1]) + return info diff --git a/youtube_dl/extractor/defense.py b/youtube_dl/extractor/defense.py new file mode 100644 index 000000000..424d960da --- /dev/null +++ b/youtube_dl/extractor/defense.py @@ -0,0 +1,39 @@ +import re +import json + +from .common import InfoExtractor + + +class DefenseGouvFrIE(InfoExtractor): + _IE_NAME = 'defense.gouv.fr' + _VALID_URL = (r'http://.*?\.defense\.gouv\.fr/layout/set/' + r'ligthboxvideo/base-de-medias/webtv/(.*)') + + _TEST = { + u'url': (u'http://www.defense.gouv.fr/layout/set/ligthboxvideo/' + u'base-de-medias/webtv/attaque-chimique-syrienne-du-21-aout-2013-1'), + u'file': u'11213.mp4', + u'md5': u'75bba6124da7e63d2d60b5244ec9430c', + "info_dict": { + "title": "attaque-chimique-syrienne-du-21-aout-2013-1" + } + } + + def _real_extract(self, url): + title = re.match(self._VALID_URL, url).group(1) + webpage = self._download_webpage(url, title) + video_id = self._search_regex( + r"flashvars.pvg_id=\"(\d+)\";", + webpage, 'ID') + + json_url = ('http://static.videos.gouv.fr/brightcovehub/export/json/' + + video_id) + info = self._download_webpage(json_url, title, + 'Downloading JSON config') + video_url = json.loads(info)['renditions'][0]['url'] + + return {'id': video_id, + 'ext': 'mp4', + 'url': video_url, + 'title': title, + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 
da016f7ee..f92e61fea 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -8,11 +8,13 @@ from ..utils import ( compat_urllib_error, compat_urllib_parse, compat_urllib_request, + compat_urlparse, ExtractorError, ) from .brightcove import BrightcoveIE + class GenericIE(InfoExtractor): IE_DESC = u'Generic downloader that works on some sites' _VALID_URL = r'.*' @@ -23,7 +25,7 @@ class GenericIE(InfoExtractor): u'file': u'13601338388002.mp4', u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89', u'info_dict': { - u"uploader": u"www.hodiho.fr", + u"uploader": u"www.hodiho.fr", u"title": u"R\u00e9gis plante sa Jeep" } }, @@ -107,6 +109,11 @@ class GenericIE(InfoExtractor): return new_url def _real_extract(self, url): + parsed_url = compat_urlparse.urlparse(url) + if not parsed_url.scheme: + self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http') + return self.url_result('http://' + url) + try: new_url = self._test_redirect(url) if new_url: @@ -124,7 +131,7 @@ class GenericIE(InfoExtractor): raise ExtractorError(u'Invalid URL: %s' % url) self.report_extraction(video_id) - # Look for BrigthCove: + # Look for BrightCove: m_brightcove = re.search(r'<object.+?class=([\'"]).*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL) if m_brightcove is not None: self.to_screen(u'Brightcove video detected.') @@ -151,7 +158,7 @@ class GenericIE(InfoExtractor): mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage) if mobj is None: # HTML5 video - mobj = re.search(r'<video[^<]*>.*?<source .*?src="([^"]+)"', webpage, flags=re.DOTALL) + mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? 
src="([^"]+)"', webpage, flags=re.DOTALL) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) @@ -160,8 +167,9 @@ class GenericIE(InfoExtractor): if mobj.group(1) is None: raise ExtractorError(u'Invalid URL: %s' % url) - video_url = compat_urllib_parse.unquote(mobj.group(1)) - video_id = os.path.basename(video_url) + video_url = mobj.group(1) + video_url = compat_urlparse.urljoin(url, video_url) + video_id = compat_urllib_parse.unquote(os.path.basename(video_url)) # here's a fun little line of code for you: video_extension = os.path.splitext(video_id)[1][1:] diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index 9f7fc19a4..f1cd88983 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -57,8 +57,8 @@ class GooglePlusIE(InfoExtractor): webpage, 'title', default=u'NA') # Step 2, Simulate clicking the image box to launch video - DOMAIN = 'https://plus.google.com' - video_page = self._search_regex(r'<a href="((?:%s)?/photos/.*?)"' % re.escape(DOMAIN), + DOMAIN = 'https://plus.google.com/' + video_page = self._search_regex(r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN), webpage, u'video page URL') if not video_page.startswith(DOMAIN): video_page = DOMAIN + video_page diff --git a/youtube_dl/extractor/hark.py b/youtube_dl/extractor/hark.py new file mode 100644 index 000000000..5bdd08afa --- /dev/null +++ b/youtube_dl/extractor/hark.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- + +import re +import json + +from .common import InfoExtractor +from ..utils import determine_ext + +class HarkIE(InfoExtractor): + _VALID_URL = r'https?://www\.hark\.com/clips/(.+?)-.+' + _TEST = { + u'url': u'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013', + u'file': u'mmbzyhkgny.mp3', + u'md5': u'6783a58491b47b92c7c1af5a77d4cbee', + u'info_dict': { + u'title': u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' on May 23, 2013", + 
u'description': u'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.', + u'duration': 11, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + json_url = "http://www.hark.com/clips/%s.json" %(video_id) + info_json = self._download_webpage(json_url, video_id) + info = json.loads(info_json) + final_url = info['url'] + + return {'id': video_id, + 'url' : final_url, + 'title': info['name'], + 'ext': determine_ext(final_url), + 'description': info['description'], + 'thumbnail': info['image_original'], + 'duration': info['duration'], + } diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index 62abab655..b1c84278a 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -13,7 +13,7 @@ class IGNIE(InfoExtractor): Some videos of it.ign.com are also supported """ - _VALID_URL = r'https?://.+?\.ign\.com/(?:videos|show_videos)(/.+)?/(?P<name_or_id>.+)' + _VALID_URL = r'https?://.+?\.ign\.com/(?P<type>videos|show_videos|articles)(/.+)?/(?P<name_or_id>.+)' IE_NAME = u'ign.com' _CONFIG_URL_TEMPLATE = 'http://www.ign.com/videos/configs/id/%s.config' @@ -41,7 +41,11 @@ class IGNIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) name_or_id = mobj.group('name_or_id') + page_type = mobj.group('type') webpage = self._download_webpage(url, name_or_id) + if page_type == 'articles': + video_url = self._search_regex(r'var videoUrl = "(.+?)"', webpage, u'video url') + return self.url_result(video_url, ie='IGN') video_id = self._find_video_id(webpage) result = self._get_video_info(video_id) description = self._html_search_regex(self._DESCRIPTION_RE, @@ -68,7 +72,7 @@ class IGNIE(InfoExtractor): class OneUPIE(IGNIE): """Extractor for 1up.com, it uses the ign videos 
system.""" - _VALID_URL = r'https?://gamevideos.1up.com/video/id/(?P<name_or_id>.+)' + _VALID_URL = r'https?://gamevideos.1up.com/(?P<type>video)/id/(?P<name_or_id>.+)' IE_NAME = '1up.com' _DESCRIPTION_RE = r'<div id="vid_summary">(.+?)</div>' diff --git a/youtube_dl/extractor/kankan.py b/youtube_dl/extractor/kankan.py index 8537ba584..445d46501 100644 --- a/youtube_dl/extractor/kankan.py +++ b/youtube_dl/extractor/kankan.py @@ -21,8 +21,10 @@ class KankanIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - title = self._search_regex(r'G_TITLE=[\'"](.+?)[\'"]', webpage, u'video title') - gcid = self._search_regex(r'lurl:[\'"]http://.+?/.+?/(.+?)/', webpage, u'gcid') + title = self._search_regex(r'(?:G_TITLE=|G_MOVIE_TITLE = )[\'"](.+?)[\'"]', webpage, u'video title') + surls = re.search(r'surls:\[\'.+?\'\]|lurl:\'.+?\.flv\'', webpage).group(0) + gcids = re.findall(r"http://.+?/.+?/(.+?)/", surls) + gcid = gcids[-1] video_info_page = self._download_webpage('http://p2s.cl.kankan.com/getCdnresource_flv?gcid=%s' % gcid, video_id, u'Downloading video url info') diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index e38dc98b4..e537648ff 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -122,7 +122,7 @@ class MetacafeIE(InfoExtractor): video_title = self._html_search_regex(r'(?im)<title>(.*) - Video</title>', webpage, u'title') description = self._og_search_description(webpage) video_uploader = self._html_search_regex( - r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("channel","([^"]+)"\);', + r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);', webpage, u'uploader nickname', fatal=False) return { diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py new file mode 100644 index 000000000..449138b56 --- /dev/null +++ b/youtube_dl/extractor/metacritic.py @@ -0,0 +1,55 @@ +import re 
+import xml.etree.ElementTree +import operator + +from .common import InfoExtractor + + +class MetacriticIE(InfoExtractor): + _VALID_URL = r'https?://www\.metacritic\.com/.+?/trailers/(?P<id>\d+)' + + _TEST = { + u'url': u'http://www.metacritic.com/game/playstation-4/infamous-second-son/trailers/3698222', + u'file': u'3698222.mp4', + u'info_dict': { + u'title': u'inFamous: Second Son - inSide Sucker Punch: Smoke & Mirrors', + u'description': u'Take a peak behind-the-scenes to see how Sucker Punch brings smoke into the universe of inFAMOUS Second Son on the PS4.', + u'duration': 221, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + # The xml is not well formatted, there are raw '&' + info_xml = self._download_webpage('http://www.metacritic.com/video_data?video=' + video_id, + video_id, u'Downloading info xml').replace('&', '&amp;') + info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) + + clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id) + formats = [] + for videoFile in clip.findall('httpURI/videoFile'): + rate_str = videoFile.find('rate').text + video_url = videoFile.find('filePath').text + formats.append({ + 'url': video_url, + 'ext': 'mp4', + 'format_id': rate_str, + 'rate': int(rate_str), + }) + formats.sort(key=operator.itemgetter('rate')) + + description = self._html_search_regex(r'<b>Description:</b>(.*?)</p>', + webpage, u'description', flags=re.DOTALL) + + info = { + 'id': video_id, + 'title': clip.find('title').text, + 'formats': formats, + 'description': description, + 'duration': int(clip.find('duration').text), + } + # TODO: Remove when #980 has been merged + info.update(formats[-1]) + return info diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py new file mode 100644 index 000000000..52be9232f --- /dev/null +++ b/youtube_dl/extractor/mit.py @@ -0,0 +1,74 @@ +import re 
+import json + +from .common import InfoExtractor +from ..utils import ( + clean_html, + get_element_by_id, +) + + +class TechTVMITIE(InfoExtractor): + IE_NAME = u'techtv.mit.edu' + _VALID_URL = r'https?://techtv\.mit\.edu/(videos|embeds)/(?P<id>\d+)' + + _TEST = { + u'url': u'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set', + u'file': u'25418.mp4', + u'md5': u'1f8cb3e170d41fd74add04d3c9330e5f', + u'info_dict': { + u'title': u'MIT DNA Learning Center Set', + u'description': u'md5:82313335e8a8a3f243351ba55bc1b474', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + raw_page = self._download_webpage( + 'http://techtv.mit.edu/videos/%s' % video_id, video_id) + clean_page = re.compile(u'<!--.*?-->', re.S).sub(u'', raw_page) + + base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)', + raw_page, u'base url') + formats_json = self._search_regex(r'bitrates: (\[.+?\])', raw_page, + u'video formats') + formats = json.loads(formats_json) + formats = sorted(formats, key=lambda f: f['bitrate']) + + title = get_element_by_id('edit-title', clean_page) + description = clean_html(get_element_by_id('edit-description', clean_page)) + thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'', + raw_page, u'thumbnail', flags=re.DOTALL) + + return {'id': video_id, + 'title': title, + 'url': base_url + formats[-1]['url'].replace('mp4:', ''), + 'ext': 'mp4', + 'description': description, + 'thumbnail': thumbnail, + } + + +class MITIE(TechTVMITIE): + IE_NAME = u'video.mit.edu' + _VALID_URL = r'https?://video\.mit\.edu/watch/(?P<title>[^/]+)' + + _TEST = { + u'url': u'http://video.mit.edu/watch/the-government-is-profiling-you-13222/', + u'file': u'21783.mp4', + u'md5': u'7db01d5ccc1895fc5010e9c9e13648da', + u'info_dict': { + u'title': u'The Government is Profiling You', + u'description': u'md5:ad5795fe1e1623b73620dbfd47df9afd', + }, + } + + def _real_extract(self, url): + mobj = 
re.match(self._VALID_URL, url) + page_title = mobj.group('title') + webpage = self._download_webpage(url, page_title) + self.to_screen('%s: Extracting %s url' % (page_title, TechTVMITIE.IE_NAME)) + embed_url = self._search_regex(r'<iframe .*?src="(.+?)"', webpage, + u'embed url') + return self.url_result(embed_url, ie='TechTVMIT') diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py new file mode 100644 index 000000000..9df236d69 --- /dev/null +++ b/youtube_dl/extractor/naver.py @@ -0,0 +1,73 @@ +# encoding: utf-8 +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + ExtractorError, +) + + +class NaverIE(InfoExtractor): + _VALID_URL = r'https?://tvcast\.naver\.com/v/(?P<id>\d+)' + + _TEST = { + u'url': u'http://tvcast.naver.com/v/81652', + u'file': u'81652.mp4', + u'info_dict': { + u'title': u'[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번', + u'description': u'합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.', + u'upload_date': u'20130903', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + webpage = self._download_webpage(url, video_id) + m_id = re.search(r'var rmcPlayer = new nhn.rmcnmv.RMCVideoPlayer\("(.+?)", "(.+?)"', + webpage) + if m_id is None: + raise ExtractorError(u'couldn\'t extract vid and key') + vid = m_id.group(1) + key = m_id.group(2) + query = compat_urllib_parse.urlencode({'vid': vid, 'inKey': key,}) + query_urls = compat_urllib_parse.urlencode({ + 'masterVid': vid, + 'protocol': 'p2p', + 'inKey': key, + }) + info_xml = self._download_webpage( + 'http://serviceapi.rmcnmv.naver.com/flash/videoInfo.nhn?' + query, + video_id, u'Downloading video info') + urls_xml = self._download_webpage( + 'http://serviceapi.rmcnmv.naver.com/flash/playableEncodingOption.nhn?' 
+ query_urls, + video_id, u'Downloading video formats info') + info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) + urls = xml.etree.ElementTree.fromstring(urls_xml.encode('utf-8')) + + formats = [] + for format_el in urls.findall('EncodingOptions/EncodingOption'): + domain = format_el.find('Domain').text + if domain.startswith('rtmp'): + continue + formats.append({ + 'url': domain + format_el.find('uri').text, + 'ext': 'mp4', + 'width': int(format_el.find('width').text), + 'height': int(format_el.find('height').text), + }) + + info = { + 'id': video_id, + 'title': info.find('Subject').text, + 'formats': formats, + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'upload_date': info.find('WriteDate').text.replace('.', ''), + 'view_count': int(info.find('PlayCount').text), + } + # TODO: Remove when #980 has been merged + info.update(formats[-1]) + return info diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py new file mode 100644 index 000000000..3bc9dae6d --- /dev/null +++ b/youtube_dl/extractor/nbc.py @@ -0,0 +1,33 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import find_xpath_attr, compat_str + + +class NBCNewsIE(InfoExtractor): + _VALID_URL = r'https?://www\.nbcnews\.com/video/.+?/(?P<id>\d+)' + + _TEST = { + u'url': u'http://www.nbcnews.com/video/nbc-news/52753292', + u'file': u'52753292.flv', + u'md5': u'47abaac93c6eaf9ad37ee6c4463a5179', + u'info_dict': { + u'title': u'Crew emerges after four-month Mars food study', + u'description': u'md5:24e632ffac72b35f8b67a12d1b6ddfc1', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + info_xml = self._download_webpage('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id) + info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')).find('video') + + return {'id': video_id, + 'title': 
info.find('headline').text, + 'ext': 'flv', + 'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text, + 'description': compat_str(info.find('caption').text), + 'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text, + } diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py new file mode 100644 index 000000000..cfca2a063 --- /dev/null +++ b/youtube_dl/extractor/orf.py @@ -0,0 +1,54 @@ +# coding: utf-8 + +import re +import xml.etree.ElementTree +import json + +from .common import InfoExtractor +from ..utils import ( + compat_urlparse, + ExtractorError, + find_xpath_attr, +) + +class ORFIE(InfoExtractor): + _VALID_URL = r'https?://tvthek.orf.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + webpage = self._download_webpage(url, playlist_id) + + flash_xml = self._search_regex('ORF.flashXML = \'(.+?)\'', webpage, u'flash xml') + flash_xml = compat_urlparse.parse_qs('xml='+flash_xml)['xml'][0] + flash_config = xml.etree.ElementTree.fromstring(flash_xml.encode('utf-8')) + playlist_json = self._search_regex(r'playlist\': \'(\[.*?\])\'', webpage, u'playlist').replace(r'\"','"') + playlist = json.loads(playlist_json) + + videos = [] + ns = '{http://tempuri.org/XMLSchema.xsd}' + xpath = '%(ns)sPlaylist/%(ns)sItems/%(ns)sItem' % {'ns': ns} + webpage_description = self._og_search_description(webpage) + for (i, (item, info)) in enumerate(zip(flash_config.findall(xpath), playlist), 1): + # Get best quality url + rtmp_url = None + for q in ['Q6A', 'Q4A', 'Q1A']: + video_url = find_xpath_attr(item, '%sVideoUrl' % ns, 'quality', q) + if video_url is not None: + rtmp_url = video_url.text + break + if rtmp_url is None: + raise ExtractorError(u'Couldn\'t get video url: %s' % info['id']) + description = self._html_search_regex( + r'id="playlist_entry_%s".*?<p>(.*?)</p>' % i, webpage, + u'description', default=webpage_description, 
flags=re.DOTALL) + videos.append({ + '_type': 'video', + 'id': info['id'], + 'title': info['title'], + 'url': rtmp_url, + 'ext': 'flv', + 'description': description, + }) + + return videos diff --git a/youtube_dl/extractor/ro220.py b/youtube_dl/extractor/ro220.py new file mode 100644 index 000000000..c32f64d99 --- /dev/null +++ b/youtube_dl/extractor/ro220.py @@ -0,0 +1,42 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + compat_parse_qs, +) + + +class Ro220IE(InfoExtractor): + IE_NAME = '220.ro' + _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?220\.ro/(?P<category>[^/]+)/(?P<shorttitle>[^/]+)/(?P<video_id>[^/]+)' + _TEST = { + u"url": u"http://www.220.ro/sport/Luati-Le-Banii-Sez-4-Ep-1/LYV6doKo7f/", + u'file': u'LYV6doKo7f.mp4', + u'md5': u'03af18b73a07b4088753930db7a34add', + u'info_dict': { + u"title": u"Luati-le Banii sez 4 ep 1", + u"description": u"Iata-ne reveniti dupa o binemeritata vacanta. Va astept si pe Facebook cu pareri si comentarii.", + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + + webpage = self._download_webpage(url, video_id) + flashVars_str = self._search_regex( + r'<param name="flashVars" value="([^"]+)"', + webpage, u'flashVars') + flashVars = compat_parse_qs(flashVars_str) + + info = { + '_type': 'video', + 'id': video_id, + 'ext': 'mp4', + 'url': flashVars['videoURL'][0], + 'title': flashVars['title'][0], + 'description': clean_html(flashVars['desc'][0]), + 'thumbnail': flashVars['preview'][0], + } + return info diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py index 2f134e6a7..7bb236c2b 100644 --- a/youtube_dl/extractor/rtlnow.py +++ b/youtube_dl/extractor/rtlnow.py @@ -8,8 +8,8 @@ from ..utils import ( ) class RTLnowIE(InfoExtractor): - """Information Extractor for RTLnow, RTL2now and VOXnow""" - _VALID_URL = 
r'(?:http://)?(?P<url>(?P<base_url>rtl(?:(?P<is_rtl2>2)|-)now\.rtl(?(is_rtl2)2|)\.de/|(?:www\.)?voxnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' + """Information Extractor for RTL NOW, RTL2 NOW, SUPER RTL NOW and VOX NOW""" + _VALID_URL = r'(?:http://)?(?P<url>(?P<base_url>rtl-now\.rtl\.de/|rtl2now\.rtl2\.de/|(?:www\.)?voxnow\.de/|(?:www\.)?superrtlnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' _TESTS = [{ u'url': u'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1', u'file': u'90419.flv', @@ -48,6 +48,19 @@ class RTLnowIE(InfoExtractor): u'params': { u'skip_download': True, }, + }, + { + u'url': u'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1', + u'file': u'99205.flv', + u'info_dict': { + u'upload_date': u'20080928', + u'title': u'Medicopter 117 - Angst!', + u'description': u'Angst!', + u'thumbnail': u'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg' + }, + u'params': { + u'skip_download': True, + }, }] def _real_extract(self,url): diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py new file mode 100644 index 000000000..77bb0a8dc --- /dev/null +++ b/youtube_dl/extractor/sohu.py @@ -0,0 +1,90 @@ +# encoding: utf-8 + +import json +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class SohuIE(InfoExtractor): + _VALID_URL = r'https?://tv\.sohu\.com/\d+?/n(?P<id>\d+)\.shtml.*?' 
+ + _TEST = { + u'url': u'http://tv.sohu.com/20130724/n382479172.shtml#super', + u'file': u'382479172.mp4', + u'md5': u'bde8d9a6ffd82c63a1eefaef4eeefec7', + u'info_dict': { + u'title': u'MV:Far East Movement《The Illest》', + }, + } + + def _real_extract(self, url): + + def _fetch_data(vid_id): + base_data_url = u'http://hot.vrs.sohu.com/vrs_flash.action?vid=' + data_url = base_data_url + str(vid_id) + data_json = self._download_webpage( + data_url, video_id, + note=u'Downloading JSON data for ' + str(vid_id)) + return json.loads(data_json) + + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + raw_title = self._html_search_regex(r'(?s)<title>(.+?)</title>', + webpage, u'video title') + title = raw_title.partition('-')[0].strip() + + vid = self._html_search_regex(r'var vid="(\d+)"', webpage, + u'video path') + data = _fetch_data(vid) + + QUALITIES = ('ori', 'super', 'high', 'nor') + vid_ids = [data['data'][q + 'Vid'] + for q in QUALITIES + if data['data'][q + 'Vid'] != 0] + if not vid_ids: + raise ExtractorError(u'No formats available for this video') + + # For now, we just pick the highest available quality + vid_id = vid_ids[-1] + + format_data = data if vid == vid_id else _fetch_data(vid_id) + part_count = format_data['data']['totalBlocks'] + allot = format_data['allot'] + prot = format_data['prot'] + clipsURL = format_data['data']['clipsURL'] + su = format_data['data']['su'] + + playlist = [] + for i in range(part_count): + part_url = ('http://%s/?prot=%s&file=%s&new=%s' % + (allot, prot, clipsURL[i], su[i])) + part_str = self._download_webpage( + part_url, video_id, + note=u'Downloading part %d of %d' % (i+1, part_count)) + + part_info = part_str.split('|') + video_url = '%s%s?key=%s' % (part_info[0], su[i], part_info[3]) + + video_info = { + 'id': '%s_part%02d' % (video_id, i + 1), + 'title': title, + 'url': video_url, + 'ext': 'mp4', + } + playlist.append(video_info) + + if len(playlist) == 
1: + info = playlist[0] + info['id'] = video_id + else: + info = { + '_type': 'playlist', + 'entries': playlist, + 'id': video_id, + } + + return info diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py new file mode 100644 index 000000000..f278951ba --- /dev/null +++ b/youtube_dl/extractor/trilulilu.py @@ -0,0 +1,73 @@ +import json +import re +import xml.etree.ElementTree + +from .common import InfoExtractor + + +class TriluliluIE(InfoExtractor): + _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?trilulilu\.ro/video-(?P<category>[^/]+)/(?P<video_id>[^/]+)' + _TEST = { + u"url": u"http://www.trilulilu.ro/video-animatie/big-buck-bunny-1", + u'file': u"big-buck-bunny-1.mp4", + u'info_dict': { + u"title": u"Big Buck Bunny", + u"description": u":) pentru copilul din noi", + }, + # Server ignores Range headers (--test) + u"params": { + u"skip_download": True + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage) + thumbnail = self._og_search_thumbnail(webpage) + description = self._og_search_description(webpage) + + log_str = self._search_regex( + r'block_flash_vars[ ]=[ ]({[^}]+})', webpage, u'log info') + log = json.loads(log_str) + + format_url = (u'http://fs%(server)s.trilulilu.ro/%(hash)s/' + u'video-formats2' % log) + format_str = self._download_webpage( + format_url, video_id, + note=u'Downloading formats', + errnote=u'Error while downloading formats') + + format_doc = xml.etree.ElementTree.fromstring(format_str) + + video_url_template = ( + u'http://fs%(server)s.trilulilu.ro/stream.php?type=video' + u'&source=site&hash=%(hash)s&username=%(userid)s&' + u'key=ministhebest&format=%%s&sig=&exp=' % + log) + formats = [ + { + 'format': fnode.text, + 'url': video_url_template % fnode.text, + } + + for fnode in format_doc.findall('./formats/format') + ] + + info = { + '_type': 'video', + 
'id': video_id, + 'formats': formats, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + } + + # TODO: Remove when #980 has been merged + info['url'] = formats[-1]['url'] + info['ext'] = formats[-1]['format'].partition('-')[0] + + return info diff --git a/youtube_dl/extractor/unistra.py b/youtube_dl/extractor/unistra.py index 5ba0a9061..516e18914 100644 --- a/youtube_dl/extractor/unistra.py +++ b/youtube_dl/extractor/unistra.py @@ -11,7 +11,7 @@ class UnistraIE(InfoExtractor): u'md5': u'736f605cfdc96724d55bb543ab3ced24', u'info_dict': { u'title': u'M!ss Yella', - u'description': u'md5:75e8439a3e2981cd5d4b6db232e8fdfc', + u'description': u'md5:104892c71bd48e55d70b902736b81bbf', }, } diff --git a/youtube_dl/extractor/veehd.py b/youtube_dl/extractor/veehd.py new file mode 100644 index 000000000..3a99a29c6 --- /dev/null +++ b/youtube_dl/extractor/veehd.py @@ -0,0 +1,56 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( + compat_urlparse, + get_element_by_id, + clean_html, +) + +class VeeHDIE(InfoExtractor): + _VALID_URL = r'https?://veehd.com/video/(?P<id>\d+)' + + _TEST = { + u'url': u'http://veehd.com/video/4686958', + u'file': u'4686958.mp4', + u'info_dict': { + u'title': u'Time Lapse View from Space ( ISS)', + u'uploader_id': u'spotted', + u'description': u'md5:f0094c4cf3a72e22bc4e4239ef767ad7', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + player_path = self._search_regex(r'\$\("#playeriframe"\).attr\({src : "(.+?)"', + webpage, u'player path') + player_url = compat_urlparse.urljoin(url, player_path) + player_page = self._download_webpage(player_url, video_id, + u'Downloading player page') + config_json = self._search_regex(r'value=\'config=({.+?})\'', + player_page, u'config json') + config = json.loads(config_json) + + video_url = compat_urlparse.unquote(config['clip']['url']) + 
title = clean_html(get_element_by_id('videoName', webpage).rpartition('|')[0]) + uploader_id = self._html_search_regex(r'<a href="/profile/\d+">(.+?)</a>', + webpage, u'uploader') + thumbnail = self._search_regex(r'<img id="veehdpreview" src="(.+?)"', + webpage, u'thumbnail') + description = self._html_search_regex(r'<td class="infodropdown".*?<div>(.*?)<ul', + webpage, u'description', flags=re.DOTALL) + + return { + '_type': 'video', + 'id': video_id, + 'title': title, + 'url': video_url, + 'ext': 'mp4', + 'uploader_id': uploader_id, + 'thumbnail': thumbnail, + 'description': description, + } diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 512e06e2a..4a7d82b7a 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -44,6 +44,16 @@ class VimeoIE(InfoExtractor): u'title': u'Andy Allan - Putting the Carto into OpenStreetMap Cartography', }, }, + { + u'url': u'http://player.vimeo.com/video/54469442', + u'file': u'54469442.mp4', + u'md5': u'619b811a4417aa4abe78dc653becf511', + u'note': u'Videos that embed the url in the player page', + u'info_dict': { + u'title': u'Kathy Sierra: Building the minimum Badass User, Business of Software', + u'uploader': u'The BLN & Business of Software', + }, + }, ] def _login(self): @@ -112,7 +122,8 @@ class VimeoIE(InfoExtractor): # Extract the config JSON try: - config = webpage.split(' = {config:')[1].split(',assets:')[0] + config = self._search_regex([r' = {config:({.+?}),assets:', r'c=({.+?);'], + webpage, u'info section', flags=re.DOTALL) config = json.loads(config) except: if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): @@ -132,12 +143,22 @@ class VimeoIE(InfoExtractor): video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None # Extract video thumbnail - video_thumbnail = config["video"]["thumbnail"] + video_thumbnail = config["video"].get("thumbnail") + if 
video_thumbnail is None: + _, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in config["video"]["thumbs"].items())[-1] # Extract video description - video_description = get_element_by_attribute("itemprop", "description", webpage) - if video_description: video_description = clean_html(video_description) - else: video_description = u'' + video_description = None + try: + video_description = get_element_by_attribute("itemprop", "description", webpage) + if video_description: video_description = clean_html(video_description) + except AssertionError as err: + # On some pages like (http://player.vimeo.com/video/54469442) the + # html tags are not closed, python 2.6 cannot handle it + if err.args[0] == 'we should not get here!': + pass + else: + raise # Extract upload date video_upload_date = None @@ -154,14 +175,15 @@ class VimeoIE(InfoExtractor): # TODO bind to format param codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')] files = { 'hd': [], 'sd': [], 'other': []} + config_files = config["video"].get("files") or config["request"].get("files") for codec_name, codec_extension in codecs: - if codec_name in config["video"]["files"]: - if 'hd' in config["video"]["files"][codec_name]: + if codec_name in config_files: + if 'hd' in config_files[codec_name]: files['hd'].append((codec_name, codec_extension, 'hd')) - elif 'sd' in config["video"]["files"][codec_name]: + elif 'sd' in config_files[codec_name]: files['sd'].append((codec_name, codec_extension, 'sd')) else: - files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0])) + files['other'].append((codec_name, codec_extension, config_files[codec_name][0])) for quality in ('hd', 'sd', 'other'): if len(files[quality]) > 0: @@ -173,8 +195,12 @@ class VimeoIE(InfoExtractor): else: raise ExtractorError(u'No known codec found') - video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ - 
%(video_id, sig, timestamp, video_quality, video_codec.upper()) + video_url = None + if isinstance(config_files[video_codec], dict): + video_url = config_files[video_codec][video_quality].get("url") + if video_url is None: + video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ + %(video_id, sig, timestamp, video_quality, video_codec.upper()) return [{ 'id': video_id, diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 7d228edac..29c25f0e3 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -6,7 +6,6 @@ import re from .common import InfoExtractor from ..utils import ( - compat_urllib_parse, unified_strdate, ) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 0f1feeffd..88b8b6be0 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -3,7 +3,8 @@ import re from .common import InfoExtractor from ..utils import ( compat_urllib_parse, - + unescapeHTML, + determine_ext, ExtractorError, ) @@ -36,15 +37,16 @@ class XHamsterIE(InfoExtractor): video_url = compat_urllib_parse.unquote(mobj.group('file')) else: video_url = mobj.group('server')+'/key='+mobj.group('file') - video_extension = video_url.split('.')[-1] video_title = self._html_search_regex(r'<title>(?P<title>.+?) 
- xHamster\.com</title>', webpage, u'title') - # Can't see the description anywhere in the UI - # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)', - # webpage, u'description', fatal=False) - # if video_description: video_description = unescapeHTML(video_description) + # Only a few videos have an description + mobj = re.search('<span>Description: </span>(?P<description>[^<]+)', webpage) + if mobj: + video_description = unescapeHTML(mobj.group('description')) + else: + video_description = None mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage) if mobj: @@ -62,9 +64,9 @@ class XHamsterIE(InfoExtractor): return [{ 'id': video_id, 'url': video_url, - 'ext': video_extension, + 'ext': determine_ext(video_url), 'title': video_title, - # 'description': video_description, + 'description': video_description, 'upload_date': video_upload_date, 'uploader_id': video_uploader_id, 'thumbnail': video_thumbnail diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index d1156bf42..c85fd4b5a 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -12,14 +12,16 @@ from ..utils import ( unescapeHTML, unified_strdate, ) - +from ..aes import ( + aes_decrypt_text +) class YouPornIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)' _TEST = { u'url': u'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', u'file': u'505835.mp4', - u'md5': u'c37ddbaaa39058c76a7e86c6813423c1', + u'md5': u'71ec5fcfddacf80f495efa8b6a8d9a89', u'info_dict': { u"upload_date": u"20101221", u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?", @@ -75,7 +77,15 @@ class YouPornIE(InfoExtractor): # Get all of the links from the page LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">' 
links = re.findall(LINK_RE, download_list_html) - if(len(links) == 0): + + # Get link of hd video if available + mobj = re.search(r'var encryptedQuality720URL = \'(?P<encrypted_video_url>[a-zA-Z0-9+/]+={0,2})\';', webpage) + if mobj != None: + encrypted_video_url = mobj.group(u'encrypted_video_url') + video_url = aes_decrypt_text(encrypted_video_url, video_title, 32).decode('utf-8') + links = [video_url] + links + + if not links: raise ExtractorError(u'ERROR: no known formats available for video') self.to_screen(u'Links found: %d' % len(links)) @@ -112,7 +122,7 @@ class YouPornIE(InfoExtractor): self._print_formats(formats) return - req_format = self._downloader.params.get('format', None) + req_format = self._downloader.params.get('format', 'best') self.to_screen(u'Format: %s' % req_format) if req_format is None or req_format == 'best': diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b3400df0a..11611f10d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -194,7 +194,7 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor): _VALID_URL = r"""^ ( (?:https?://)? # http(s):// (optional) - (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/| + (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/| tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: @@ -205,15 +205,18 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor): (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx) v= ) - )? # optional -> youtube.com/xxxx is OK + )) + |youtu\.be/ # just youtu.be/xxxx + ) )? # all until now is optional -> you can pass the naked ID ([0-9A-Za-z_-]+) # here is it! the YouTube video ID (?(1).+)? 
# if we found the ID, everything can follow $""" _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' # Listed in order of quality - _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13', - '95', '94', '93', '92', '132', '151', + _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13', + # Apple HTTP Live Streaming + '96', '95', '94', '93', '92', '132', '151', # 3D '85', '84', '102', '83', '101', '82', '100', # Dash video @@ -222,8 +225,10 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor): # Dash audio '141', '172', '140', '171', '139', ] - _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13', - '95', '94', '93', '92', '132', '151', + _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13', + # Apple HTTP Live Streaming + '96', '95', '94', '93', '92', '132', '151', + # 3D '85', '102', '84', '101', '83', '100', '82', # Dash video '138', '248', '137', '247', '136', '246', '245', @@ -231,11 +236,18 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor): # Dash audio '172', '141', '171', '140', '139', ] + _video_formats_map = { + 'flv': ['35', '34', '6', '5'], + '3gp': ['36', '17', '13'], + 'mp4': ['38', '37', '22', '18'], + 'webm': ['46', '45', '44', '43'], + } _video_extensions = { '13': '3gp', - '17': 'mp4', + '17': '3gp', '18': 'mp4', '22': 'mp4', + '36': '3gp', '37': 'mp4', '38': 'mp4', '43': 'webm', @@ -252,7 +264,7 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor): '101': 'webm', '102': 'webm', - # videos that use m3u8 + # Apple HTTP Live Streaming '92': 'mp4', '93': 'mp4', '94': 'mp4', @@ -293,6 +305,7 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor): '22': '720x1280', '34': '360x640', '35': '480x854', + '36': '240x320', '37': '1080x1920', '38': '3072x4096', '43': '360x640', @@ -394,7 +407,7 @@ 
class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor): u"info_dict": { u"upload_date": u"20120506", u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]", - u"description": u"md5:b085c9804f5ab69f4adea963a2dceb3c", + u"description": u"md5:3e2666e0a55044490499ea45fe9037b7", u"uploader": u"Icona Pop", u"uploader_id": u"IconaPop" } @@ -432,7 +445,7 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor): @classmethod def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - if YoutubePlaylistIE.suitable(url) or YoutubeSubscriptionsIE.suitable(url): return False + if YoutubePlaylistIE.suitable(url): return False return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def report_video_webpage_download(self, video_id): @@ -465,15 +478,15 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor): elif len(s) == 89: return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1] elif len(s) == 88: - return s[48] + s[81:67:-1] + s[82] + s[66:62:-1] + s[85] + s[61:48:-1] + s[67] + s[47:12:-1] + s[3] + s[11:3:-1] + s[2] + s[12] + return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28] elif len(s) == 87: return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:] elif len(s) == 86: - return s[5:20] + s[2] + s[21:] + return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53] elif len(s) == 85: return s[83:34:-1] + s[0] + s[33:27:-1] + s[3] + s[26:19:-1] + s[34] + s[18:3:-1] + s[27] elif len(s) == 84: - return s[83:27:-1] + s[0] + s[26:5:-1] + s[2:0:-1] + s[27] + return s[81:36:-1] + s[0] + s[35:2:-1] elif len(s) == 83: return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0] elif len(s) == 82: @@ -537,13 +550,25 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor): video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats else: # Specific formats. 
We pick the first in a slash-delimeted sequence. - # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. + # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality + # available in the specified format. For example, + # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. + # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'. + # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'. req_formats = req_format.split('/') video_url_list = None for rf in req_formats: if rf in url_map: video_url_list = [(rf, url_map[rf])] break + if rf in self._video_formats_map: + for srf in self._video_formats_map[rf]: + if srf in url_map: + video_url_list = [(srf, url_map[srf])] + break + else: + continue + break if video_url_list is None: raise ExtractorError(u'requested format not available') return video_url_list @@ -558,7 +583,7 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor): manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest') formats_urls = _get_urls(manifest) for format_url in formats_urls: - itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag') + itag = self._search_regex(r'itag%3D(\d+?)/', format_url, 'itag') url_map[itag] = format_url return url_map @@ -860,8 +885,11 @@ class YoutubePlaylistIE(InfoExtractor): for entry in response['feed']['entry']: index = entry['yt$position']['$t'] - if 'media$group' in entry and 'media$player' in entry['media$group']: - videos.append((index, entry['media$group']['media$player']['url'])) + if 'media$group' in entry and 'yt$videoid' in entry['media$group']: + videos.append(( + index, + 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t'] + )) videos = [v[1] for v in sorted(videos)] @@ -927,13 +955,20 @@ class YoutubeChannelIE(InfoExtractor): class YoutubeUserIE(InfoExtractor): IE_DESC = u'YouTube.com user videos (URL 
or "ytuser" keyword)' - _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)' + _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)' _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s' _GDATA_PAGE_SIZE = 50 - _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d' - _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]' + _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json' IE_NAME = u'youtube:user' + @classmethod + def suitable(cls, url): + # Don't return True if the url can be extracted with other youtube + # extractor, the regex would is too permissive and it would match. + other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls) + if any(ie.suitable(url) for ie in other_ies): return False + else: return super(YoutubeUserIE, cls).suitable(url) + def _real_extract(self, url): # Extract username mobj = re.match(self._VALID_URL, url) @@ -956,13 +991,15 @@ class YoutubeUserIE(InfoExtractor): page = self._download_webpage(gdata_url, username, u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE)) + try: + response = json.loads(page) + except ValueError as err: + raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err)) + # Extract video identifiers ids_in_page = [] - - for mobj in re.finditer(self._VIDEO_INDICATOR, page): - if mobj.group(1) not in ids_in_page: - ids_in_page.append(mobj.group(1)) - + for entry in response['feed']['entry']: + ids_in_page.append(entry['id']['$t'].split('/')[-1]) video_ids.extend(ids_in_page) # A little optimization - if current page is not @@ -1101,7 +1138,7 @@ class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): IE_NAME = u'youtube:favorites' IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword 
(requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:o?rites)?' + _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?' _LOGIN_REQUIRED = True def _real_extract(self, url): |