diff options
author | Allan Zhou <allanzp@gmail.com> | 2013-08-26 15:16:13 -0700 |
---|---|---|
committer | Allan Zhou <allanzp@gmail.com> | 2013-08-26 15:16:13 -0700 |
commit | 99859d436cdee9acc9c869254e734eba5b748260 (patch) | |
tree | 5d3a425aa0e6fdc65890c46713ce901fa5b5489b /youtube_dl/extractor | |
parent | 39c6f507df5f69e5d9b41b054205ec310f6427a5 (diff) | |
parent | 1b01e2b085987b06bd7b360d779a6cb537d4752c (diff) |
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/__init__.py | 4 | ||||
-rw-r--r-- | youtube_dl/extractor/c56.py | 4 | ||||
-rw-r--r-- | youtube_dl/extractor/dailymotion.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/generic.py | 10 | ||||
-rw-r--r-- | youtube_dl/extractor/hark.py | 35 | ||||
-rw-r--r-- | youtube_dl/extractor/ro220.py | 42 | ||||
-rw-r--r-- | youtube_dl/extractor/rtlnow.py | 17 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube.py | 2 |
8 files changed, 108 insertions, 8 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index b4db8f0bf..f71ae2713 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -29,6 +29,7 @@ from .gametrailers import GametrailersIE from .generic import GenericIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE +from .hark import HarkIE from .hotnewhiphop import HotNewHipHopIE from .howcast import HowcastIE from .hypem import HypemIE @@ -57,6 +58,7 @@ from .pornotube import PornotubeIE from .rbmaradio import RBMARadioIE from .redtube import RedTubeIE from .ringtv import RingTVIE +from .ro220 import Ro220IE from .roxwel import RoxwelIE from .rtlnow import RTLnowIE from .sina import SinaIE @@ -116,12 +118,14 @@ _ALL_CLASSES = [ ] _ALL_CLASSES.append(GenericIE) + def gen_extractors(): """ Return a list of an instance of every supported extractor. The order does matter; the first extractor matched is the one handling the URL. """ return [klass() for klass in _ALL_CLASSES] + def get_info_extractor(ie_name): """Returns the info extractor class with the given ie_name""" return globals()[ie_name+'IE'] diff --git a/youtube_dl/extractor/c56.py b/youtube_dl/extractor/c56.py index 4c8a8af09..dc3a8d47d 100644 --- a/youtube_dl/extractor/c56.py +++ b/youtube_dl/extractor/c56.py @@ -12,8 +12,8 @@ class C56IE(InfoExtractor): _TEST ={ u'url': u'http://www.56.com/u39/v_OTM0NDA3MTY.html', - u'file': u'93440716.mp4', - u'md5': u'9dc07b5c8e978112a6441f9e75d2b59e', + u'file': u'93440716.flv', + u'md5': u'e59995ac63d0457783ea05f93f12a866', u'info_dict': { u'title': u'网事知多少 第32期:车怒', }, diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index fa8c630d0..1ea449ca8 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -21,7 +21,7 @@ class DailymotionIE(InfoExtractor): u'file': u'x33vw9.mp4', u'md5': u'392c4b85a60a90dc4792da41ce3144eb', u'info_dict': { - u"uploader": u"Alex and Van .", + u"uploader": u"Amphora Alex and Van .", u"title": u"Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\"" } } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index da016f7ee..d034a11bb 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -7,12 +7,14 @@ from .common import InfoExtractor from ..utils import ( compat_urllib_error, compat_urllib_parse, + compat_urllib_parse_urlparse, compat_urllib_request, ExtractorError, ) from .brightcove import BrightcoveIE + class GenericIE(InfoExtractor): IE_DESC = u'Generic downloader that works on some sites' _VALID_URL = r'.*' @@ -23,7 +25,7 @@ class GenericIE(InfoExtractor): u'file': u'13601338388002.mp4', u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89', u'info_dict': { - u"uploader": u"www.hodiho.fr", + u"uploader": u"www.hodiho.fr", u"title": u"R\u00e9gis plante sa Jeep" } }, @@ -124,7 +126,7 @@ class GenericIE(InfoExtractor): raise ExtractorError(u'Invalid URL: %s' % url) self.report_extraction(video_id) - # Look for BrigthCove: + # Look for BrightCove: m_brightcove = re.search(r'<object.+?class=([\'"]).*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL) if m_brightcove is not None: self.to_screen(u'Brightcove video detected.') @@ -161,6 +163,10 @@ class GenericIE(InfoExtractor): raise ExtractorError(u'Invalid URL: %s' % url) video_url = compat_urllib_parse.unquote(mobj.group(1)) + if video_url.startswith('//'): + video_url = compat_urllib_parse_urlparse(url).scheme + ':' + video_url + if '://' not in video_url: + video_url = url + ('' if url.endswith('/') else '/') + video_url video_id = os.path.basename(video_url) # here's a fun little line of code for you: diff --git a/youtube_dl/extractor/hark.py b/youtube_dl/extractor/hark.py new file mode 100644 index 000000000..ab0a69697 --- /dev/null +++ b/youtube_dl/extractor/hark.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- + +import re + +from .common import InfoExtractor +from ..utils import determine_ext + +class HarkIE(InfoExtractor): + _VALID_URL = r'https?://www\.hark\.com/clips/(.+?)-.+' + _TEST = { + u'url': u'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013', + u'file': u'mmbzyhkgny.mp3', + u'md5': u'6783a58491b47b92c7c1af5a77d4cbee', + u'info_dict': { + u"title": u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' On May 23, 2013 ", + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + embed_url = "http://www.hark.com/clips/%s/homepage_embed" %(video_id) + webpage = self._download_webpage(embed_url, video_id) + + final_url = self._search_regex(r'src="(.+?).mp3"', + webpage, 'video url')+'.mp3' + title = self._html_search_regex(r'<title>(.+?)</title>', + webpage, 'video title').replace(' Sound Clip and Quote - Hark','').replace( + 'Sound Clip , Quote, MP3, and Ringtone - Hark','') + + return {'id': video_id, + 'url' : final_url, + 'title': title, + 'ext': determine_ext(final_url), + } diff --git a/youtube_dl/extractor/ro220.py b/youtube_dl/extractor/ro220.py new file mode 100644 index 000000000..c32f64d99 --- /dev/null +++ b/youtube_dl/extractor/ro220.py @@ -0,0 +1,42 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + compat_parse_qs, +) + + +class Ro220IE(InfoExtractor): + IE_NAME = '220.ro' + _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?220\.ro/(?P<category>[^/]+)/(?P<shorttitle>[^/]+)/(?P<video_id>[^/]+)' + _TEST = { + u"url": u"http://www.220.ro/sport/Luati-Le-Banii-Sez-4-Ep-1/LYV6doKo7f/", + u'file': u'LYV6doKo7f.mp4', + u'md5': u'03af18b73a07b4088753930db7a34add', + u'info_dict': { + u"title": u"Luati-le Banii sez 4 ep 1", + u"description": u"Iata-ne reveniti dupa o binemeritata vacanta. Va astept si pe Facebook cu pareri si comentarii.", + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + + webpage = self._download_webpage(url, video_id) + flashVars_str = self._search_regex( + r'<param name="flashVars" value="([^"]+)"', + webpage, u'flashVars') + flashVars = compat_parse_qs(flashVars_str) + + info = { + '_type': 'video', + 'id': video_id, + 'ext': 'mp4', + 'url': flashVars['videoURL'][0], + 'title': flashVars['title'][0], + 'description': clean_html(flashVars['desc'][0]), + 'thumbnail': flashVars['preview'][0], + } + return info diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py index 2f134e6a7..7bb236c2b 100644 --- a/youtube_dl/extractor/rtlnow.py +++ b/youtube_dl/extractor/rtlnow.py @@ -8,8 +8,8 @@ from ..utils import ( ) class RTLnowIE(InfoExtractor): - """Information Extractor for RTLnow, RTL2now and VOXnow""" - _VALID_URL = r'(?:http://)?(?P<url>(?P<base_url>rtl(?:(?P<is_rtl2>2)|-)now\.rtl(?(is_rtl2)2|)\.de/|(?:www\.)?voxnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' + """Information Extractor for RTL NOW, RTL2 NOW, SUPER RTL NOW and VOX NOW""" + _VALID_URL = r'(?:http://)?(?P<url>(?P<base_url>rtl-now\.rtl\.de/|rtl2now\.rtl2\.de/|(?:www\.)?voxnow\.de/|(?:www\.)?superrtlnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' _TESTS = [{ u'url': u'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1', u'file': u'90419.flv', @@ -48,6 +48,19 @@ class RTLnowIE(InfoExtractor): u'params': { u'skip_download': True, }, + }, + { + u'url': u'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1', + u'file': u'99205.flv', + u'info_dict': { + u'upload_date': u'20080928', + u'title': u'Medicopter 117 - Angst!', + u'description': u'Angst!', + u'thumbnail': u'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg' + }, + u'params': { + u'skip_download': True, + }, }] def _real_extract(self,url): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e4987b2b3..af01c9da0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -427,7 +427,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): elif len(s) == 85: return s[83:34:-1] + s[0] + s[33:27:-1] + s[3] + s[26:19:-1] + s[34] + s[18:3:-1] + s[27] elif len(s) == 84: - return s[83:27:-1] + s[0] + s[26:5:-1] + s[2:0:-1] + s[27] + return s[5:40] + s[3] + s[41:48] + s[0] + s[49:84] elif len(s) == 83: return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0] elif len(s) == 82: |