diff options
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/__init__.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/appletrailers.py | 23 | ||||
-rw-r--r-- | youtube_dl/extractor/cbs.py | 30 | ||||
-rw-r--r-- | youtube_dl/extractor/clipsyndicate.py | 10 | ||||
-rw-r--r-- | youtube_dl/extractor/common.py | 8 | ||||
-rw-r--r-- | youtube_dl/extractor/dailymotion.py | 16 | ||||
-rw-r--r-- | youtube_dl/extractor/daum.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/ign.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/metacritic.py | 9 | ||||
-rw-r--r-- | youtube_dl/extractor/mixcloud.py | 12 | ||||
-rw-r--r-- | youtube_dl/extractor/mtv.py | 9 | ||||
-rw-r--r-- | youtube_dl/extractor/naver.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/pornhd.py | 38 | ||||
-rw-r--r-- | youtube_dl/extractor/pornhub.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/soundcloud.py | 17 | ||||
-rw-r--r-- | youtube_dl/extractor/theplatform.py | 12 | ||||
-rw-r--r-- | youtube_dl/extractor/vimeo.py | 12 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube.py | 4 | ||||
-rw-r--r-- | youtube_dl/extractor/zdf.py | 4 |
19 files changed, 164 insertions, 50 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index fb206a742..f01fa2cde 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -20,6 +20,7 @@ from .brightcove import BrightcoveIE from .c56 import C56IE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE +from .cbs import CBSIE from .channel9 import Channel9IE from .cinemassacre import CinemassacreIE from .clipfish import ClipfishIE @@ -112,6 +113,7 @@ from .orf import ORFIE from .pbs import PBSIE from .photobucket import PhotobucketIE from .podomatic import PodomaticIE +from .pornhd import PornHdIE from .pornhub import PornHubIE from .pornotube import PornotubeIE from .pyvideo import PyvideoIE diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index a527f10de..ef5644aa5 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree import json from .common import InfoExtractor @@ -65,18 +64,18 @@ class AppleTrailersIE(InfoExtractor): uploader_id = mobj.group('company') playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc') - playlist_snippet = self._download_webpage(playlist_url, movie) - playlist_cleaned = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', playlist_snippet) - playlist_cleaned = re.sub(r'<img ([^<]*?)>', r'<img \1/>', playlist_cleaned) - # The ' in the onClick attributes are not escaped, it couldn't be parsed - # with xml.etree.ElementTree.fromstring - # like: http://trailers.apple.com/trailers/wb/gravity/ - def _clean_json(m): - return u'iTunes.playURL(%s);' % m.group(1).replace('\'', ''') - playlist_cleaned = re.sub(self._JSON_RE, _clean_json, playlist_cleaned) - playlist_html = u'<html>' + playlist_cleaned + u'</html>' + def fix_html(s): + s = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', s) + s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s) + # The ' in the onClick attributes are not escaped, it couldn't be parsed + # like: http://trailers.apple.com/trailers/wb/gravity/ + def _clean_json(m): + return u'iTunes.playURL(%s);' % m.group(1).replace('\'', ''') + s = re.sub(self._JSON_RE, _clean_json, s) + s = u'<html>' + s + u'</html>' + return s + doc = self._download_xml(playlist_url, movie, transform_source=fix_html) - doc = xml.etree.ElementTree.fromstring(playlist_html) playlist = [] for li in doc.findall('./div/ul/li'): on_click = li.find('.//a').attrib['onClick'] diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py new file mode 100644 index 000000000..ac0315853 --- /dev/null +++ b/youtube_dl/extractor/cbs.py @@ -0,0 +1,30 @@ +import re + +from .common import InfoExtractor + + +class CBSIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cbs\.com/shows/[^/]+/video/(?P<id>[^/]+)/.*' + + _TEST = { + u'url': u'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', + u'file': u'4JUVEwq3wUT7.flv', + u'info_dict': { + u'title': u'Connect Chat feat. Garth Brooks', + u'description': u'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', + u'duration': 1495, + }, + u'params': { + # rtmp download + u'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + real_id = self._search_regex( + r"video\.settings\.pid\s*=\s*'([^']+)';", + webpage, u'real video ID') + return self.url_result(u'theplatform:%s' % real_id) diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py index d4fc86973..c60089ad3 100644 --- a/youtube_dl/extractor/clipsyndicate.py +++ b/youtube_dl/extractor/clipsyndicate.py @@ -1,9 +1,9 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( find_xpath_attr, + fix_xml_all_ampersand, ) @@ -30,12 +30,10 @@ class ClipsyndicateIE(InfoExtractor): # it includes a required token flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars') - playlist_page = self._download_webpage( + pdoc = self._download_xml( 'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars, - video_id, u'Downloading video info') - # Fix broken xml - playlist_page = re.sub('&', '&', playlist_page) - pdoc = xml.etree.ElementTree.fromstring(playlist_page.encode('utf-8')) + video_id, u'Downloading video info', + transform_source=fix_xml_all_ampersand) track_doc = pdoc.find('trackList/track') def find_param(name): diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 534908a2b..fe8ce9e6c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -34,8 +34,8 @@ class InfoExtractor(object): The dictionaries must include the following fields: id: Video identifier. - url: Final video URL. title: Video title, unescaped. + url: Final video URL. ext: Video filename extension. Instead of url and ext, formats can also specified. @@ -54,6 +54,7 @@ class InfoExtractor(object): player_url: SWF Player URL (used for rtmpdump). subtitles: The subtitle file contents as a dictionary in the format {language: subtitles}. + duration: Length of the video in seconds, as an integer. view_count: How many users have watched the video on the platform. like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video @@ -230,9 +231,12 @@ class InfoExtractor(object): return content def _download_xml(self, url_or_request, video_id, - note=u'Downloading XML', errnote=u'Unable to download XML'): + note=u'Downloading XML', errnote=u'Unable to download XML', + transform_source=None): """Return the xml as an xml.etree.ElementTree.Element""" xml_string = self._download_webpage(url_or_request, video_id, note, errnote) + if transform_source: + xml_string = transform_source(xml_string) return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) def to_screen(self, msg): diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 3bd0b862c..6685c94a3 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -28,7 +28,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor): class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): """Information Extractor for Dailymotion""" - _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)' + _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P<id>[^/?_]+)' IE_NAME = u'dailymotion' _FORMATS = [ @@ -81,7 +81,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): # Extract id and simplified title from URL mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1).split('_')[0].split('?')[0] + video_id = mobj.group('id') url = 'http://www.dailymotion.com/video/%s' % video_id @@ -101,10 +101,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): self.to_screen(u'Vevo video detected: %s' % vevo_id) return self.url_result(u'vevo:%s' % vevo_id, ie='Vevo') - video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', - # Looking for official user - r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'], - webpage, 'video uploader', fatal=False) age_limit = self._rta_search(webpage) video_upload_date = None @@ -147,13 +143,15 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): self._list_available_subtitles(video_id, webpage) return - view_count = str_to_int(self._search_regex( - r'video_views_value[^>]+>([\d\.,]+)<', webpage, u'view count')) + view_count = self._search_regex( + r'video_views_count[^>]+>\s+([\d\.,]+)', webpage, u'view count', fatal=False) + if view_count is not None: + view_count = str_to_int(view_count) return { 'id': video_id, 'formats': formats, - 'uploader': video_uploader, + 'uploader': info['owner_screenname'], 'upload_date': video_upload_date, 'title': self._og_search_title(webpage), 'subtitles': video_subtitles, diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index d418ce4a8..4876ecb48 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -9,7 +9,7 @@ from ..utils import ( class DaumIE(InfoExtractor): - _VALID_URL = r'https?://tvpot\.daum\.net/.*?clipid=(?P<id>\d+)' + _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/.*?clipid=(?P<id>\d+)' IE_NAME = u'daum.net' _TEST = { diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index 57b79a336..381af91e4 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -44,7 +44,7 @@ class IGNIE(InfoExtractor): { u'file': u'638672ee848ae4ff108df2a296418ee2.mp4', u'info_dict': { - u'title': u'GTA 5\'s Twisted Beauty in Super Slow Motion', + u'title': u'26 Twisted Moments from GTA 5 in Slow Motion', u'description': u'The twisted beauty of GTA 5 in stunning slow motion.', }, }, diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py index 6b95b4998..e560c1d35 100644 --- a/youtube_dl/extractor/metacritic.py +++ b/youtube_dl/extractor/metacritic.py @@ -1,8 +1,10 @@ import re -import xml.etree.ElementTree import operator from .common import InfoExtractor +from ..utils import ( + fix_xml_all_ampersand, +) class MetacriticIE(InfoExtractor): @@ -23,9 +25,8 @@ class MetacriticIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) # The xml is not well formatted, there are raw '&' - info_xml = self._download_webpage('http://www.metacritic.com/video_data?video=' + video_id, - video_id, u'Downloading info xml').replace('&', '&') - info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) + info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id, + video_id, u'Downloading info xml', transform_source=fix_xml_all_ampersand) clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id) formats = [] diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 04fa3ac7a..125d81551 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -37,6 +37,9 @@ class MixcloudIE(InfoExtractor): return None + def _get_url(self, template_url): + return self.check_urls(template_url % i for i in range(30)) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -52,13 +55,18 @@ class MixcloudIE(InfoExtractor): preview_url = self._search_regex(r'data-preview-url="(.+?)"', webpage, u'preview url') song_url = preview_url.replace('/previews/', '/cloudcasts/originals/') template_url = re.sub(r'(stream\d*)', 'stream%d', song_url) - final_song_url = self.check_urls(template_url % i for i in range(30)) + final_song_url = self._get_url(template_url) + if final_song_url is None: + self.to_screen('Trying with m4a extension') + template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/') + final_song_url = self._get_url(template_url) + if final_song_url is None: + raise ExtractorError(u'Unable to extract track url') return { 'id': track_id, 'title': info['name'], 'url': final_song_url, - 'ext': 'mp3', 'description': info.get('description'), 'thumbnail': info['pictures'].get('extra_large'), 'uploader': info['user']['name'], diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 6b3feb560..5b2bd9633 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -82,8 +82,13 @@ class MTVServicesInfoExtractor(InfoExtractor): def _get_videos_info(self, uri): video_id = self._id_from_uri(uri) data = compat_urllib_parse.urlencode({'uri': uri}) - idoc = self._download_xml(self._FEED_URL +'?' + data, video_id, - u'Downloading info') + + def fix_ampersand(s): + """ Fix unencoded ampersand in XML """ + return s.replace(u'& ', '& ') + idoc = self._download_xml( + self._FEED_URL + '?' + data, video_id, + u'Downloading info', transform_source=fix_ampersand) return [self._get_video_info(item) for item in idoc.findall('.//item')] diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index c012ec0cf..4cab30631 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -9,7 +9,7 @@ from ..utils import ( class NaverIE(InfoExtractor): - _VALID_URL = r'https?://tvcast\.naver\.com/v/(?P<id>\d+)' + _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)' _TEST = { u'url': u'http://tvcast.naver.com/v/81652', diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py new file mode 100644 index 000000000..71abd5013 --- /dev/null +++ b/youtube_dl/extractor/pornhd.py @@ -0,0 +1,38 @@ +import re + +from .common import InfoExtractor +from ..utils import compat_urllib_parse + + +class PornHdIE(InfoExtractor): + _VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)' + _TEST = { + u'url': u'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', + u'file': u'1962.flv', + u'md5': u'35272469887dca97abd30abecc6cdf75', + u'info_dict': { + u"title": u"sierra-day-gets-his-cum-all-over-herself-hd-porn-video", + u"age_limit": 18, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('video_id') + video_title = mobj.group('video_title') + + webpage = self._download_webpage(url, video_id) + + video_url = self._html_search_regex( + r'&hd=(http.+?)&', webpage, u'video URL') + video_url = compat_urllib_parse.unquote(video_url) + age_limit = 18 + + return { + 'id': video_id, + 'url': video_url, + 'ext': 'flv', + 'title': video_title, + 'age_limit': age_limit, + } diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 8b3471919..d9135c6b9 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -12,7 +12,7 @@ from ..aes import ( ) class PornHubIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>pornhub\.com/view_video\.php\?viewkey=(?P<videoid>[0-9]+))' + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>pornhub\.com/view_video\.php\?viewkey=(?P<videoid>[0-9a-f]+))' _TEST = { u'url': u'http://www.pornhub.com/view_video.php?viewkey=648719015', u'file': u'648719015.mp4', diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 5c026c0b8..cbba4094b 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -73,6 +73,19 @@ class SoundcloudIE(InfoExtractor): u'upload_date': u'20131209', }, }, + # downloadable song + { + u'url': u'https://soundcloud.com/simgretina/just-your-problem-baby-1', + u'md5': u'56a8b69568acaa967b4c49f9d1d52d19', + u'info_dict': { + u'id': u'105614606', + u'ext': u'wav', + u'title': u'Just Your Problem Baby (Acapella)', + u'description': u'Vocals', + u'uploader': u'Sim Gretina', + u'upload_date': u'20130815', + }, + }, ] _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28' @@ -99,7 +112,7 @@ class SoundcloudIE(InfoExtractor): thumbnail = info['artwork_url'] if thumbnail is not None: thumbnail = thumbnail.replace('-large', '-t500x500') - ext = info.get('original_format', u'mp3') + ext = u'mp3' result = { 'id': track_id, 'uploader': info['user']['username'], @@ -115,7 +128,7 @@ class SoundcloudIE(InfoExtractor): track_id, self._CLIENT_ID)) result['formats'] = [{ 'format_id': 'download', - 'ext': ext, + 'ext': info.get('original_format', u'mp3'), 'url': format_url, 'vcodec': 'none', }] diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 61452e47d..cec65261b 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -3,6 +3,7 @@ import json from .common import InfoExtractor from ..utils import ( + ExtractorError, xpath_with_ns, ) @@ -32,6 +33,17 @@ class ThePlatformIE(InfoExtractor): smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?' 'format=smil&mbr=true'.format(video_id)) meta = self._download_xml(smil_url, video_id) + + try: + error_msg = next( + n.attrib['abstract'] + for n in meta.findall(_x('.//smil:ref')) + if n.attrib.get('title') == u'Geographic Restriction') + except StopIteration: + pass + else: + raise ExtractorError(error_msg, expected=True) + info_url = 'http://link.theplatform.com/s/dJ5BDC/{0}?format=preview'.format(video_id) info_json = self._download_webpage(info_url, video_id) info = json.loads(info_json) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index fb2bd225a..ea4409528 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -115,7 +115,7 @@ class VimeoIE(InfoExtractor): def _real_initialize(self): self._login() - def _real_extract(self, url, new_video=True): + def _real_extract(self, url): url, data = unsmuggle_url(url) headers = std_headers if data is not None: @@ -151,8 +151,14 @@ class VimeoIE(InfoExtractor): config = json.loads(config_json) except RegexNotFoundError: # For pro videos or player.vimeo.com urls - config = self._search_regex([r' = {config:({.+?}),assets:', r'(?:c|b)=({.+?});'], - webpage, u'info section', flags=re.DOTALL) + # We try to find out to which variable is assigned the config dic + m_variable_name = re.search('(\w)\.video\.id', webpage) + if m_variable_name is not None: + config_re = r'%s=({.+?});' % re.escape(m_variable_name.group(1)) + else: + config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});'] + config = self._search_regex(config_re, webpage, u'info section', + flags=re.DOTALL) config = json.loads(config) except Exception as e: if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 874429b78..a68a214ca 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1377,9 +1377,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if 'length_seconds' not in video_info: self._downloader.report_warning(u'unable to extract video duration') - video_duration = '' + video_duration = None else: - video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) + video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])) # annotations video_annotations = None diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 689f19735..35ece354a 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -73,14 +73,14 @@ class ZDFIE(InfoExtractor): try: proto_pref = -PROTO_ORDER.index(format_m.group('proto')) except ValueError: - proto_pref = 999 + proto_pref = -999 quality = fnode.find('./quality').text QUALITY_ORDER = ['veryhigh', '300', 'high', 'med', 'low'] try: quality_pref = -QUALITY_ORDER.index(quality) except ValueError: - quality_pref = 999 + quality_pref = -999 abr = int(fnode.find('./audioBitrate').text) // 1000 vbr = int(fnode.find('./videoBitrate').text) // 1000 |