diff options
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/__init__.py | 4 | ||||
-rw-r--r-- | youtube_dl/extractor/cliphunter.py | 5 | ||||
-rw-r--r-- | youtube_dl/extractor/collegehumor.py | 28 | ||||
-rw-r--r-- | youtube_dl/extractor/common.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/cspan.py | 45 | ||||
-rw-r--r-- | youtube_dl/extractor/infoq.py | 8 | ||||
-rw-r--r-- | youtube_dl/extractor/lifenews.py | 63 | ||||
-rw-r--r-- | youtube_dl/extractor/liveleak.py | 3 | ||||
-rw-r--r-- | youtube_dl/extractor/mtv.py | 4 | ||||
-rw-r--r-- | youtube_dl/extractor/newgrounds.py | 10 | ||||
-rw-r--r-- | youtube_dl/extractor/normalboots.py | 61 | ||||
-rw-r--r-- | youtube_dl/extractor/rbmaradio.py | 6 | ||||
-rw-r--r-- | youtube_dl/extractor/ro220.py | 19 | ||||
-rw-r--r-- | youtube_dl/extractor/southparkstudios.py | 32 | ||||
-rw-r--r-- | youtube_dl/extractor/spiegel.py | 37 | ||||
-rw-r--r-- | youtube_dl/extractor/tinypic.py | 50 | ||||
-rw-r--r-- | youtube_dl/extractor/traileraddict.py | 51 | ||||
-rw-r--r-- | youtube_dl/extractor/ustream.py | 48 | ||||
-rw-r--r-- | youtube_dl/extractor/vevo.py | 40 | ||||
-rw-r--r-- | youtube_dl/extractor/vube.py | 80 | ||||
-rw-r--r-- | youtube_dl/extractor/xhamster.py | 1 |
21 files changed, 449 insertions, 148 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index e89b5cf9d..073f3a0d1 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -110,6 +110,7 @@ from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE from .keek import KeekIE from .la7 import LA7IE +from .lifenews import LifeNewsIE from .liveleak import LiveLeakIE from .livestream import LivestreamIE, LivestreamOriginalIE from .lynda import ( @@ -141,6 +142,7 @@ from .newgrounds import NewgroundsIE from .nhl import NHLIE, NHLVideocenterIE from .niconico import NiconicoIE from .ninegag import NineGagIE +from .normalboots import NormalbootsIE from .novamov import NovamovIE from .nowness import NownessIE from .nowvideo import NowVideoIE @@ -198,6 +200,7 @@ from .ted import TEDIE from .tf1 import TF1IE from .theplatform import ThePlatformIE from .thisav import ThisAVIE +from .tinypic import TinyPicIE from .toutv import TouTvIE from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE @@ -228,6 +231,7 @@ from .vimeo import ( from .vine import VineIE from .viki import VikiIE from .vk import VKIE +from .vube import VubeIE from .wat import WatIE from .weibo import WeiboIE from .wimp import WimpIE diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index d891fa301..58846e8e7 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -1,12 +1,9 @@ from __future__ import unicode_literals import re -import string from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) + translation_table = { 'a': 'h', 'd': 'e', 'e': 'v', 'f': 'o', 'g': 'f', 'i': 'd', 'l': 'n', diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py index d10b7bd0c..2b4bf34c9 100644 --- a/youtube_dl/extractor/collegehumor.py +++ b/youtube_dl/extractor/collegehumor.py @@ -28,7 +28,25 @@ class CollegeHumorIE(InfoExtractor): 'description': 'This video wasn\'t long enough, so we made it double-spaced.', 'age_limit': 10, }, - }] + }, + # embedded youtube video + { + 'url': 'http://www.collegehumor.com/embed/6950457', + 'info_dict': { + 'id': 'W5gMp3ZjYg4', + 'ext': 'mp4', + 'title': 'Funny Dogs Protecting Babies Compilation 2014 [NEW HD]', + 'uploader': 'Funnyplox TV', + 'uploader_id': 'funnyploxtv', + 'description': 'md5:506f69f7a297ed698ced3375f2363b0e', + 'upload_date': '20140128', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Youtube'], + }, + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -38,6 +56,12 @@ class CollegeHumorIE(InfoExtractor): data = json.loads(self._download_webpage( jsonUrl, video_id, 'Downloading info JSON')) vdata = data['video'] + if vdata.get('youtubeId') is not None: + return { + '_type': 'url', + 'url': vdata['youtubeId'], + 'ie_key': 'Youtube', + } AGE_LIMITS = {'nc17': 18, 'r': 18, 'pg13': 13, 'pg': 10, 'g': 0} rating = vdata.get('rating') @@ -49,7 +73,7 @@ class CollegeHumorIE(InfoExtractor): PREFS = {'high_quality': 2, 'low_quality': 0} formats = [] for format_key in ('mp4', 'webm'): - for qname, qurl in vdata[format_key].items(): + for qname, qurl in vdata.get(format_key, {}).items(): formats.append({ 'format_id': format_key + '_' + qname, 'url': qurl, diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 70ba9eaba..2c0c75604 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -399,7 +399,7 @@ class InfoExtractor(object): # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')' + content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')' property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop) template = r'<meta[^>]+?%s[^>]+?%s' return [ diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index e54009622..d65046f58 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -1,49 +1,60 @@ from __future__ import unicode_literals -import json import re from .common import InfoExtractor from ..utils import ( unescapeHTML, + find_xpath_attr, ) class CSpanIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?c-spanvideo\.org/program/(?P<name>.*)' + _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P<id>\d+)' IE_DESC = 'C-SPAN' _TEST = { - 'url': 'http://www.c-spanvideo.org/program/HolderonV', - 'file': '315139.mp4', + 'url': 'http://www.c-span.org/video/?313572-1/HolderonV', 'md5': '8e44ce11f0f725527daccc453f553eb0', 'info_dict': { + 'id': '315139', + 'ext': 'mp4', 'title': 'Attorney General Eric Holder on Voting Rights Act Decision', - 'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in [Shelby County v. Holder] in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.', + 'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in Shelby County v. Holder in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.', }, 'skip': 'Regularly fails on travis, for unknown reasons', } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - prog_name = mobj.group('name') - webpage = self._download_webpage(url, prog_name) - video_id = self._search_regex(r'prog(?:ram)?id=(.*?)&', webpage, 'video id') - - title = self._html_search_regex( - r'<!-- title -->\n\s*<h1[^>]*>(.*?)</h1>', webpage, 'title') - description = self._og_search_description(webpage) + page_id = mobj.group('id') + webpage = self._download_webpage(url, page_id) + video_id = self._search_regex(r'data-progid=\'(\d+)\'>', webpage, 'video id') + + description = self._html_search_regex( + [ + # The full description + r'<div class=\'expandable\'>(.*?)<a href=\'#\'', + # If the description is small enough the other div is not + # present, otherwise this is a stripped version + r'<p class=\'initial\'>(.*?)</p>' + ], + webpage, 'description', flags=re.DOTALL) info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id - data_json = self._download_webpage( - info_url, video_id, 'Downloading video info') - data = json.loads(data_json) + data = self._download_json(info_url, video_id) url = unescapeHTML(data['video']['files'][0]['path']['#text']) + doc = self._download_xml('http://www.c-span.org/common/services/flashXml.php?programid=' + video_id, + video_id) + + def find_string(s): + return find_xpath_attr(doc, './/string', 'name', s).text + return { 'id': video_id, - 'title': title, + 'title': find_string('title'), 'url': url, 'description': description, - 'thumbnail': self._og_search_thumbnail(webpage), + 'thumbnail': find_string('poster'), } diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index 7c208b85d..ed32373a1 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -10,7 +10,7 @@ from ..utils import ( class InfoQIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$' + _VALID_URL = r'https?://(?:www\.)?infoq\.com/[^/]+/(?P<id>[^/]+)$' _TEST = { "name": "InfoQ", "url": "http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things", @@ -26,9 +26,9 @@ class InfoQIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id=url) - self.report_extraction(url) + webpage = self._download_webpage(url, video_id) # Extract video URL encoded_id = self._search_regex(r"jsclassref ?= ?'([^']*)'", webpage, 'encoded id') @@ -50,6 +50,6 @@ class InfoQIE(InfoExtractor): 'id': video_id, 'url': video_url, 'title': video_title, - 'ext': extension, # Extension is always(?) mp4, but seems to be flv + 'ext': extension, # Extension is always(?) mp4, but seems to be flv 'description': video_description, } diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py new file mode 100644 index 000000000..4e4035b76 --- /dev/null +++ b/youtube_dl/extractor/lifenews.py @@ -0,0 +1,63 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import unified_strdate + + +class LifeNewsIE(InfoExtractor): + IE_NAME = 'lifenews' + IE_DESC = 'LIFE | NEWS' + _VALID_URL = r'http://lifenews\.ru/(?:mobile/)?news/(?P<id>\d+)' + + _TEST = { + 'url': 'http://lifenews.ru/news/126342', + 'file': '126342.mp4', + 'md5': 'e1b50a5c5fb98a6a544250f2e0db570a', + 'info_dict': { + 'title': 'МВД разыскивает мужчин, оставивших в IKEA сумку с автоматом', + 'description': 'Камеры наблюдения гипермаркета зафиксировали троих мужчин, спрятавших оружейный арсенал в камере хранения.', + 'thumbnail': 'http://lifenews.ru/static/posts/2014/1/126342/.video.jpg', + 'upload_date': '20140130', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage('http://lifenews.ru/mobile/news/%s' % video_id, video_id, 'Downloading page') + + video_url = self._html_search_regex( + r'<video.*?src="([^"]+)"></video>', webpage, 'video URL') + + thumbnail = self._html_search_regex( + r'<video.*?poster="([^"]+)".*?"></video>', webpage, 'video thumbnail') + + title = self._og_search_title(webpage) + TITLE_SUFFIX = ' - Первый по срочным новостям — LIFE | NEWS' + if title.endswith(TITLE_SUFFIX): + title = title[:-len(TITLE_SUFFIX)] + + description = self._og_search_description(webpage) + + view_count = self._html_search_regex( + r'<div class=\'views\'>(\d+)</div>', webpage, 'view count') + comment_count = self._html_search_regex( + r'<div class=\'comments\'>(\d+)</div>', webpage, 'comment count') + + upload_date = self._html_search_regex( + r'<time datetime=\'([^\']+)\'>', webpage, 'upload date') + + return { + 'id': video_id, + 'url': video_url, + 'thumbnail': thumbnail, + 'title': title, + 'description': description, + 'view_count': view_count, + 'comment_count': comment_count, + 'upload_date': unified_strdate(upload_date), + }
\ No newline at end of file diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 4e76c1f4a..0a700d663 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -4,9 +4,6 @@ import json import re from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) class LiveLeakIE(InfoExtractor): diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index f6f31bfdc..4521451ac 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -119,7 +119,9 @@ class MTVServicesInfoExtractor(InfoExtractor): if mgid.endswith('.swf'): mgid = mgid[:-4] except RegexNotFoundError: - mgid = self._search_regex(r'data-mgid="(.*?)"', webpage, u'mgid') + mgid = self._search_regex( + [r'data-mgid="(.*?)"', r'swfobject.embedSWF\(".*?(mgid:.*?)"'], + webpage, u'mgid') return self._get_videos_info(mgid) diff --git a/youtube_dl/extractor/newgrounds.py b/youtube_dl/extractor/newgrounds.py index 5cb83ba14..2e72e8915 100644 --- a/youtube_dl/extractor/newgrounds.py +++ b/youtube_dl/extractor/newgrounds.py @@ -4,18 +4,18 @@ import json import re from .common import InfoExtractor -from ..utils import determine_ext class NewgroundsIE(InfoExtractor): - _VALID_URL = r'(?:https?://)?(?:www\.)?newgrounds\.com/audio/listen/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/audio/listen/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.newgrounds.com/audio/listen/549479', - 'file': '549479.mp3', 'md5': 'fe6033d297591288fa1c1f780386f07a', 'info_dict': { - "title": "B7 - BusMode", - "uploader": "Burn7", + 'id': '549479', + 'ext': 'mp3', + 'title': 'B7 - BusMode', + 'uploader': 'Burn7', } } diff --git a/youtube_dl/extractor/normalboots.py b/youtube_dl/extractor/normalboots.py new file mode 100644 index 000000000..81b7855b0 --- /dev/null +++ b/youtube_dl/extractor/normalboots.py @@ -0,0 +1,61 @@ +import re + +from .common import InfoExtractor + +from ..utils import ( + ExtractorError, + unified_strdate, +) + +class NormalbootsIE(InfoExtractor): + _VALID_URL = r'(?:http://)?(?:www\.)?normalboots\.com/video/(?P<videoid>[0-9a-z-]*)/?$' + _TEST = { + u'url': u'http://normalboots.com/video/home-alone-games-jontron/', + u'file': u'home-alone-games-jontron.mp4', + u'md5': u'8bf6de238915dd501105b44ef5f1e0f6', + u'info_dict': { + u'title': u'Home Alone Games - JonTron - NormalBoots', + u'description': u'Jon is late for Christmas. Typical. Thanks to: Paul Ritchey for Co-Writing/Filming: http://www.youtube.com/user/ContinueShow Michael Azzi for Christmas Intro Animation: http://michafrar.tumblr.com/ Jerrod Waters for Christmas Intro Music: http://www.youtube.com/user/xXJerryTerryXx Casey Ormond for \u2018Tense Battle Theme\u2019:\xa0http://www.youtube.com/Kiamet/', + u'uploader': u'JonTron', + u'upload_date': u'20140125', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + video_id = mobj.group('videoid') + + info = { + 'id': video_id, + 'uploader': None, + 'upload_date': None, + } + + if url[:4] != 'http': + url = 'http://' + url + + webpage = self._download_webpage(url, video_id) + video_title = self._og_search_title(webpage) + video_description = self._og_search_description(webpage) + video_thumbnail = self._og_search_thumbnail(webpage) + video_uploader = self._html_search_regex(r'Posted\sby\s<a\shref="[A-Za-z0-9/]*">(?P<uploader>[A-Za-z]*)\s</a>', + webpage, 'uploader') + raw_upload_date = self._html_search_regex('<span style="text-transform:uppercase; font-size:inherit;">[A-Za-z]+, (?P<date>.*)</span>', + webpage, 'date') + video_upload_date = unified_strdate(raw_upload_date) + video_upload_date = unified_strdate(raw_upload_date) + + player_url = self._html_search_regex(r'<iframe\swidth="[0-9]+"\sheight="[0-9]+"\ssrc="(?P<url>[\S]+)"', webpage, 'url') + player_page = self._download_webpage(player_url, video_id) + video_url = u'http://player.screenwavemedia.com/' + self._html_search_regex(r"'file':\s'(?P<file>[0-9A-Za-z-_\.]+)'", player_page, 'file') + + info['url'] = video_url + info['title'] = video_title + info['description'] = video_description + info['thumbnail'] = video_thumbnail + info['uploader'] = video_uploader + info['upload_date'] = video_upload_date + + return info diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py index b9cb7abd1..2c53ed2e1 100644 --- a/youtube_dl/extractor/rbmaradio.py +++ b/youtube_dl/extractor/rbmaradio.py @@ -6,8 +6,6 @@ import re from .common import InfoExtractor from ..utils import ( - compat_urllib_parse_urlparse, - ExtractorError, ) @@ -16,9 +14,10 @@ class RBMARadioIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$' _TEST = { 'url': 'http://www.rbmaradio.com/shows/ford-lopatin-live-at-primavera-sound-2011', - 'file': 'ford-lopatin-live-at-primavera-sound-2011.mp3', 'md5': '6bc6f9bcb18994b4c983bc3bf4384d95', 'info_dict': { + 'id': 'ford-lopatin-live-at-primavera-sound-2011', + 'ext': 'mp3', "uploader_id": "ford-lopatin", "location": "Spain", "description": "Joel Ford and Daniel ’Oneohtrix Point Never’ Lopatin fly their midified pop extravaganza to Spain. Live at Primavera Sound 2011.", @@ -42,7 +41,6 @@ class RBMARadioIE(InfoExtractor): raise ExtractorError('Invalid JSON: ' + str(e)) video_url = data['akamai_url'] + '&cbr=256' - url_parts = compat_urllib_parse_urlparse(video_url) return { 'id': video_id, diff --git a/youtube_dl/extractor/ro220.py b/youtube_dl/extractor/ro220.py index c32f64d99..4678f62df 100644 --- a/youtube_dl/extractor/ro220.py +++ b/youtube_dl/extractor/ro220.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -11,12 +13,12 @@ class Ro220IE(InfoExtractor): IE_NAME = '220.ro' _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?220\.ro/(?P<category>[^/]+)/(?P<shorttitle>[^/]+)/(?P<video_id>[^/]+)' _TEST = { - u"url": u"http://www.220.ro/sport/Luati-Le-Banii-Sez-4-Ep-1/LYV6doKo7f/", - u'file': u'LYV6doKo7f.mp4', - u'md5': u'03af18b73a07b4088753930db7a34add', - u'info_dict': { - u"title": u"Luati-le Banii sez 4 ep 1", - u"description": u"Iata-ne reveniti dupa o binemeritata vacanta. Va astept si pe Facebook cu pareri si comentarii.", + "url": "http://www.220.ro/sport/Luati-Le-Banii-Sez-4-Ep-1/LYV6doKo7f/", + 'file': 'LYV6doKo7f.mp4', + 'md5': '03af18b73a07b4088753930db7a34add', + 'info_dict': { + "title": "Luati-le Banii sez 4 ep 1", + "description": "Iata-ne reveniti dupa o binemeritata vacanta. Va astept si pe Facebook cu pareri si comentarii.", } } @@ -27,10 +29,10 @@ class Ro220IE(InfoExtractor): webpage = self._download_webpage(url, video_id) flashVars_str = self._search_regex( r'<param name="flashVars" value="([^"]+)"', - webpage, u'flashVars') + webpage, 'flashVars') flashVars = compat_parse_qs(flashVars_str) - info = { + return { '_type': 'video', 'id': video_id, 'ext': 'mp4', @@ -39,4 +41,3 @@ class Ro220IE(InfoExtractor): 'description': clean_html(flashVars['desc'][0]), 'thumbnail': flashVars['preview'][0], } - return info diff --git a/youtube_dl/extractor/southparkstudios.py b/youtube_dl/extractor/southparkstudios.py index 9f8d3a5fa..aea8e6439 100644 --- a/youtube_dl/extractor/southparkstudios.py +++ b/youtube_dl/extractor/southparkstudios.py @@ -1,34 +1,36 @@ -import re +from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor class SouthParkStudiosIE(MTVServicesInfoExtractor): - IE_NAME = u'southparkstudios.com' - _VALID_URL = r'(https?://)?(www\.)?(?P<url>southparkstudios\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))' + IE_NAME = 'southparkstudios.com' + _VALID_URL = r'https?://(www\.)?(?P<url>southparkstudios\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' _TESTS = [{ - u'url': u'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured', - u'file': u'a7bff6c2-ed00-11e0-aca6-0026b9414f30.mp4', - u'info_dict': { - u'title': u'Bat Daded', - u'description': u'Randy disqualifies South Park by getting into a fight with Bat Dad.', + 'url': 'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured', + 'info_dict': { + 'id': 'a7bff6c2-ed00-11e0-aca6-0026b9414f30', + 'ext': 'mp4', + 'title': 'Bat Daded', + 'description': 'Randy disqualifies South Park by getting into a fight with Bat Dad.', }, }] class SouthparkDeIE(SouthParkStudiosIE): - IE_NAME = u'southpark.de' - _VALID_URL = r'(https?://)?(www\.)?(?P<url>southpark\.de/(clips|alle-episoden)/(?P<id>.+?)(\?|#|$))' + IE_NAME = 'southpark.de' + _VALID_URL = r'https?://(www\.)?(?P<url>southpark\.de/(clips|alle-episoden)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/' _TESTS = [{ - u'url': u'http://www.southpark.de/clips/uygssh/the-government-wont-respect-my-privacy#tab=featured', - u'file': u'85487c96-b3b9-4e39-9127-ad88583d9bf2.mp4', - u'info_dict': { - u'title': u'The Government Won\'t Respect My Privacy', - u'description': u'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.', + 'url': 'http://www.southpark.de/clips/uygssh/the-government-wont-respect-my-privacy#tab=featured', + 'info_dict': { + 'id': '85487c96-b3b9-4e39-9127-ad88583d9bf2', + 'ext': 'mp4', + 'title': 'The Government Won\'t Respect My Privacy', + 'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.', }, }] diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 051a34d5b..9156d7faf 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -6,20 +8,20 @@ from .common import InfoExtractor class SpiegelIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$' _TESTS = [{ - u'url': u'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', - u'file': u'1259285.mp4', - u'md5': u'2c2754212136f35fb4b19767d242f66e', - u'info_dict': { - u"title": u"Vulkanausbruch in Ecuador: Der \"Feuerschlund\" ist wieder aktiv" - } + 'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', + 'file': '1259285.mp4', + 'md5': '2c2754212136f35fb4b19767d242f66e', + 'info_dict': { + 'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv', + }, }, { - u'url': u'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', - u'file': u'1309159.mp4', - u'md5': u'f2cdf638d7aa47654e251e1aee360af1', - u'info_dict': { - u'title': u'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers' - } + 'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', + 'file': '1309159.mp4', + 'md5': 'f2cdf638d7aa47654e251e1aee360af1', + 'info_dict': { + 'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers', + }, }] def _real_extract(self, url): @@ -29,17 +31,17 @@ class SpiegelIE(InfoExtractor): webpage = self._download_webpage(url, video_id) video_title = self._html_search_regex( - r'<div class="module-title">(.*?)</div>', webpage, u'title') + r'<div class="module-title">(.*?)</div>', webpage, 'title') - xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml' + xml_url = 'http://video2.spiegel.de/flash/' + video_id + '.xml' idoc = self._download_xml( xml_url, video_id, - note=u'Downloading XML', errnote=u'Failed to download XML') + note='Downloading XML', errnote='Failed to download XML') formats = [ { 'format_id': n.tag.rpartition('type')[2], - 'url': u'http://video2.spiegel.de/flash/' + n.find('./filename').text, + 'url': 'http://video2.spiegel.de/flash/' + n.find('./filename').text, 'width': int(n.find('./width').text), 'height': int(n.find('./height').text), 'abr': int(n.find('./audiobitrate').text), @@ -55,10 +57,9 @@ class SpiegelIE(InfoExtractor): self._sort_formats(formats) - info = { + return { 'id': video_id, 'title': video_title, 'duration': duration, 'formats': formats, } - return info diff --git a/youtube_dl/extractor/tinypic.py b/youtube_dl/extractor/tinypic.py new file mode 100644 index 000000000..2246d27b2 --- /dev/null +++ b/youtube_dl/extractor/tinypic.py @@ -0,0 +1,50 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from youtube_dl.utils import ExtractorError + + +class TinyPicIE(InfoExtractor): + IE_NAME = 'tinypic' + IE_DESC = 'tinypic.com videos' + _VALID_URL = r'http://tinypic\.com/player\.php\?v=(?P<id>[^&]+)&s=\d+' + + _TEST = { + 'url': 'http://tinypic.com/player.php?v=6xw7tc%3E&s=5#.UtqZmbRFCM8', + 'md5': '609b74432465364e72727ebc6203f044', + 'info_dict': { + 'id': '6xw7tc', + 'ext': 'flv', + 'title': 'shadow phenomenon weird', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id, 'Downloading page') + + mobj = re.search(r'(?m)fo\.addVariable\("file",\s"(?P<fileid>[\da-z]+)"\);\n' + '\s+fo\.addVariable\("s",\s"(?P<serverid>\d+)"\);', webpage) + if mobj is None: + raise ExtractorError('Video %s does not exist' % video_id, expected=True) + + file_id = mobj.group('fileid') + server_id = mobj.group('serverid') + + KEYWORDS_SUFFIX = ', Video, images, photos, videos, myspace, ebay, video hosting, photo hosting' + keywords = self._html_search_meta('keywords', webpage, 'title') + title = keywords[:-len(KEYWORDS_SUFFIX)] if keywords.endswith(KEYWORDS_SUFFIX) else '' + + video_url = 'http://v%s.tinypic.com/%s.flv' % (server_id, file_id) + thumbnail = 'http://v%s.tinypic.com/%s_th.jpg' % (server_id, file_id) + + return { + 'id': file_id, + 'url': video_url, + 'thumbnail': thumbnail, + 'title': title + }
\ No newline at end of file diff --git a/youtube_dl/extractor/traileraddict.py b/youtube_dl/extractor/traileraddict.py index 35f89e9ee..3b1a6fb61 100644 --- a/youtube_dl/extractor/traileraddict.py +++ b/youtube_dl/extractor/traileraddict.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -6,12 +8,13 @@ from .common import InfoExtractor class TrailerAddictIE(InfoExtractor): _VALID_URL = r'(?:http://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P<movie>.+?)/(?P<trailer_name>.+)' _TEST = { - u'url': u'http://www.traileraddict.com/trailer/prince-avalanche/trailer', - u'file': u'76184.mp4', - u'md5': u'57e39dbcf4142ceb8e1f242ff423fd71', - u'info_dict': { - u"title": u"Prince Avalanche Trailer", - u"description": u"Trailer for Prince Avalanche.Two highway road workers spend the summer of 1988 away from their city lives. The isolated landscape becomes a place of misadventure as the men find themselves at odds with each other and the women they left behind." + 'url': 'http://www.traileraddict.com/trailer/prince-avalanche/trailer', + 'md5': '41365557f3c8c397d091da510e73ceb4', + 'info_dict': { + 'id': '76184', + 'ext': 'mp4', + 'title': 'Prince Avalanche Trailer', + 'description': 'Trailer for Prince Avalanche.\n\nTwo highway road workers spend the summer of 1988 away from their city lives. The isolated landscape becomes a place of misadventure as the men find themselves at odds with each other and the women they left behind.', } } @@ -22,9 +25,15 @@ class TrailerAddictIE(InfoExtractor): title = self._search_regex(r'<title>(.+?)</title>', webpage, 'video title').replace(' - Trailer Addict','') - view_count = self._search_regex(r'Views: (.+?)<br />', - webpage, 'Views Count') - video_id = self._og_search_property('video', webpage, 'Video id').split('=')[1] + view_count_str = self._search_regex( + r'<span class="views_n">([0-9,.]+)</span>', + webpage, 'view count', fatal=False) + view_count = ( + None if view_count_str is None + else int(view_count_str.replace(',', ''))) + video_id = self._search_regex( + r'<param\s+name="movie"\s+value="/emb/([0-9]+)"\s*/>', + webpage, 'video id') # Presence of (no)watchplus function indicates HD quality is available if re.search(r'function (no)?watchplus()', webpage): @@ -39,14 +48,16 @@ class TrailerAddictIE(InfoExtractor): info_webpage, 'Download url').replace('%3F','?') thumbnail_url = self._search_regex(r'&image=(.+?)&', info_webpage, 'thumbnail url') - ext = final_url.split('.')[-1].split('?')[0] - - return [{ - 'id' : video_id, - 'url' : final_url, - 'ext' : ext, - 'title' : title, - 'thumbnail' : thumbnail_url, - 'description' : self._og_search_description(webpage), - 'view_count' : view_count, - }] + + description = self._html_search_regex( + r'(?s)<div class="synopsis">.*?<div class="movie_label_info"[^>]*>(.*?)</div>', + webpage, 'description', fatal=False) + + return { + 'id': video_id, + 'url': final_url, + 'title': title, + 'thumbnail': thumbnail_url, + 'description': description, + 'view_count': view_count, + } diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index 74c82587f..7fa2b9e15 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import json import re @@ -10,48 +12,48 @@ from ..utils import ( class UstreamIE(InfoExtractor): _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)' - IE_NAME = u'ustream' + IE_NAME = 'ustream' _TEST = { - u'url': u'http://www.ustream.tv/recorded/20274954', - u'file': u'20274954.flv', - u'md5': u'088f151799e8f572f84eb62f17d73e5c', - u'info_dict': { - u"uploader": u"Young Americans for Liberty", - u"title": u"Young Americans for Liberty February 7, 2012 2:28 AM" - } + 'url': 'http://www.ustream.tv/recorded/20274954', + 'file': '20274954.flv', + 'md5': '088f151799e8f572f84eb62f17d73e5c', + 'info_dict': { + "uploader": "Young Americans for Liberty", + "title": "Young Americans for Liberty February 7, 2012 2:28 AM", + }, } def _real_extract(self, url): m = re.match(self._VALID_URL, url) video_id = m.group('videoID') - video_url = u'http://tcdn.ustream.tv/video/%s' % video_id + video_url = 'http://tcdn.ustream.tv/video/%s' % video_id webpage = self._download_webpage(url, video_id) self.report_extraction(video_id) video_title = self._html_search_regex(r'data-title="(?P<title>.+)"', - webpage, u'title') + webpage, 'title') uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>', - webpage, u'uploader', fatal=False, flags=re.DOTALL) + webpage, 'uploader', fatal=False, flags=re.DOTALL) thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"', - webpage, u'thumbnail', fatal=False) - - info = { - 'id': video_id, - 'url': video_url, - 'ext': 'flv', - 'title': video_title, - 'uploader': uploader, - 'thumbnail': thumbnail, - } - return info + webpage, 'thumbnail', fatal=False) + + return { + 'id': video_id, + 'url': video_url, + 'ext': 'flv', + 'title': video_title, + 'uploader': uploader, + 'thumbnail': thumbnail, + } + class UstreamChannelIE(InfoExtractor): _VALID_URL = r'https?://www\.ustream\.tv/channel/(?P<slug>.+)' - IE_NAME = u'ustream:channel' + IE_NAME = 'ustream:channel' def _real_extract(self, url): m = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index a4b26a26f..e458ac961 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -1,5 +1,6 @@ +from __future__ import unicode_literals + import re -import json import xml.etree.ElementTree import datetime @@ -22,16 +23,16 @@ class VevoIE(InfoExtractor): vevo:) (?P<id>[^&?#]+)''' _TESTS = [{ - u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280', - u'file': u'GB1101300280.mp4', - u"md5": u"06bea460acb744eab74a9d7dcb4bfd61", - u'info_dict': { - u"upload_date": u"20130624", - u"uploader": u"Hurts", - u"title": u"Somebody to Die For", - u"duration": 230.12, - u"width": 1920, - u"height": 1080, + 'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280', + 'file': 'GB1101300280.mp4', + "md5": "06bea460acb744eab74a9d7dcb4bfd61", + 'info_dict': { + "upload_date": "20130624", + "uploader": "Hurts", + "title": "Somebody to Die For", + "duration": 230.12, + "width": 1920, + "height": 1080, } }] _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/' @@ -44,7 +45,7 @@ class VevoIE(InfoExtractor): if version['version'] > last_version['version']: last_version = version if last_version['version'] == -1: - raise ExtractorError(u'Unable to extract last version of the video') + raise ExtractorError('Unable to extract last version of the video') renditions = xml.etree.ElementTree.fromstring(last_version['data']) formats = [] @@ -85,7 +86,7 @@ class VevoIE(InfoExtractor): format_url = self._SMIL_BASE_URL + m.group('path') formats.append({ 'url': format_url, - 'format_id': u'SMIL_' + m.group('cbr'), + 'format_id': 'SMIL_' + m.group('cbr'), 'vcodec': m.group('vcodec'), 'acodec': m.group('acodec'), 'vbr': int(m.group('vbr')), @@ -101,26 +102,25 @@ class VevoIE(InfoExtractor): video_id = mobj.group('id') json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id - info_json = self._download_webpage(json_url, video_id, u'Downloading json info') - video_info = json.loads(info_json)['video'] + video_info = self._download_json(json_url, video_id)['video'] formats = self._formats_from_json(video_info) try: smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % ( self._SMIL_BASE_URL, video_id, video_id.lower()) smil_xml = self._download_webpage(smil_url, video_id, - u'Downloading SMIL info') + 'Downloading SMIL info') formats.extend(self._formats_from_smil(smil_xml)) except ExtractorError as ee: if not isinstance(ee.cause, compat_HTTPError): raise self._downloader.report_warning( - u'Cannot download SMIL information, falling back to JSON ..') + 'Cannot download SMIL information, falling back to JSON ..') timestamp_ms = int(self._search_regex( - r'/Date\((\d+)\)/', video_info['launchDate'], u'launch date')) + r'/Date\((\d+)\)/', video_info['launchDate'], 'launch date')) upload_date = datetime.datetime.fromtimestamp(timestamp_ms // 1000) - info = { + return { 'id': video_id, 'title': video_info['title'], 'formats': formats, @@ -129,5 +129,3 @@ class VevoIE(InfoExtractor): 'uploader': video_info['mainArtists'][0]['artistName'], 'duration': video_info['duration'], } - - return info diff --git a/youtube_dl/extractor/vube.py b/youtube_dl/extractor/vube.py new file mode 100644 index 000000000..fbdff471a --- /dev/null +++ b/youtube_dl/extractor/vube.py @@ -0,0 +1,80 @@ +from __future__ import unicode_literals + +import re +import datetime + +from .common import InfoExtractor + + +class VubeIE(InfoExtractor): + IE_NAME = 'vube' + IE_DESC = 'Vube.com' + _VALID_URL = r'http://vube\.com/[^/]+/(?P<id>[\da-zA-Z]{10})' + + _TEST = { + 'url': 'http://vube.com/Chiara+Grispo+Video+Channel/YL2qNPkqon', + 'md5': 'f81dcf6d0448e3291f54380181695821', + 'info_dict': { + 'id': 'YL2qNPkqon', + 'ext': 'mp4', + 'title': 'Chiara Grispo - Price Tag by Jessie J', + 'description': 'md5:8ea652a1f36818352428cb5134933313', + 'thumbnail': 'http://frame.thestaticvube.com/snap/228x128/102e7e63057-5ebc-4f5c-4065-6ce4ebde131f.jpg', + 'uploader': 'Chiara.Grispo', + 'uploader_id': '1u3hX0znhP', + 'upload_date': '20140103', + 'duration': 170.56 + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + video = self._download_json('http://vube.com/api/v2/video/%s' % video_id, + video_id, 'Downloading video JSON') + + public_id = video['public_id'] + + formats = [{'url': 'http://video.thestaticvube.com/video/%s/%s.mp4' % (fmt['media_resolution_id'], public_id), + 'height': int(fmt['height']), + 'abr': int(fmt['audio_bitrate']), + 'vbr': int(fmt['video_bitrate']), + 'format_id': fmt['media_resolution_id'] + } for fmt in video['mtm'] if fmt['transcoding_status'] == 'processed'] + + self._sort_formats(formats) + + title = video['title'] + description = video.get('description') + thumbnail = video['thumbnail_src'] + if thumbnail.startswith('//'): + thumbnail = 'http:' + thumbnail + uploader = video['user_alias'] + uploader_id = video['user_url_id'] + upload_date = datetime.datetime.fromtimestamp(int(video['upload_time'])).strftime('%Y%m%d') + duration = video['duration'] + view_count = video['raw_view_count'] + like_count = video['total_likes'] + dislike_count= video['total_hates'] + + comment = self._download_json('http://vube.com/api/video/%s/comment' % video_id, + video_id, 'Downloading video comment JSON') + + comment_count = comment['total'] + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'comment_count': comment_count, + }
\ No newline at end of file diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index d317f29f2..f6c515f7f 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..utils import ( compat_urllib_parse, - unescapeHTML, ExtractorError, ) |