diff options
Diffstat (limited to 'youtube_dl')
36 files changed, 1066 insertions, 333 deletions
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py deleted file mode 100755 index 672ef9eed..000000000 --- a/youtube_dl/InfoExtractors.py +++ /dev/null @@ -1,4 +0,0 @@ -# Legacy file for backwards compatibility, use youtube_dl.extractor instead! - -from .extractor.common import InfoExtractor, SearchInfoExtractor -from .extractor import gen_extractors, get_info_extractor diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 5095f87d2..d18d6dd00 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -148,6 +148,8 @@ class YoutubeDL(object): again. cookiefile: File name where cookies should be read from and dumped to. nocheckcertificate:Do not verify SSL certificates + prefer_insecure: Use HTTP instead of HTTPS to retrieve information. + At the moment, this is only supported by YouTube. proxy: URL of the proxy server to use socket_timeout: Time to wait for unresponsive hosts, in seconds bidi_workaround: Work around buggy terminals without bidirectional text @@ -510,13 +512,7 @@ class YoutubeDL(object): '_type': 'compat_list', 'entries': ie_result, } - self.add_extra_info(ie_result, - { - 'extractor': ie.IE_NAME, - 'webpage_url': url, - 'webpage_url_basename': url_basename(url), - 'extractor_key': ie.ie_key(), - }) + self.add_default_extra_info(ie_result, ie, url) if process: return self.process_ie_result(ie_result, download, extra_info) else: @@ -533,7 +529,15 @@ class YoutubeDL(object): else: raise else: - self.report_error('no suitable InfoExtractor: %s' % url) + self.report_error('no suitable InfoExtractor for URL %s' % url) + + def add_default_extra_info(self, ie_result, ie, url): + self.add_extra_info(ie_result, { + 'extractor': ie.IE_NAME, + 'webpage_url': url, + 'webpage_url_basename': url_basename(url), + 'extractor_key': ie.ie_key(), + }) def process_ie_result(self, ie_result, download=True, extra_info={}): """ diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 470acf3ef..056e94457 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -56,7 +56,6 @@ __authors__ = ( __license__ = 'Public Domain' import codecs -import getpass import io import locale import optparse @@ -68,6 +67,7 @@ import sys from .utils import ( + compat_getpass, compat_print, DateRange, decodeOption, @@ -241,6 +241,9 @@ def parseOpts(overrideArguments=None): help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection') general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.') general.add_option( + '--prefer-insecure', action='store_true', dest='prefer_insecure', + help='Use an unencrypted connection to retrieve information about the video. (Currently supported only for YouTube)') + general.add_option( '--cache-dir', dest='cachedir', default=get_cachedir(), metavar='DIR', help='Location in the filesystem where youtube-dl can store some downloaded information permanently. By default $XDG_CACHE_HOME/youtube-dl or ~/.cache/youtube-dl . At the moment, only YouTube player files (for videos with obfuscated signatures) are cached, but that may change.') general.add_option( @@ -260,7 +263,6 @@ def parseOpts(overrideArguments=None): action='store_true', help='Do not read configuration files. When given in the global configuration file /etc/youtube-dl.conf: do not read the user configuration in ~/.config/youtube-dl.conf (%APPDATA%/youtube-dl/config.txt on Windows)') - selection.add_option( '--playlist-start', dest='playliststart', metavar='NUMBER', default=1, type=int, @@ -624,7 +626,7 @@ def _real_main(argv=None): if opts.usetitle and opts.useid: parser.error(u'using title conflicts with using video ID') if opts.username is not None and opts.password is None: - opts.password = getpass.getpass(u'Type account password and press return:') + opts.password = compat_getpass(u'Type account password and press [Return]: ') if opts.ratelimit is not None: numeric_limit = FileDownloader.parse_bytes(opts.ratelimit) if numeric_limit is None: @@ -769,6 +771,7 @@ def _real_main(argv=None): 'download_archive': download_archive_fn, 'cookiefile': opts.cookiefile, 'nocheckcertificate': opts.no_check_certificate, + 'prefer_insecure': opts.prefer_insecure, 'proxy': opts.proxy, 'socket_timeout': opts.socket_timeout, 'bidi_workaround': opts.bidi_workaround, diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bbdb04069..245860140 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -2,6 +2,7 @@ from .academicearth import AcademicEarthCourseIE from .addanime import AddAnimeIE from .aftonbladet import AftonbladetIE from .anitube import AnitubeIE +from .aol import AolIE from .aparat import AparatIE from .appletrailers import AppleTrailersIE from .archiveorg import ArchiveOrgIE @@ -10,6 +11,7 @@ from .arte import ( ArteTvIE, ArteTVPlus7IE, ArteTVCreativeIE, + ArteTVConcertIE, ArteTVFutureIE, ArteTVDDCIE, ) @@ -63,6 +65,7 @@ from .ehow import EHowIE from .eighttracks import EightTracksIE from .eitb import EitbIE from .elpais import ElPaisIE +from .engadget import EngadgetIE from .escapist import EscapistIE from .everyonesmixtape import EveryonesMixtapeIE from .exfm import ExfmIE @@ -71,6 +74,7 @@ from .facebook import FacebookIE from .faz import FazIE from .firstpost import FirstpostIE from .firsttv import FirstTVIE +from .fivemin import FiveMinIE from .fktv import ( FKTVIE, FKTVPosteckeIE, @@ -108,7 +112,7 @@ from .imdb import ( ) from .ina import InaIE from .infoq import InfoQIE -from .instagram import InstagramIE +from .instagram import InstagramIE, InstagramUserIE from .internetvideoarchive import InternetVideoArchiveIE from .iprima import IPrimaIE from .ivi import ( @@ -173,6 +177,7 @@ from .nowness import NownessIE from .nowvideo import NowVideoIE from .ooyala import OoyalaIE from .orf import ORFIE +from .parliamentliveuk import ParliamentLiveUKIE from .pbs import PBSIE from .photobucket import PhotobucketIE from .playvid import PlayvidIE @@ -190,6 +195,7 @@ from .ro220 import Ro220IE from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE from .rtlnow import RTLnowIE +from .rts import RTSIE from .rutube import ( RutubeIE, RutubeChannelIE, @@ -234,6 +240,7 @@ from .theplatform import ThePlatformIE from .thisav import ThisAVIE from .tinypic import TinyPicIE from .toutv import TouTvIE +from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE from .trutube import TruTubeIE @@ -258,6 +265,7 @@ from .vice import ViceIE from .viddler import ViddlerIE from .videobam import VideoBamIE from .videodetective import VideoDetectiveIE +from .videolecturesnet import VideoLecturesNetIE from .videofyme import VideofyMeIE from .videopremium import VideoPremiumIE from .vimeo import ( @@ -278,10 +286,11 @@ from .weibo import WeiboIE from .wimp import WimpIE from .wistia import WistiaIE from .worldstarhiphop import WorldStarHipHopIE +from .xbef import XBefIE from .xhamster import XHamsterIE from .xnxx import XNXXIE from .xvideos import XVideosIE -from .xtube import XTubeIE +from .xtube import XTubeUserIE, XTubeIE from .yahoo import ( YahooIE, YahooNewsIE, diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py new file mode 100644 index 000000000..abc668912 --- /dev/null +++ b/youtube_dl/extractor/aol.py @@ -0,0 +1,28 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .fivemin import FiveMinIE + + +class AolIE(InfoExtractor): + IE_NAME = 'on.aol.com' + _VALID_URL = r'http://on\.aol\.com/video/.*-(?P<id>\d+)($|\?)' + + _TEST = { + 'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img', + 'md5': '18ef68f48740e86ae94b98da815eec42', + 'info_dict': { + 'id': '518167793', + 'ext': 'mp4', + 'title': 'U.S. Official Warns Of \'Largest Ever\' IRS Phone Scam', + }, + 'add_ie': ['FiveMin'], + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + self.to_screen('Downloading 5min.com video %s' % video_id) + return FiveMinIE._build_result(video_id) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index d194f2564..979481b21 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -131,7 +131,7 @@ class ArteTvIE(InfoExtractor): class ArteTVPlus7IE(InfoExtractor): IE_NAME = 'arte.tv:+7' - _VALID_URL = r'https?://www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?' + _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?' @classmethod def _extract_url_info(cls, url): @@ -202,6 +202,8 @@ class ArteTVPlus7IE(InfoExtractor): re.match(r'VO-ST(F|A)', f.get('versionCode', '')) is None, # The version with sourds/mal subtitles has also lower relevance re.match(r'VO?(F|A)-STM\1', f.get('versionCode', '')) is None, + # Prefer http downloads over m3u8 + 0 if f['url'].endswith('m3u8') else 1, ) formats = sorted(formats, key=sort_key) def _format(format_info): @@ -242,8 +244,9 @@ class ArteTVCreativeIE(ArteTVPlus7IE): _TEST = { 'url': 'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design', - 'file': '050489-002.mp4', 'info_dict': { + 'id': '050489-002', + 'ext': 'mp4', 'title': 'Agentur Amateur / Agence Amateur #2 : Corporate Design', }, } @@ -255,8 +258,9 @@ class ArteTVFutureIE(ArteTVPlus7IE): _TEST = { 'url': 'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081', - 'file': '050940-003.mp4', 'info_dict': { + 'id': '050940-003', + 'ext': 'mp4', 'title': 'Les champignons au secours de la planète', }, } @@ -270,7 +274,7 @@ class ArteTVFutureIE(ArteTVPlus7IE): class ArteTVDDCIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:ddc' - _VALID_URL = r'http?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>.+)' + _VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>.+)' def _real_extract(self, url): video_id, lang = self._extract_url_info(url) @@ -284,3 +288,20 @@ class ArteTVDDCIE(ArteTVPlus7IE): javascriptPlayerGenerator = self._download_webpage(script_url, video_id, 'Download javascript player generator') json_url = self._search_regex(r"json_url=(.*)&rendering_place.*", javascriptPlayerGenerator, 'json url') return self._extract_from_json_url(json_url, video_id, lang) + + +class ArteTVConcertIE(ArteTVPlus7IE): + IE_NAME = 'arte.tv:concert' + _VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>de|fr)/(?P<id>.+)' + + _TEST = { + 'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde', + 'md5': '9ea035b7bd69696b67aa2ccaaa218161', + 'info_dict': { + 'id': '186', + 'ext': 'mp4', + 'title': 'The Notwist im Pariser Konzertclub "Divan du Monde"', + 'upload_date': '20140128', + 'description': 'md5:486eb08f991552ade77439fe6d82c305', + }, + } diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index ed3986f31..d50fcdbdb 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -14,7 +14,7 @@ from ..utils import ( class ComedyCentralIE(MTVServicesInfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)?comedycentral\.com/ + _VALID_URL = r'''(?x)https?://(?:www\.)?(comedycentral|cc)\.com/ (video-clips|episodes|cc-studios|video-collections) /(?P<title>.*)''' _FEED_URL = 'http://comedycentral.com/feeds/mrss/' diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 647720c8a..78f238f84 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -74,7 +74,7 @@ class InfoExtractor(object): "http", "https", "rtsp", "rtmp", "m3u8" or so. * preference Order number of this format. If this field is present and not None, the formats get sorted - by this field. + by this field, regardless of all other values. -1 for default (order by other properties), -2 or smaller for less than default. * quality Order number of the video quality of this diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index d65046f58..2a8eda9ef 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -10,9 +10,9 @@ from ..utils import ( class CSpanIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P<id>\d+)' + _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P<id>[0-9a-f]+)' IE_DESC = 'C-SPAN' - _TEST = { + _TESTS = [{ 'url': 'http://www.c-span.org/video/?313572-1/HolderonV', 'md5': '8e44ce11f0f725527daccc453f553eb0', 'info_dict': { @@ -22,13 +22,24 @@ class CSpanIE(InfoExtractor): 'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in Shelby County v. Holder in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.', }, 'skip': 'Regularly fails on travis, for unknown reasons', - } + }, { + 'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models', + # For whatever reason, the served video alternates between + # two different ones + #'md5': 'dbb0f047376d457f2ab8b3929cbb2d0c', + 'info_dict': { + 'id': '340723', + 'ext': 'mp4', + 'title': 'International Health Care Models', + 'description': 'md5:7a985a2d595dba00af3d9c9f0783c967', + } + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) page_id = mobj.group('id') webpage = self._download_webpage(url, page_id) - video_id = self._search_regex(r'data-progid=\'(\d+)\'>', webpage, 'video id') + video_id = self._search_regex(r'progid=\'?([0-9]+)\'?>', webpage, 'video id') description = self._html_search_regex( [ diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index 4876ecb48..6033cd94a 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -1,25 +1,28 @@ # encoding: utf-8 + +from __future__ import unicode_literals + import re from .common import InfoExtractor from ..utils import ( compat_urllib_parse, - determine_ext, ) class DaumIE(InfoExtractor): _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/.*?clipid=(?P<id>\d+)' - IE_NAME = u'daum.net' + IE_NAME = 'daum.net' _TEST = { - u'url': u'http://tvpot.daum.net/clip/ClipView.do?clipid=52554690', - u'file': u'52554690.mp4', - u'info_dict': { - u'title': u'DOTA 2GETHER 시즌2 6회 - 2부', - u'description': u'DOTA 2GETHER 시즌2 6회 - 2부', - u'upload_date': u'20130831', - u'duration': 3868, + 'url': 'http://tvpot.daum.net/clip/ClipView.do?clipid=52554690', + 'info_dict': { + 'id': '52554690', + 'ext': 'mp4', + 'title': 'DOTA 2GETHER 시즌2 6회 - 2부', + 'description': 'DOTA 2GETHER 시즌2 6회 - 2부', + 'upload_date': '20130831', + 'duration': 3868, }, } @@ -30,14 +33,14 @@ class DaumIE(InfoExtractor): webpage = self._download_webpage(canonical_url, video_id) full_id = self._search_regex( r'<iframe src="http://videofarm.daum.net/controller/video/viewer/Video.html\?.*?vid=(.+?)[&"]', - webpage, u'full id') + webpage, 'full id') query = compat_urllib_parse.urlencode({'vid': full_id}) info = self._download_xml( 'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id, - u'Downloading video info') + 'Downloading video info') urls = self._download_xml( 'http://videofarm.daum.net/controller/api/open/v1_2/MovieData.apixml?' + query, - video_id, u'Downloading video formats info') + video_id, 'Downloading video formats info') self.to_screen(u'%s: Getting video urls' % video_id) formats = [] @@ -53,7 +56,6 @@ class DaumIE(InfoExtractor): format_url = url_doc.find('result/url').text formats.append({ 'url': format_url, - 'ext': determine_ext(format_url), 'format_id': profile, }) diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py new file mode 100644 index 000000000..92ada81d2 --- /dev/null +++ b/youtube_dl/extractor/engadget.py @@ -0,0 +1,43 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .fivemin import FiveMinIE +from ..utils import ( + url_basename, +) + + +class EngadgetIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://www.engadget.com/ + (?:video/5min/(?P<id>\d+)| + [\d/]+/.*?) + ''' + + _TEST = { + 'url': 'http://www.engadget.com/video/5min/518153925/', + 'md5': 'c6820d4828a5064447a4d9fc73f312c9', + 'info_dict': { + 'id': '518153925', + 'ext': 'mp4', + 'title': 'Samsung Galaxy Tab Pro 8.4 Review', + }, + 'add_ie': ['FiveMin'], + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + if video_id is not None: + return FiveMinIE._build_result(video_id) + else: + title = url_basename(url) + webpage = self._download_webpage(url, title) + ids = re.findall(r'<iframe[^>]+?playList=(\d+)', webpage) + return { + '_type': 'playlist', + 'title': title, + 'entries': [FiveMinIE._build_result(id) for id in ids] + } diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py new file mode 100644 index 000000000..215cc831e --- /dev/null +++ b/youtube_dl/extractor/fivemin.py @@ -0,0 +1,56 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + compat_str, +) + + +class FiveMinIE(InfoExtractor): + IE_NAME = '5min' + _VALID_URL = r'''(?x) + (?:https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(.*?&)?playList=| + 5min:) + (?P<id>\d+) + ''' + + _TEST = { + # From http://www.engadget.com/2013/11/15/ipad-mini-retina-display-review/ + 'url': 'http://pshared.5min.com/Scripts/PlayerSeed.js?sid=281&width=560&height=345&playList=518013791', + 'md5': '4f7b0b79bf1a470e5004f7112385941d', + 'info_dict': { + 'id': '518013791', + 'ext': 'mp4', + 'title': 'iPad Mini with Retina Display Review', + }, + } + + @classmethod + def _build_result(cls, video_id): + return cls.url_result('5min:%s' % video_id, cls.ie_key()) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + info = self._download_json( + 'https://syn.5min.com/handlers/SenseHandler.ashx?func=GetResults&' + 'playlist=%s&url=https' % video_id, + video_id)['binding'][0] + + second_id = compat_str(int(video_id[:-2]) + 1) + formats = [] + for quality, height in [(1, 320), (2, 480), (4, 720), (8, 1080)]: + if any(r['ID'] == quality for r in info['Renditions']): + formats.append({ + 'format_id': compat_str(quality), + 'url': 'http://avideos.5min.com/%s/%s/%s_%s.mp4' % (second_id[-3:], second_id, video_id, quality), + 'height': height, + }) + + return { + 'id': video_id, + 'title': info['Title'], + 'formats': formats, + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 71b9c541e..4d649fe71 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -102,6 +102,20 @@ class GenericIE(InfoExtractor): 'title': '2cc213299525360.mov', # that's what we get }, }, + # second style of embedded ooyala videos + { + 'url': 'http://www.smh.com.au/tv/business/show/financial-review-sunday/behind-the-scenes-financial-review-sunday--4350201.html', + 'info_dict': { + 'id': '13djJjYjptA1XpPx8r9kuzPyj3UZH0Uk', + 'ext': 'mp4', + 'title': 'Behind-the-scenes: Financial Review Sunday ', + 'description': 'Step inside Channel Nine studios for an exclusive tour of its upcoming financial business show.', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, # google redirect { 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', @@ -159,7 +173,30 @@ class GenericIE(InfoExtractor): # m3u8 download 'skip_download': True, }, - } + }, + # Embedded TED video + { + 'url': 'http://en.support.wordpress.com/videos/ted-talks/', + 'md5': 'deeeabcc1085eb2ba205474e7235a3d5', + 'info_dict': { + 'id': '981', + 'ext': 'mp4', + 'title': 'My web playroom', + 'uploader': 'Ze Frank', + 'description': 'md5:ddb2a40ecd6b6a147e400e535874947b', + } + }, + # nowvideo embed hidden behind percent encoding + { + 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/', + 'md5': '2baf4ddd70f697d94b1c18cf796d5107', + 'info_dict': { + 'id': '06e53103ca9aa', + 'ext': 'flv', + 'title': 'Macross Episode 001 Watch Macross Episode 001 onl', + 'description': 'No description', + }, + }, ] def report_download_webpage(self, video_id): @@ -185,9 +222,14 @@ class GenericIE(InfoExtractor): newurl = newurl.replace(' ', '%20') newheaders = dict((k,v) for k,v in req.headers.items() if k.lower() not in ("content-length", "content-type")) + try: + # This function was deprecated in python 3.3 and removed in 3.4 + origin_req_host = req.get_origin_req_host() + except AttributeError: + origin_req_host = req.origin_req_host return HEADRequest(newurl, headers=newheaders, - origin_req_host=req.get_origin_req_host(), + origin_req_host=origin_req_host, unverifiable=True) else: raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp) @@ -306,6 +348,11 @@ class GenericIE(InfoExtractor): except compat_xml_parse_error: pass + # Sometimes embedded video player is hidden behind percent encoding + # (e.g. https://github.com/rg3/youtube-dl/issues/2448) + # Unescaping the whole page allows to handle those cases in a generic way + webpage = compat_urllib_parse.unquote(webpage) + # it's tempting to parse this further, but you would # have to take into account all the variations like # Video Title - Site Name @@ -407,9 +454,10 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url')) # Look for Ooyala videos - mobj = re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=([^"&]+)', webpage) + mobj = (re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or + re.search(r'OO.Player.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage)) if mobj is not None: - return OoyalaIE._build_url_result(mobj.group(1)) + return OoyalaIE._build_url_result(mobj.group('ec')) # Look for Aparat videos mobj = re.search(r'<iframe src="(http://www\.aparat\.com/video/[^"]+)"', webpage) @@ -471,6 +519,12 @@ class GenericIE(InfoExtractor): if rutv_url: return self.url_result(rutv_url, 'RUTV') + # Look for embedded TED player + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>http://embed\.ted\.com/.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'TED') + # Start with something easy: JW Player in SWFObject mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) if mobj is None: @@ -482,6 +536,7 @@ class GenericIE(InfoExtractor): if mobj is None: # Broaden the search a little bit: JWPlayer JS loader mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage) + if mobj is None: # Try to find twitter cards info mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 63141af27..b5372bf7a 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -3,6 +3,9 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ( + int_or_none, +) class InstagramIE(InfoExtractor): @@ -37,3 +40,68 @@ class InstagramIE(InfoExtractor): 'uploader_id': uploader_id, 'description': desc, } + + +class InstagramUserIE(InfoExtractor): + _VALID_URL = r'http://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])' + IE_DESC = 'Instagram user profile' + IE_NAME = 'instagram:user' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + uploader_id = mobj.group('username') + + entries = [] + page_count = 0 + media_url = 'http://instagram.com/%s/media' % uploader_id + while True: + page = self._download_json( + media_url, uploader_id, + note='Downloading page %d ' % (page_count + 1), + ) + page_count += 1 + + for it in page['items']: + if it.get('type') != 'video': + continue + like_count = int_or_none(it.get('likes', {}).get('count')) + user = it.get('user', {}) + + formats = [{ + 'format_id': k, + 'height': v.get('height'), + 'width': v.get('width'), + 'url': v['url'], + } for k, v in it['videos'].items()] + self._sort_formats(formats) + + thumbnails_el = it.get('images', {}) + thumbnail = thumbnails_el.get('thumbnail', {}).get('url') + + title = it.get('caption', {}).get('text', it['id']) + + entries.append({ + 'id': it['id'], + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'webpage_url': it.get('link'), + 'uploader': user.get('full_name'), + 'uploader_id': user.get('username'), + 'like_count': like_count, + 'timestamp': int_or_none(it.get('created_time')), + }) + + if not page['items']: + break + max_id = page['items'][-1]['id'] + media_url = ( + 'http://instagram.com/%s/media?max_id=%s' % ( + uploader_id, max_id)) + + return { + '_type': 'playlist', + 'entries': entries, + 'id': uploader_id, + 'title': uploader_id, + } diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index 2a29e6072..d1defd363 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -48,7 +48,7 @@ class IPrimaIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - if re.search(r'Nemáte oprávnění přistupovat na tuto stránku.\s*</div>', webpage): + if re.search(r'Nemáte oprávnění přistupovat na tuto stránku\.\s*</div>', webpage): raise ExtractorError( '%s said: You do not have permission to access this page' % self.IE_NAME, expected=True) diff --git a/youtube_dl/extractor/kontrtube.py b/youtube_dl/extractor/kontrtube.py index 1b45b67b0..5341ac773 100644 --- a/youtube_dl/extractor/kontrtube.py +++ b/youtube_dl/extractor/kontrtube.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import int_or_none class KontrTubeIE(InfoExtractor): @@ -32,27 +33,26 @@ class KontrTubeIE(InfoExtractor): video_url = self._html_search_regex(r"video_url: '(.+?)/?',", webpage, 'video URL') thumbnail = self._html_search_regex(r"preview_url: '(.+?)/?',", webpage, 'video thumbnail', fatal=False) - title = self._html_search_regex(r'<title>(.+?) - Труба зовёт - Интересный видеохостинг</title>', webpage, - 'video title') + title = self._html_search_regex( + r'<title>(.+?) - Труба зовёт - Интересный видеохостинг</title>', webpage, 'video title') description = self._html_search_meta('description', webpage, 'video description') - mobj = re.search(r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>', - webpage) + mobj = re.search( + r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>', webpage) duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None - view_count = self._html_search_regex(r'<div class="col_2">Просмотров: <span>(\d+)</span></div>', webpage, - 'view count', fatal=False) - view_count = int(view_count) if view_count is not None else None + view_count = self._html_search_regex( + r'<div class="col_2">Просмотров: <span>(\d+)</span></div>', webpage, 'view count', fatal=False) comment_count = None - comment_str = self._html_search_regex(r'Комментарии: <span>([^<]+)</span>', webpage, 'comment count', - fatal=False) + comment_str = self._html_search_regex( + r'Комментарии: <span>([^<]+)</span>', webpage, 'comment count', fatal=False) if comment_str.startswith('комментариев нет'): comment_count = 0 else: mobj = re.search(r'\d+ из (?P<total>\d+) комментариев', comment_str) if mobj: - comment_count = int(mobj.group('total')) + comment_count = mobj.group('total') return { 'id': video_id, @@ -61,6 +61,6 @@ class KontrTubeIE(InfoExtractor): 'title': title, 'description': description, 'duration': duration, - 'view_count': view_count, - 'comment_count': comment_count, + 'view_count': int_or_none(view_count), + 'comment_count': int_or_none(comment_count), }
\ No newline at end of file diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index 301031197..6436c05a3 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -9,104 +11,103 @@ from ..utils import ( ExtractorError, ) -class MetacafeIE(InfoExtractor): - """Information Extractor for metacafe.com.""" - _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' +class MetacafeIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' _DISCLAIMER = 'http://www.metacafe.com/family_filter/' _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' - IE_NAME = u'metacafe' + IE_NAME = 'metacafe' _TESTS = [ - # Youtube video - { - u"add_ie": ["Youtube"], - u"url": u"http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/", - u"file": u"_aUehQsCQtM.mp4", - u"info_dict": { - u"upload_date": u"20090102", - u"title": u"The Electric Company | \"Short I\" | PBS KIDS GO!", - u"description": u"md5:2439a8ef6d5a70e380c22f5ad323e5a8", - u"uploader": u"PBS", - u"uploader_id": u"PBS" - } - }, - # Normal metacafe video - { - u'url': u'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/', - u'md5': u'6e0bca200eaad2552e6915ed6fd4d9ad', - u'info_dict': { - u'id': u'11121940', - u'ext': u'mp4', - u'title': u'News: Stuff You Won\'t Do with Your PlayStation 4', - u'uploader': u'ign', - u'description': u'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.', + # Youtube video + { + 'add_ie': ['Youtube'], + 'url': 'http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/', + 'info_dict': { + 'id': '_aUehQsCQtM', + 'ext': 'mp4', + 'upload_date': '20090102', + 'title': 'The Electric Company | "Short I" | PBS KIDS GO!', + 'description': 'md5:2439a8ef6d5a70e380c22f5ad323e5a8', + 'uploader': 'PBS', + 'uploader_id': 'PBS' + } }, - }, - # AnyClip video - { - u"url": u"http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/", - u"file": u"an-dVVXnuY7Jh77J.mp4", - u"info_dict": { - u"title": u"The Andromeda Strain (1971): Stop the Bomb Part 3", - u"uploader": u"anyclip", - u"description": u"md5:38c711dd98f5bb87acf973d573442e67", + # Normal metacafe video + { + 'url': 'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/', + 'md5': '6e0bca200eaad2552e6915ed6fd4d9ad', + 'info_dict': { + 'id': '11121940', + 'ext': 'mp4', + 'title': 'News: Stuff You Won\'t Do with Your PlayStation 4', + 'uploader': 'ign', + 'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.', + }, }, - }, - # age-restricted video - { - u'url': u'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/', - u'md5': u'98dde7c1a35d02178e8ab7560fe8bd09', - u'info_dict': { - u'id': u'5186653', - u'ext': u'mp4', - u'title': u'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.', - u'uploader': u'Dwayne Pipe', - u'description': u'md5:950bf4c581e2c059911fa3ffbe377e4b', - u'age_limit': 18, + # AnyClip video + { + 'url': 'http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/', + 'info_dict': { + 'id': 'an-dVVXnuY7Jh77J', + 'ext': 'mp4', + 'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3', + 'uploader': 'anyclip', + 'description': 'md5:38c711dd98f5bb87acf973d573442e67', + }, }, - }, - # cbs video - { - u'url': u'http://www.metacafe.com/watch/cb-0rOxMBabDXN6/samsung_galaxy_note_2_samsungs_next_generation_phablet/', - u'info_dict': { - u'id': u'0rOxMBabDXN6', - u'ext': u'flv', - u'title': u'Samsung Galaxy Note 2: Samsung\'s next-generation phablet', - u'description': u'md5:54d49fac53d26d5a0aaeccd061ada09d', - u'duration': 129, + # age-restricted video + { + 'url': 'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/', + 'md5': '98dde7c1a35d02178e8ab7560fe8bd09', + 'info_dict': { + 'id': '5186653', + 'ext': 'mp4', + 'title': 'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.', + 'uploader': 'Dwayne Pipe', + 'description': 'md5:950bf4c581e2c059911fa3ffbe377e4b', + 'age_limit': 18, + }, }, - u'params': { - # rtmp download - u'skip_download': True, + # cbs video + { + 'url': 'http://www.metacafe.com/watch/cb-8VD4r_Zws8VP/open_this_is_face_the_nation_february_9/', + 'info_dict': { + 'id': '8VD4r_Zws8VP', + 'ext': 'flv', + 'title': 'Open: This is Face the Nation, February 9', + 'description': 'md5:8a9ceec26d1f7ed6eab610834cc1a476', + 'duration': 96, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, }, - }, ] - def report_disclaimer(self): - """Report disclaimer retrieval.""" - self.to_screen(u'Retrieving disclaimer') + self.to_screen('Retrieving disclaimer') def _real_initialize(self): # Retrieve disclaimer self.report_disclaimer() - self._download_webpage(self._DISCLAIMER, None, False, u'Unable to retrieve disclaimer') + self._download_webpage(self._DISCLAIMER, None, False, 'Unable to retrieve disclaimer') # Confirm age disclaimer_form = { 'filters': '0', 'submit': "Continue - I'm over 18", - } + } request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') self.report_age_confirmation() - self._download_webpage(request, None, False, u'Unable to confirm age') + self._download_webpage(request, None, False, 'Unable to confirm age') def _real_extract(self, url): # Extract id and simplified title from URL mobj = re.match(self._VALID_URL, url) if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) + raise ExtractorError('Invalid URL: %s' % url) video_id = mobj.group(1) @@ -153,23 +154,24 @@ class MetacafeIE(InfoExtractor): else: mobj = re.search(r' name="flashvars" value="(.*?)"', webpage) if mobj is None: - raise ExtractorError(u'Unable to extract media URL') + raise ExtractorError('Unable to extract media URL') vardict = compat_parse_qs(mobj.group(1)) if 'mediaData' not in vardict: - raise ExtractorError(u'Unable to extract media URL') - mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0]) + raise ExtractorError('Unable to extract media URL') + mobj = re.search( + r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0]) if mobj is None: - raise ExtractorError(u'Unable to extract media URL') + raise ExtractorError('Unable to extract media URL') mediaURL = mobj.group('mediaURL').replace('\\/', '/') video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key')) video_ext = determine_ext(video_url) - video_title = self._html_search_regex(r'(?im)<title>(.*) - Video</title>', webpage, u'title') + video_title = self._html_search_regex(r'(?im)<title>(.*) - Video</title>', webpage, 'title') description = self._og_search_description(webpage) thumbnail = self._og_search_thumbnail(webpage) video_uploader = self._html_search_regex( r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);', - webpage, u'uploader nickname', fatal=False) + webpage, 'uploader nickname', fatal=False) if re.search(r'"contentRating":"restricted"', webpage) is not None: age_limit = 18 @@ -177,14 +179,12 @@ class MetacafeIE(InfoExtractor): age_limit = 0 return { - '_type': 'video', - 'id': video_id, - 'url': video_url, + 'id': video_id, + 'url': video_url, 'description': description, 'uploader': video_uploader, - 'upload_date': None, - 'title': video_title, + 'title': video_title, 'thumbnail':thumbnail, - 'ext': video_ext, + 'ext': video_ext, 'age_limit': age_limit, } diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py index 1d7aa40ed..b8c892cce 100644 --- a/youtube_dl/extractor/ninegag.py +++ b/youtube_dl/extractor/ninegag.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals -import json import re from .common import InfoExtractor @@ -12,8 +11,9 @@ class NineGagIE(InfoExtractor): _TEST = { "url": "http://9gag.tv/v/1912", - "file": "1912.mp4", "info_dict": { + "id": "1912", + "ext": "mp4", "description": "This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)", "title": "\"People Are Awesome 2013\" Is Absolutely Awesome", "view_count": int, diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 44312ba4e..e20327791 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -1,20 +1,23 @@ +from __future__ import unicode_literals import re import json from .common import InfoExtractor from ..utils import unescapeHTML + class OoyalaIE(InfoExtractor): _VALID_URL = r'https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=(?P<id>.+?)(&|$)' _TEST = { # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video - u'url': u'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', - u'file': u'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8.mp4', - u'md5': u'3f5cceb3a7bf461d6c29dc466cf8033c', - u'info_dict': { - u'title': u'Explaining Data Recovery from Hard Drives and SSDs', - u'description': u'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.', + 'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', + 'md5': '3f5cceb3a7bf461d6c29dc466cf8033c', + 'info_dict': { + 'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', + 'ext': 'mp4', + 'title': 'Explaining Data Recovery from Hard Drives and SSDs', + 'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.', }, } @@ -28,13 +31,14 @@ class OoyalaIE(InfoExtractor): ie=cls.ie_key()) def _extract_result(self, info, more_info): - return {'id': info['embedCode'], - 'ext': 'mp4', - 'title': unescapeHTML(info['title']), - 'url': info.get('ipad_url') or info['url'], - 'description': unescapeHTML(more_info['description']), - 'thumbnail': more_info['promo'], - } + return { + 'id': info['embedCode'], + 'ext': 'mp4', + 'title': unescapeHTML(info['title']), + 'url': info.get('ipad_url') or info['url'], + 'description': unescapeHTML(more_info['description']), + 'thumbnail': more_info['promo'], + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -42,22 +46,23 @@ class OoyalaIE(InfoExtractor): player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embedCode player = self._download_webpage(player_url, embedCode) mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="', - player, u'mobile player url') + player, 'mobile player url') mobile_player = self._download_webpage(mobile_url, embedCode) videos_info = self._search_regex( r'var streams=window.oo_testEnv\?\[\]:eval\("\((\[{.*?}\])\)"\);', - mobile_player, u'info').replace('\\"','"') - videos_more_info = self._search_regex(r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, u'more info').replace('\\"','"') + mobile_player, 'info').replace('\\"','"') + videos_more_info = self._search_regex(r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, 'more info').replace('\\"','"') videos_info = json.loads(videos_info) videos_more_info =json.loads(videos_more_info) if videos_more_info.get('lineup'): videos = [self._extract_result(info, more_info) for (info, more_info) in zip(videos_info, videos_more_info['lineup'])] - return {'_type': 'playlist', - 'id': embedCode, - 'title': unescapeHTML(videos_more_info['title']), - 'entries': videos, - } + return { + '_type': 'playlist', + 'id': embedCode, + 'title': unescapeHTML(videos_more_info['title']), + 'entries': videos, + } else: return self._extract_result(videos_info[0], videos_more_info) diff --git a/youtube_dl/extractor/parliamentliveuk.py b/youtube_dl/extractor/parliamentliveuk.py new file mode 100644 index 000000000..0a423a08f --- /dev/null +++ b/youtube_dl/extractor/parliamentliveuk.py @@ -0,0 +1,53 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class ParliamentLiveUKIE(InfoExtractor): + IE_NAME = 'parliamentlive.tv' + IE_DESC = 'UK parliament videos' + _VALID_URL = r'https?://www\.parliamentlive\.tv/Main/Player\.aspx\?(?:[^&]+&)*?meetingId=(?P<id>[0-9]+)' + + _TEST = { + 'url': 'http://www.parliamentlive.tv/Main/Player.aspx?meetingId=15121&player=windowsmedia', + 'info_dict': { + 'id': '15121', + 'ext': 'asf', + 'title': 'hoc home affairs committee, 18 mar 2014.pm', + 'description': 'md5:033b3acdf83304cd43946b2d5e5798d1', + }, + 'params': { + 'skip_download': True, # Requires mplayer (mms) + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + + asx_url = self._html_search_regex( + r'embed.*?src="([^"]+)" name="MediaPlayer"', webpage, + 'metadata URL') + asx = self._download_xml(asx_url, video_id, 'Downloading ASX metadata') + video_url = asx.find('.//REF').attrib['HREF'] + + title = self._search_regex( + r'''(?x)player\.setClipDetails\( + (?:(?:[0-9]+|"[^"]+"),\s*){2} + "([^"]+",\s*"[^"]+)" + ''', + webpage, 'title').replace('", "', ', ') + description = self._html_search_regex( + r'(?s)<span id="MainContentPlaceHolder_CaptionsBlock_WitnessInfo">(.*?)</span>', + webpage, 'description') + + return { + 'id': video_id, + 'ext': 'asf', + 'url': video_url, + 'title': title, + 'description': description, + } diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index e7e0042fb..64cded707 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -3,6 +3,9 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ( + US_RATINGS, +) class PBSIE(InfoExtractor): @@ -13,7 +16,7 @@ class PBSIE(InfoExtractor): # Article with embedded player (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+)/?(?:$|[?\#]) | # Player - video\.pbs\.org/partnerplayer/(?P<player_id>[^/]+)/ + video\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/ ) ''' @@ -57,6 +60,11 @@ class PBSIE(InfoExtractor): info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id info = self._download_json(info_url, display_id) + rating_str = info.get('rating') + if rating_str is not None: + rating_str = rating_str.rpartition('-')[2] + age_limit = US_RATINGS.get(rating_str) + return { 'id': video_id, 'title': info['title'], @@ -65,4 +73,5 @@ class PBSIE(InfoExtractor): 'description': info['program'].get('description'), 'thumbnail': info.get('image_url'), 'duration': info.get('duration'), + 'age_limit': age_limit, } diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 834fe7266..7dd3dca0d 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -8,6 +8,7 @@ from ..utils import ( compat_urllib_parse_urlparse, compat_urllib_request, compat_urllib_parse, + str_to_int, ) from ..aes import ( aes_decrypt_text @@ -27,6 +28,12 @@ class PornHubIE(InfoExtractor): } } + def _extract_count(self, pattern, webpage, name): + count = self._html_search_regex(pattern, webpage, '%s count' % name, fatal=False) + if count: + count = str_to_int(count) + return count + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') @@ -37,11 +44,19 @@ class PornHubIE(InfoExtractor): webpage = self._download_webpage(req, video_id) video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title') - video_uploader = self._html_search_regex(r'<b>From: </b>(?:\s|<[^>]*>)*(.+?)<', webpage, 'uploader', fatal=False) + video_uploader = self._html_search_regex( + r'(?s)<div class="video-info-row">\s*From: .+?<(?:a href="/users/|<span class="username)[^>]+>(.+?)<', + webpage, 'uploader', fatal=False) thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False) if thumbnail: thumbnail = compat_urllib_parse.unquote(thumbnail) + view_count = self._extract_count(r'<span class="count">([\d,\.]+)</span> views', webpage, 'view') + like_count = self._extract_count(r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like') + dislike_count = self._extract_count(r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike') + comment_count = self._extract_count( + r'All comments \(<var class="videoCommentCount">([\d,\.]+)</var>', webpage, 'comment') + video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage))) if webpage.find('"encrypted":true') != -1: password = compat_urllib_parse.unquote_plus(self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password')) @@ -77,6 +92,10 @@ class PornHubIE(InfoExtractor): 'uploader': video_uploader, 'title': video_title, 'thumbnail': thumbnail, + 'view_count': view_count, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'comment_count': comment_count, 'formats': formats, 'age_limit': 18, } diff --git a/youtube_dl/extractor/radiofrance.py b/youtube_dl/extractor/radiofrance.py index 34652f6c1..09352ed82 100644 --- a/youtube_dl/extractor/radiofrance.py +++ b/youtube_dl/extractor/radiofrance.py @@ -1,4 +1,6 @@ # coding: utf-8 +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -6,16 +8,17 @@ from .common import InfoExtractor class RadioFranceIE(InfoExtractor): _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)' - IE_NAME = u'radiofrance' + IE_NAME = 'radiofrance' _TEST = { - u'url': u'http://maison.radiofrance.fr/radiovisions/one-one', - u'file': u'one-one.ogg', - u'md5': u'bdbb28ace95ed0e04faab32ba3160daf', - u'info_dict': { - u"title": u"One to one", - u"description": u"Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.", - u"uploader": u"Thomas Hercouët", + 'url': 'http://maison.radiofrance.fr/radiovisions/one-one', + 'md5': 'bdbb28ace95ed0e04faab32ba3160daf', + 'info_dict': { + 'id': 'one-one', + 'ext': 'ogg', + "title": "One to one", + "description": "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.", + "uploader": "Thomas Hercouët", }, } @@ -24,27 +27,28 @@ class RadioFranceIE(InfoExtractor): video_id = m.group('id') webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, u'title') + title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title') description = self._html_search_regex( r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>', - webpage, u'description', fatal=False) + webpage, 'description', fatal=False) uploader = self._html_search_regex( r'<div class="credit"> © (.*?)</div>', - webpage, u'uploader', fatal=False) + webpage, 'uploader', fatal=False) formats_str = self._html_search_regex( r'class="jp-jplayer[^"]*" data-source="([^"]+)">', - webpage, u'audio URLs') + webpage, 'audio URLs') formats = [ { 'format_id': fm[0], 'url': fm[1], 'vcodec': 'none', + 'preference': i, } - for fm in - re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str) + for i, fm in + enumerate(re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str)) ] - # No sorting, we don't know any more about these formats + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py new file mode 100644 index 000000000..f211637a7 --- /dev/null +++ b/youtube_dl/extractor/rts.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601, + unescapeHTML, +) + + +class RTSIE(InfoExtractor): + IE_DESC = 'RTS.ch' + _VALID_URL = r'^https?://(?:www\.)?rts\.ch/archives/tv/[^/]+/(?P<id>[0-9]+)-.*?\.html' + + _TEST = { + 'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html', + 'md5': '753b877968ad8afaeddccc374d4256a5', + 'info_dict': { + 'id': '3449373', + 'ext': 'mp4', + 'duration': 1488, + 'title': 'Les Enfants Terribles', + 'description': 'France Pommier et sa soeur Luce Feral, les deux filles de ce groupe de 5.', + 'uploader': 'Divers', + 'upload_date': '19680921', + 'timestamp': -40280400, + }, + } + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + + all_info = self._download_json( + 'http://www.rts.ch/a/%s.html?f=json/article' % video_id, video_id) + info = all_info['video']['JSONinfo'] + + upload_timestamp = parse_iso8601(info.get('broadcast_date')) + duration = parse_duration(info.get('duration')) + thumbnail = unescapeHTML(info.get('preview_image_url')) + formats = [{ + 'format_id': fid, + 'url': furl, + 'tbr': int_or_none(self._search_regex( + r'-([0-9]+)k\.', furl, 'bitrate', default=None)), + } for fid, furl in info['streams'].items()] + self._sort_formats(formats) + + return { + 'id': video_id, + 'formats': formats, + 'title': info['title'], + 'description': info.get('intro'), + 'duration': duration, + 'uploader': info.get('programName'), + 'timestamp': upload_timestamp, + } diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 2f254f023..8893699aa 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -100,7 +100,7 @@ class SoundcloudIE(InfoExtractor): def report_resolve(self, video_id): """Report information extraction.""" - self.to_screen(u'%s: Resolving id' % video_id) + self.to_screen('%s: Resolving id' % video_id) @classmethod def _resolv_url(cls, url): @@ -124,45 +124,46 @@ class SoundcloudIE(InfoExtractor): 'description': info['description'], 'thumbnail': thumbnail, } + formats = [] if info.get('downloadable', False): # We can build a direct link to the song format_url = ( 'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format( track_id, self._CLIENT_ID)) - result['formats'] = [{ + formats.append({ 'format_id': 'download', 'ext': info.get('original_format', 'mp3'), 'url': format_url, 'vcodec': 'none', - }] - else: - # We have to retrieve the url - streams_url = ('http://api.soundcloud.com/i1/tracks/{0}/streams?' - 'client_id={1}&secret_token={2}'.format(track_id, self._IPHONE_CLIENT_ID, secret_token)) - stream_json = self._download_webpage( - streams_url, - track_id, 'Downloading track url') - - formats = [] - format_dict = json.loads(stream_json) - for key, stream_url in format_dict.items(): - if key.startswith(u'http'): - formats.append({ - 'format_id': key, - 'ext': ext, - 'url': stream_url, - 'vcodec': 'none', - }) - elif key.startswith(u'rtmp'): - # The url doesn't have an rtmp app, we have to extract the playpath - url, path = stream_url.split('mp3:', 1) - formats.append({ - 'format_id': key, - 'url': url, - 'play_path': 'mp3:' + path, - 'ext': ext, - 'vcodec': 'none', - }) + 'preference': 10, + }) + + # We have to retrieve the url + streams_url = ('http://api.soundcloud.com/i1/tracks/{0}/streams?' + 'client_id={1}&secret_token={2}'.format(track_id, self._IPHONE_CLIENT_ID, secret_token)) + stream_json = self._download_webpage( + streams_url, + track_id, 'Downloading track url') + + format_dict = json.loads(stream_json) + for key, stream_url in format_dict.items(): + if key.startswith('http'): + formats.append({ + 'format_id': key, + 'ext': ext, + 'url': stream_url, + 'vcodec': 'none', + }) + elif key.startswith('rtmp'): + # The url doesn't have an rtmp app, we have to extract the playpath + url, path = stream_url.split('mp3:', 1) + formats.append({ + 'format_id': key, + 'url': url, + 'play_path': 'mp3:' + path, + 'ext': ext, + 'vcodec': 'none', + }) if not formats: # We fallback to the stream_url in the original info, this @@ -188,7 +189,7 @@ class SoundcloudIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) + raise ExtractorError('Invalid URL: %s' % url) track_id = mobj.group('track_id') token = None @@ -226,7 +227,7 @@ class SoundcloudSetIE(SoundcloudIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) + raise ExtractorError('Invalid URL: %s' % url) # extract uploader (which is in the url) uploader = mobj.group(1) @@ -243,7 +244,7 @@ class SoundcloudSetIE(SoundcloudIE): info = json.loads(info_json) if 'errors' in info: for err in info['errors']: - self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message'])) + self._downloader.report_error('unable to download video webpage: %s' % compat_str(err['error_message'])) return self.report_extraction(full_title) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 3968b718e..ad1a46c33 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -11,7 +11,9 @@ from ..utils import ( class TEDIE(SubtitlesInfoExtractor): - _VALID_URL = r'''(?x)http://www\.ted\.com/ + _VALID_URL = r'''(?x) + (?P<proto>https?://) + (?P<type>www|embed)(?P<urlmain>\.ted\.com/ ( (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist | @@ -19,6 +21,7 @@ class TEDIE(SubtitlesInfoExtractor): ) (/lang/(.*?))? # The url may contain the language /(?P<name>\w+) # Here goes the name and then ".html" + .*)$ ''' _TEST = { 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html', @@ -48,6 +51,9 @@ class TEDIE(SubtitlesInfoExtractor): def _real_extract(self, url): m = re.match(self._VALID_URL, url, re.VERBOSE) + if m.group('type') == 'embed': + desktop_url = m.group('proto') + 'www' + m.group('urlmain') + return self.url_result(desktop_url, 'TED') name = m.group('name') if m.group('type_talk'): return self._talk_info(url, name) diff --git a/youtube_dl/extractor/toypics.py b/youtube_dl/extractor/toypics.py new file mode 100644 index 000000000..34008afc6 --- /dev/null +++ b/youtube_dl/extractor/toypics.py @@ -0,0 +1,75 @@ +from .common import InfoExtractor +import re + + +class ToypicsIE(InfoExtractor): + IE_DESC = 'Toypics user profile' + _VALID_URL = r'http://videos\.toypics\.net/view/(?P<id>[0-9]+)/.*' + _TEST = { + 'url': 'http://videos.toypics.net/view/514/chancebulged,-2-1/', + 'md5': '16e806ad6d6f58079d210fe30985e08b', + 'info_dict': { + 'id': '514', + 'ext': 'mp4', + 'title': 'Chance-Bulge\'d, 2', + 'age_limit': 18, + 'uploader': 'kidsune', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + page = self._download_webpage(url, video_id) + video_url = self._html_search_regex( + r'src:\s+"(http://static[0-9]+\.toypics\.net/flvideo/[^"]+)"', page, 'video URL') + title = self._html_search_regex( + r'<title>Toypics - ([^<]+)</title>', page, 'title') + username = self._html_search_regex( + r'toypics.net/([^/"]+)" class="user-name">', page, 'username') + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'uploader': username, + 'age_limit': 18, + } + + +class ToypicsUserIE(InfoExtractor): + IE_DESC = 'Toypics user profile' + _VALID_URL = r'http://videos\.toypics\.net/(?P<username>[^/?]+)(?:$|[?#])' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + username = mobj.group('username') + + profile_page = self._download_webpage( + url, username, note='Retrieving profile page') + + video_count = int(self._search_regex( + r'public/">Public Videos \(([0-9]+)\)</a></li>', profile_page, + 'video count')) + + PAGE_SIZE = 8 + urls = [] + page_count = (video_count + PAGE_SIZE + 1) // PAGE_SIZE + for n in range(1, page_count + 1): + lpage_url = url + '/public/%d' % n + lpage = self._download_webpage( + lpage_url, username, + note='Downloading page %d/%d' % (n, page_count)) + urls.extend( + re.findall( + r'<p class="video-entry-title">\n\s*<a href="(http://videos.toypics.net/view/[^"]+)">', + lpage)) + + return { + '_type': 'playlist', + 'id': username, + 'entries': [{ + '_type': 'url', + 'url': eurl, + 'ie_key': 'Toypics', + } for eurl in urls] + } diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py new file mode 100644 index 000000000..ebd2a3dca --- /dev/null +++ b/youtube_dl/extractor/videolecturesnet.py @@ -0,0 +1,70 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + find_xpath_attr, + int_or_none, + parse_duration, + unified_strdate, +) + + +class VideoLecturesNetIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)/' + IE_NAME = 'videolectures.net' + + _TEST = { + 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/', + 'info_dict': { + 'id': 'promogram_igor_mekjavic_eng', + 'ext': 'mp4', + 'title': 'Automatics, robotics and biocybernetics', + 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', + 'upload_date': '20130627', + 'duration': 565, + 'thumbnail': 're:http://.*\.jpg', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + smil_url = 'http://videolectures.net/%s/video/1/smil.xml' % video_id + smil = self._download_xml(smil_url, video_id) + + title = find_xpath_attr(smil, './/meta', 'name', 'title').attrib['content'] + description_el = find_xpath_attr(smil, './/meta', 'name', 'abstract') + description = ( + None if description_el is None + else description_el.attrib['content']) + upload_date = unified_strdate( + find_xpath_attr(smil, './/meta', 'name', 'date').attrib['content']) + + switch = smil.find('.//switch') + duration = parse_duration(switch.attrib.get('dur')) + thumbnail_el = find_xpath_attr(switch, './image', 'type', 'thumbnail') + thumbnail = ( + None if thumbnail_el is None else thumbnail_el.attrib.get('src')) + + formats = [{ + 'url': v.attrib['src'], + 'width': int_or_none(v.attrib.get('width')), + 'height': int_or_none(v.attrib.get('height')), + 'filesize': int_or_none(v.attrib.get('size')), + 'tbr': int_or_none(v.attrib.get('systemBitrate')) / 1000.0, + 'ext': v.attrib.get('ext'), + } for v in switch.findall('./video') + if v.attrib.get('proto') == 'http'] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'upload_date': upload_date, + 'duration': duration, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 2206a06d5..15f315298 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -1,29 +1,33 @@ +from __future__ import unicode_literals + import re from ..utils import ( ExtractorError, unescapeHTML, unified_strdate, + US_RATINGS, ) from .subtitles import SubtitlesInfoExtractor class VikiIE(SubtitlesInfoExtractor): - IE_NAME = u'viki' + IE_NAME = 'viki' _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)' _TEST = { - u'url': u'http://www.viki.com/videos/1023585v-heirs-episode-14', - u'file': u'1023585v.mp4', - u'md5': u'a21454021c2646f5433514177e2caa5f', - u'info_dict': { - u'title': u'Heirs Episode 14', - u'uploader': u'SBS', - u'description': u'md5:c4b17b9626dd4b143dcc4d855ba3474e', - u'upload_date': u'20131121', - u'age_limit': 13, + 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14', + 'md5': 'a21454021c2646f5433514177e2caa5f', + 'info_dict': { + 'id': '1023585v', + 'ext': 'mp4', + 'title': 'Heirs Episode 14', + 'uploader': 'SBS', + 'description': 'md5:c4b17b9626dd4b143dcc4d855ba3474e', + 'upload_date': '20131121', + 'age_limit': 13, }, - u'skip': u'Blocked in the US', + 'skip': 'Blocked in the US', } def _real_extract(self, url): @@ -44,28 +48,21 @@ class VikiIE(SubtitlesInfoExtractor): rating_str = self._html_search_regex( r'<strong>Rating: </strong>\s*([^<]*)<', webpage, - u'rating information', default='').strip() - RATINGS = { - 'G': 0, - 'PG': 10, - 'PG-13': 13, - 'R': 16, - 'NC': 18, - } - age_limit = RATINGS.get(rating_str) + 'rating information', default='').strip() + age_limit = US_RATINGS.get(rating_str) info_url = 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id info_webpage = self._download_webpage( - info_url, video_id, note=u'Downloading info page') + info_url, video_id, note='Downloading info page') if re.match(r'\s*<div\s+class="video-error', info_webpage): raise ExtractorError( - u'Video %s is blocked from your location.' % video_id, + 'Video %s is blocked from your location.' % video_id, expected=True) video_url = self._html_search_regex( - r'<source[^>]+src="([^"]+)"', info_webpage, u'video URL') + r'<source[^>]+src="([^"]+)"', info_webpage, 'video URL') upload_date_str = self._html_search_regex( - r'"created_at":"([^"]+)"', info_webpage, u'upload date') + r'"created_at":"([^"]+)"', info_webpage, 'upload date') upload_date = ( unified_strdate(upload_date_str) if upload_date_str is not None diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py index fc9237a3f..4e89acd81 100644 --- a/youtube_dl/extractor/worldstarhiphop.py +++ b/youtube_dl/extractor/worldstarhiphop.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -7,14 +9,14 @@ class WorldStarHipHopIE(InfoExtractor): _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)' _TEST = { "url": "http://www.worldstarhiphop.com/videos/video.php?v=wshh6a7q1ny0G34ZwuIO", - "file": "wshh6a7q1ny0G34ZwuIO.mp4", "md5": "9d04de741161603bf7071bbf4e883186", "info_dict": { + "id": "wshh6a7q1ny0G34ZwuIO", + "ext": "mp4", "title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!" } } - def _real_extract(self, url): m = re.match(self._VALID_URL, url) video_id = m.group('id') @@ -23,41 +25,32 @@ class WorldStarHipHopIE(InfoExtractor): m_vevo_id = re.search(r'videoId=(.*?)&?', webpage_src) - if m_vevo_id is not None: - self.to_screen(u'Vevo video detected:') return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo') - video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)', - webpage_src, u'video URL') + video_url = self._search_regex( + r'so\.addVariable\("file","(.*?)"\)', webpage_src, 'video URL') if 'youtube' in video_url: - self.to_screen(u'Youtube video detected:') return self.url_result(video_url, ie='Youtube') - if 'mp4' in video_url: - ext = 'mp4' - else: - ext = 'flv' - - video_title = self._html_search_regex(r"<title>(.*)</title>", - webpage_src, u'title') + video_title = self._html_search_regex( + r"<title>(.*)</title>", webpage_src, 'title') # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. - thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />', - webpage_src, u'thumbnail', fatal=False) - + thumbnail = self._html_search_regex( + r'rel="image_src" href="(.*)" />', webpage_src, 'thumbnail', + fatal=False) if not thumbnail: _title = r"""candytitles.*>(.*)</span>""" mobj = re.search(_title, webpage_src) if mobj is not None: video_title = mobj.group(1) - results = [{ - 'id': video_id, - 'url' : video_url, - 'title' : video_title, - 'thumbnail' : thumbnail, - 'ext' : ext, - }] - return results + return { + 'id': video_id, + 'url': video_url, + 'title': video_title, + 'thumbnail': thumbnail, + } + diff --git a/youtube_dl/extractor/xbef.py b/youtube_dl/extractor/xbef.py new file mode 100644 index 000000000..71bd7c463 --- /dev/null +++ b/youtube_dl/extractor/xbef.py @@ -0,0 +1,50 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, +) + + +class XBefIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?xbef\.com/video/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://xbef.com/video/5119-glamourous-lesbians-smoking-drinking-and-fucking', + 'md5': 'a478b565baff61634a98f5e5338be995', + 'info_dict': { + 'id': '5119', + 'ext': 'mp4', + 'title': 'md5:7358a9faef8b7b57acda7c04816f170e', + 'age_limit': 18, + 'thumbnail': 're:^http://.*\.jpg', + } + } + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex( + r'<h1[^>]*>(.*?)</h1>', webpage, 'title') + + config_url_enc = self._download_webpage( + 'http://xbef.com/Main/GetVideoURLEncoded/%s' % video_id, video_id, + note='Retrieving config URL') + config_url = compat_urllib_parse.unquote(config_url_enc) + config = self._download_xml( + config_url, video_id, note='Retrieving config') + + video_url = config.find('./file').text + thumbnail = config.find('./image').text + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'thumbnail': thumbnail, + 'age_limit': 18, + } + diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index d3eefd086..b293e2665 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -1,11 +1,10 @@ from __future__ import unicode_literals -import os import re +import json from .common import InfoExtractor from ..utils import ( - compat_urllib_parse_urlparse, compat_urllib_request, parse_duration, str_to_int, @@ -42,7 +41,6 @@ class XTubeIE(InfoExtractor): r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, 'uploader', fatal=False) video_description = self._html_search_regex( r'<p class="fieldsDesc">([^<]+)', webpage, 'description', fatal=False) - video_url = self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, 'video_url').replace('\\/', '/') duration = parse_duration(self._html_search_regex( r'<span class="bold">Runtime:</span> ([^<]+)</p>', webpage, 'duration', fatal=False)) view_count = self._html_search_regex( @@ -54,12 +52,18 @@ class XTubeIE(InfoExtractor): if comment_count: comment_count = str_to_int(comment_count) - path = compat_urllib_parse_urlparse(video_url).path - extension = os.path.splitext(path)[1][1:] - format = path.split('/')[5].split('_')[:2] - format[0] += 'p' - format[1] += 'k' - format = "-".join(format) + player_quality_option = json.loads(self._html_search_regex( + r'playerQualityOption = ({.+?});', webpage, 'player quality option')) + + QUALITIES = ['3gp', 'mp4_normal', 'mp4_high', 'flv', 'mp4_ultra', 'mp4_720', 'mp4_1080'] + formats = [ + { + 'url': furl, + 'format_id': format_id, + 'preference': QUALITIES.index(format_id) if format_id in QUALITIES else -1, + } for format_id, furl in player_quality_option.items() + ] + self._sort_formats(formats) return { 'id': video_id, @@ -69,9 +73,42 @@ class XTubeIE(InfoExtractor): 'duration': duration, 'view_count': view_count, 'comment_count': comment_count, - 'url': video_url, - 'ext': extension, - 'format': format, - 'format_id': format, + 'formats': formats, 'age_limit': 18, } + +class XTubeUserIE(InfoExtractor): + IE_DESC = 'XTube user profile' + _VALID_URL = r'https?://(?:www\.)?xtube\.com/community/profile\.php\?(.*?)user=(?P<username>[^&#]+)(?:$|[&#])' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + username = mobj.group('username') + + profile_page = self._download_webpage( + url, username, note='Retrieving profile page') + + video_count = int(self._search_regex( + r'<strong>%s\'s Videos \(([0-9]+)\)</strong>'%username, profile_page, + 'video count')) + + PAGE_SIZE = 25 + urls = [] + page_count = (video_count + PAGE_SIZE + 1) // PAGE_SIZE + for n in range(1, page_count + 1): + lpage_url = 'http://www.xtube.com/user_videos.php?page=%d&u=%s' % (n, username) + lpage = self._download_webpage( + lpage_url, username, + note='Downloading page %d/%d' % (n, page_count)) + urls.extend( + re.findall(r'addthis:url="([^"]+)"', lpage)) + + return { + '_type': 'playlist', + 'id': username, + 'entries': [{ + '_type': 'url', + 'url': eurl, + 'ie_key': 'XTube', + } for eurl in urls] + } diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 77ad423c4..d456c4da5 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -1,3 +1,6 @@ +from __future__ import unicode_literals + + import json import re import sys @@ -17,24 +20,25 @@ from ..aes import ( class YouPornIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+))' + _VALID_URL = r'^(?P<proto>https?://)(?:www\.)?(?P<url>youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+))' _TEST = { - u'url': u'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', - u'file': u'505835.mp4', - u'md5': u'71ec5fcfddacf80f495efa8b6a8d9a89', - u'info_dict': { - u"upload_date": u"20101221", - u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?", - u"uploader": u"Ask Dan And Jennifer", - u"title": u"Sex Ed: Is It Safe To Masturbate Daily?", - u"age_limit": 18, + 'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', + 'md5': '71ec5fcfddacf80f495efa8b6a8d9a89', + 'info_dict': { + 'id': '505835', + 'ext': 'mp4', + 'upload_date': '20101221', + 'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?', + 'uploader': 'Ask Dan And Jennifer', + 'title': 'Sex Ed: Is It Safe To Masturbate Daily?', + 'age_limit': 18, } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') - url = 'http://www.' + mobj.group('url') + url = mobj.group('proto') + 'www.' + mobj.group('url') req = compat_urllib_request.Request(url) req.add_header('Cookie', 'age_verified=1') @@ -42,7 +46,7 @@ class YouPornIE(InfoExtractor): age_limit = self._rta_search(webpage) # Get JSON parameters - json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters') + json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, 'JSON parameters') try: params = json.loads(json_params) except: @@ -61,7 +65,7 @@ class YouPornIE(InfoExtractor): # Get all of the links from the page DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>' download_list_html = self._search_regex(DOWNLOAD_LIST_RE, - webpage, u'download list').strip() + webpage, 'download list').strip() LINK_RE = r'<a href="([^"]+)">' links = re.findall(LINK_RE, download_list_html) @@ -86,7 +90,7 @@ class YouPornIE(InfoExtractor): resolution = format_parts[0] height = int(resolution[:-len('p')]) bitrate = int(format_parts[1][:-len('k')]) - format = u'-'.join(format_parts) + u'-' + dn + format = '-'.join(format_parts) + '-' + dn formats.append({ 'url': video_url, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f7cb497a8..3a3a5a39e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -176,32 +176,32 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # 3d videos - '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20}, - '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20}, - '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20}, - '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20}, - '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20}, - '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20}, - '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20}, + '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20}, + '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20}, + '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20}, + '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20}, + '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20}, + '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20}, + '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20}, # Apple HTTP Live Streaming - '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10}, - '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10}, - '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10}, - '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10}, - '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10}, - '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10}, - '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10}, + '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10}, + '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10}, + '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10}, + '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10}, + '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10}, + '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10}, + '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10}, # DASH mp4 video - '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '138': {'ext': 'mp4', 'height': 2160, 'resolution': '2160p', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '264': {'ext': 'mp4', 'height': 1440, 'resolution': '1440p', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Dash mp4 audio '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50}, @@ -215,13 +215,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, - '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40}, - '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40}, - '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40}, - '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40}, - '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40}, - '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40}, - '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40}, + '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH webm', 'preference': -40}, + '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH webm', 'preference': -40}, + '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40}, + '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40}, + '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40}, + '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH webm', 'preference': -40}, + '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH webm', 'preference': -40}, # Dash webm audio '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50}, @@ -1130,14 +1130,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.') def _real_extract(self, url): + proto = ( + u'http' if self._downloader.params.get('prefer_insecure', False) + else u'https') + # Extract original video URL from URL with redirection, like age verification, using next_url parameter mobj = re.search(self._NEXT_URL_RE, url) if mobj: - url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/') + url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/') video_id = self.extract_id(url) # Get video webpage - url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id + url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id video_webpage = self._download_webpage(url, video_id) # Attempt to extract SWF player URL @@ -1162,7 +1166,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'asv': 3, 'sts':'1588', }) - video_info_url = 'https://www.youtube.com/get_video_info?' + data + video_info_url = proto + '://www.youtube.com/get_video_info?' + data video_info_webpage = self._download_webpage(video_info_url, video_id, note=False, errnote='unable to download video info webpage') @@ -1170,7 +1174,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): else: age_gate = False for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: - video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' + video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' % (video_id, el_type)) video_info_webpage = self._download_webpage(video_info_url, video_id, note=False, @@ -1445,7 +1449,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'duration': video_duration, 'age_limit': 18 if age_gate else 0, 'annotations': video_annotations, - 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id, + 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id, 'view_count': view_count, 'like_count': like_count, 'dislike_count': dislike_count, diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 3cf29e63a..68d590ba2 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1,11 +1,13 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import calendar import contextlib import ctypes import datetime import email.utils import errno +import getpass import gzip import itertools import io @@ -500,13 +502,13 @@ def orderedSet(iterable): res.append(el) return res + def unescapeHTML(s): - """ - @param s a string - """ - assert type(s) == type(u'') + if s is None: + return None + assert type(s) == compat_str - result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s) + result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s) return result @@ -760,8 +762,37 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): https_response = http_response +def parse_iso8601(date_str): + """ Return a UNIX timestamp from the given date """ + + if date_str is None: + return None + + m = re.search( + r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$', + date_str) + if not m: + timezone = datetime.timedelta() + else: + date_str = date_str[:-len(m.group(0))] + if not m.group('sign'): + timezone = datetime.timedelta() + else: + sign = 1 if m.group('sign') == '+' else -1 + timezone = datetime.timedelta( + hours=sign * int(m.group('hours')), + minutes=sign * int(m.group('minutes'))) + + dt = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S') - timezone + return calendar.timegm(dt.timetuple()) + + def unified_strdate(date_str): """Return a string with the date in the format YYYYMMDD""" + + if date_str is None: + return None + upload_date = None #Replace commas date_str = date_str.replace(',', ' ') @@ -1121,11 +1152,11 @@ def setproctitle(title): libc = ctypes.cdll.LoadLibrary("libc.so.6") except OSError: return - title = title - buf = ctypes.create_string_buffer(len(title) + 1) - buf.value = title.encode('utf-8') + title_bytes = title.encode('utf-8') + buf = ctypes.create_string_buffer(len(title_bytes)) + buf.value = title_bytes try: - libc.prctl(15, ctypes.byref(buf), 0, 0, 0) + libc.prctl(15, buf, 0, 0, 0) except AttributeError: return # Strange libc, just skip this @@ -1279,3 +1310,21 @@ def parse_xml(s): parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder()) kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {} return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs) + + +if sys.version_info < (3, 0) and sys.platform == 'win32': + def compat_getpass(prompt, *args, **kwargs): + if isinstance(prompt, compat_str): + prompt = prompt.encode(preferredencoding()) + return getpass.getpass(prompt, *args, **kwargs) +else: + compat_getpass = getpass.getpass + + +US_RATINGS = { + 'G': 0, + 'PG': 10, + 'PG-13': 13, + 'R': 16, + 'NC': 18, +} diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 525169e2a..3a936ef3e 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.03.17' +__version__ = '2014.03.24.1' |