([^<]+)

[^/]+)/(?P[^/?]+)[/?]?.*$' IE_NAME = u'escapist' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) showName = mobj.group('showname') videoId = mobj.group('episode') self.report_extraction(videoId) webpage = self._download_webpage(url, videoId) videoDesc = self._html_search_regex('[0-9]+)/(?P.*)$' IE_NAME = u'collegehumor' def report_manifest(self, video_id): """Report information extraction.""" self.to_screen(u'%s: Downloading XML manifest' % video_id) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('videoid') info = { 'id': video_id, 'uploader': None, 'upload_date': None, } self.report_extraction(video_id) xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id try: metaXml = compat_urllib_request.urlopen(xmlUrl).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err)) mdoc = xml.etree.ElementTree.fromstring(metaXml) try: videoNode = mdoc.findall('./video')[0] info['description'] = videoNode.findall('./description')[0].text info['title'] = videoNode.findall('./caption')[0].text info['thumbnail'] = videoNode.findall('./thumbnail')[0].text manifest_url = videoNode.findall('./file')[0].text except IndexError: raise ExtractorError(u'Invalid metadata XML file') manifest_url += '?hdcore=2.10.3' self.report_manifest(video_id) try: manifestXml = compat_urllib_request.urlopen(manifest_url).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err)) adoc = xml.etree.ElementTree.fromstring(manifestXml) try: media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0] node_id = media_node.attrib['url'] video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text except IndexError as err: raise ExtractorError(u'Invalid manifest file') url_pr = compat_urllib_parse_urlparse(manifest_url) url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1' info['url'] = url info['ext'] = 'f4f' return [info] class XVideosIE(InfoExtractor): """Information extractor for xvideos.com""" _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)' IE_NAME = u'xvideos' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group(1) webpage = self._download_webpage(url, video_id) self.report_extraction(video_id) # Extract video URL video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&', webpage, u'video URL')) # Extract title video_title = self._html_search_regex(r'(.*?)\s+-\s+XVID', webpage, u'title') # Extract video thumbnail video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage, u'thumbnail', fatal=False) info = { 'id': video_id, 'url': video_url, 'uploader': None, 'upload_date': None, 'title': video_title, 'ext': 'flv', 'thumbnail': video_thumbnail, 'description': None, } return [info] class SoundcloudIE(InfoExtractor): """Information extractor for soundcloud.com To access the media, the uid of the song and a stream token must be extracted from the page source and the script must make a request to media.soundcloud.com/crossdomain.xml. Then the media can be grabbed by requesting from an url composed of the stream token and uid """ _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)' IE_NAME = u'soundcloud' def report_resolve(self, video_id): """Report information extraction.""" self.to_screen(u'%s: Resolving id' % video_id) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) # extract uploader (which is in the url) uploader = mobj.group(1) # extract simple title (uploader + slug of song title) slug_title = mobj.group(2) simple_title = uploader + u'-' + slug_title full_title = '%s/%s' % (uploader, slug_title) self.report_resolve(full_title) url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title) resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28' info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON') info = json.loads(info_json) video_id = info['id'] self.report_extraction(full_title) streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28' stream_json = self._download_webpage(streams_url, full_title, u'Downloading stream definitions', u'unable to download stream definitions') streams = json.loads(stream_json) mediaURL = streams['http_mp3_128_url'] upload_date = unified_strdate(info['created_at']) return [{ 'id': info['id'], 'url': mediaURL, 'uploader': info['user']['username'], 'upload_date': upload_date, 'title': info['title'], 'ext': u'mp3', 'description': info['description'], }] class SoundcloudSetIE(InfoExtractor): """Information extractor for soundcloud.com sets To access the media, the uid of the song and a stream token must be extracted from the page source and the script must make a request to media.soundcloud.com/crossdomain.xml. Then the media can be grabbed by requesting from an url composed of the stream token and uid """ _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)' IE_NAME = u'soundcloud:set' def report_resolve(self, video_id): """Report information extraction.""" self.to_screen(u'%s: Resolving id' % video_id) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) # extract uploader (which is in the url) uploader = mobj.group(1) # extract simple title (uploader + slug of song title) slug_title = mobj.group(2) simple_title = uploader + u'-' + slug_title full_title = '%s/sets/%s' % (uploader, slug_title) self.report_resolve(full_title) url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title) resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28' info_json = self._download_webpage(resolv_url, full_title) videos = [] info = json.loads(info_json) if 'errors' in info: for err in info['errors']: self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message'])) return self.report_extraction(full_title) for track in info['tracks']: video_id = track['id'] streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28' stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON') self.report_extraction(video_id) streams = json.loads(stream_json) mediaURL = streams['http_mp3_128_url'] videos.append({ 'id': video_id, 'url': mediaURL, 'uploader': track['user']['username'], 'upload_date': unified_strdate(track['created_at']), 'title': track['title'], 'ext': u'mp3', 'description': track['description'], }) return videos class InfoQIE(InfoExtractor): """Information extractor for infoq.com""" _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) webpage = self._download_webpage(url, video_id=url) self.report_extraction(url) # Extract video URL mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage) if mobj is None: raise ExtractorError(u'Unable to extract video url') real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8')) video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id # Extract title video_title = self._search_regex(r'contentTitle = "(.*?)";', webpage, u'title') # Extract description video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage, u'description', fatal=False) video_filename = video_url.split('/')[-1] video_id, extension = video_filename.split('.') info = { 'id': video_id, 'url': video_url, 'uploader': None, 'upload_date': None, 'title': video_title, 'ext': extension, # Extension is always(?) mp4, but seems to be flv 'thumbnail': None, 'description': video_description, } return [info] class MixcloudIE(InfoExtractor): """Information extractor for www.mixcloud.com""" _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/ _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)' IE_NAME = u'mixcloud' def report_download_json(self, file_id): """Report JSON download.""" self.to_screen(u'Downloading json') def get_urls(self, jsonData, fmt, bitrate='best'): """Get urls from 'audio_formats' section in json""" file_url = None try: bitrate_list = jsonData[fmt] if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list: bitrate = max(bitrate_list) # select highest url_list = jsonData[fmt][bitrate] except TypeError: # we have no bitrate info. url_list = jsonData[fmt] return url_list def check_urls(self, url_list): """Returns 1st active url from list""" for url in url_list: try: compat_urllib_request.urlopen(url) return url except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: url = None return None def _print_formats(self, formats): print('Available formats:') for fmt in formats.keys(): for b in formats[fmt]: try: ext = formats[fmt][b][0] print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])) except TypeError: # we have no bitrate info ext = formats[fmt][0] print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])) break def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) # extract uploader & filename from url uploader = mobj.group(1).decode('utf-8') file_id = uploader + "-" + mobj.group(2).decode('utf-8') # construct API request file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json' # retrieve .json file with links to files request = compat_urllib_request.Request(file_url) try: self.report_download_json(file_url) jsonData = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err)) # parse JSON json_data = json.loads(jsonData) player_url = json_data['player_swf_url'] formats = dict(json_data['audio_formats']) req_format = self._downloader.params.get('format', None) bitrate = None if self._downloader.params.get('listformats', None): self._print_formats(formats) return if req_format is None or req_format == 'best': for format_param in formats.keys(): url_list = self.get_urls(formats, format_param) # check urls file_url = self.check_urls(url_list) if file_url is not None: break # got it! else: if req_format not in formats: raise ExtractorError(u'Format is not available') url_list = self.get_urls(formats, req_format) file_url = self.check_urls(url_list) format_param = req_format return [{ 'id': file_id.decode('utf-8'), 'url': file_url.decode('utf-8'), 'uploader': uploader.decode('utf-8'), 'upload_date': None, 'title': json_data['name'], 'ext': file_url.split('.')[-1].decode('utf-8'), 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), 'thumbnail': json_data['thumbnail_url'], 'description': json_data['description'], 'player_url': player_url.decode('utf-8'), }] class StanfordOpenClassroomIE(InfoExtractor): """Information extractor for Stanford's Open ClassRoom""" _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' IE_NAME = u'stanfordoc' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) if mobj.group('course') and mobj.group('video'): # A specific video course = mobj.group('course') video = mobj.group('video') info = { 'id': course + '_' + video, 'uploader': None, 'upload_date': None, } self.report_extraction(info['id']) baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/' xmlUrl = baseUrl + video + '.xml' try: metaXml = compat_urllib_request.urlopen(xmlUrl).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err)) mdoc = xml.etree.ElementTree.fromstring(metaXml) try: info['title'] = mdoc.findall('./title')[0].text info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text except IndexError: raise ExtractorError(u'Invalid metadata XML file') info['ext'] = info['url'].rpartition('.')[2] return [info] elif mobj.group('course'): # A course page course = mobj.group('course') info = { 'id': course, 'type': 'playlist', 'uploader': None, 'upload_date': None, } coursepage = self._download_webpage(url, info['id'], note='Downloading course info page', errnote='Unable to download course info page') info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) info['description'] = self._html_search_regex('<description>([^<]+)</description>', coursepage, u'description', fatal=False) links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) info['list'] = [ { 'type': 'reference', 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage), } for vpage in links] results = [] for entry in info['list']: assert entry['type'] == 'reference' results += self.extract(entry['url']) return results else: # Root page info = { 'id': 'Stanford OpenClassroom', 'type': 'playlist', 'uploader': None, 'upload_date': None, } self.report_download_webpage(info['id']) rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php' try: rootpage = compat_urllib_request.urlopen(rootURL).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: raise ExtractorError(u'Unable to download course info page: ' + compat_str(err)) info['title'] = info['id'] links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage)) info['list'] = [ { 'type': 'reference', 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage), } for cpage in links] results = [] for entry in info['list']: assert entry['type'] == 'reference' results += self.extract(entry['url']) return results class MTVIE(InfoExtractor): """Information extractor for MTV.com""" _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$' IE_NAME = u'mtv' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) if not mobj.group('proto'): url = 'http://' + url video_id = mobj.group('videoid') webpage = self._download_webpage(url, video_id) song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage, u'song name', fatal=False) video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>', webpage, u'title') mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage, u'mtvn_uri', fatal=False) content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage, u'content id', fatal=False) videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri self.report_extraction(video_id) request = compat_urllib_request.Request(videogen_url) try: metadataXml = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err)) mdoc = xml.etree.ElementTree.fromstring(metadataXml) renditions = mdoc.findall('.//rendition') # For now, always pick the highest quality. rendition = renditions[-1] try: _,_,ext = rendition.attrib['type'].partition('/') format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate'] video_url = rendition.find('./src').text except KeyError: raise ExtractorError('Invalid rendition field.') info = { 'id': video_id, 'url': video_url, 'uploader': performer, 'upload_date': None, 'title': video_title, 'ext': ext, 'format': format, } return [info] class YoukuIE(InfoExtractor): _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html' def _gen_sid(self): nowTime = int(time.time() * 1000) random1 = random.randint(1000,1998) random2 = random.randint(1000,9999) return "%d%d%d" %(nowTime,random1,random2) def _get_file_ID_mix_string(self, seed): mixed = [] source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890") seed = float(seed) for i in range(len(source)): seed = (seed * 211 + 30031 ) % 65536 index = math.floor(seed / 65536 * len(source) ) mixed.append(source[int(index)]) source.remove(source[int(index)]) #return ''.join(mixed) return mixed def _get_file_id(self, fileId, seed): mixed = self._get_file_ID_mix_string(seed) ids = fileId.split('*') realId = [] for ch in ids: if ch: realId.append(mixed[int(ch)]) return ''.join(realId) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('ID') info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id jsondata = self._download_webpage(info_url, video_id) self.report_extraction(video_id) try: config = json.loads(jsondata) video_title = config['data'][0]['title'] seed = config['data'][0]['seed'] format = self._downloader.params.get('format', None) supported_format = list(config['data'][0]['streamfileids'].keys()) if format is None or format == 'best': if 'hd2' in supported_format: format = 'hd2' else: format = 'flv' ext = u'flv' elif format == 'worst': format = 'mp4' ext = u'mp4' else: format = 'flv' ext = u'flv' fileid = config['data'][0]['streamfileids'][format] keys = [s['k'] for s in config['data'][0]['segs'][format]] except (UnicodeDecodeError, ValueError, KeyError): raise ExtractorError(u'Unable to extract info section') files_info=[] sid = self._gen_sid() fileid = self._get_file_id(fileid, seed) #column 8,9 of fileid represent the segment number #fileid[7:9] should be changed for index, key in enumerate(keys): temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:]) download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key) info = { 'id': '%s_part%02d' % (video_id, index), 'url': download_url, 'uploader': None, 'upload_date': None, 'title': video_title, 'ext': ext, } files_info.append(info) return files_info class XNXXIE(InfoExtractor): """Information extractor for xnxx.com""" _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)' IE_NAME = u'xnxx' VIDEO_URL_RE = r'flv_url=(.*?)&' VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM' VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group(1) # Get webpage content webpage = self._download_webpage(url, video_id) video_url = self._search_regex(self.VIDEO_URL_RE, webpage, u'video URL') video_url = compat_urllib_parse.unquote(video_url) video_title = self._html_search_regex(self.VIDEO_TITLE_RE, webpage, u'title') video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE, webpage, u'thumbnail', fatal=False) return [{ 'id': video_id, 'url': video_url, 'uploader': None, 'upload_date': None, 'title': video_title, 'ext': 'flv', 'thumbnail': video_thumbnail, 'description': None, }] class GooglePlusIE(InfoExtractor): """Information extractor for plus.google.com.""" _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)' IE_NAME = u'plus.google' def _real_extract(self, url): # Extract id from URL mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) post_url = mobj.group(0) video_id = mobj.group(1) video_extension = 'flv' # Step 1, Retrieve post webpage to extract further information webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage') self.report_extraction(video_id) # Extract update date upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>', webpage, u'upload date', fatal=False) if upload_date: # Convert timestring to a format suitable for filename upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d") upload_date = upload_date.strftime('%Y%m%d') # Extract uploader uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>', webpage, u'uploader', fatal=False) # Extract title # Get the first line for title video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', webpage, 'title', default=u'NA') # Step 2, Stimulate clicking the image box to launch video video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]', webpage, u'video page URL') webpage = self._download_webpage(video_page, video_id, u'Downloading video page') # Extract video links on video page """Extract video links of all sizes""" pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"' mobj = re.findall(pattern, webpage) if len(mobj) == 0: raise ExtractorError(u'Unable to extract video links') # Sort in resolution links = sorted(mobj) # Choose the lowest of the sort, i.e. highest resolution video_url = links[-1] # Only get the url. The resolution part in the tuple has no use anymore video_url = video_url[-1] # Treat escaped \u0026 style hex try: video_url = video_url.decode("unicode_escape") except AttributeError: # Python 3 video_url = bytes(video_url, 'ascii').decode('unicode-escape') return [{ 'id': video_id, 'url': video_url, 'uploader': uploader, 'upload_date': upload_date, 'title': video_title, 'ext': video_extension, }] class NBAIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$' IE_NAME = u'nba' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group(1) webpage = self._download_webpage(url, video_id) video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' shortened_video_id = video_id.rpartition('/')[2] title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"', webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '') # It isn't there in the HTML it returns to us # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False) description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False) info = { 'id': shortened_video_id, 'url': video_url, 'ext': 'mp4', 'title': title, # 'uploader_date': uploader_date, 'description': description, } return [info] class JustinTVIE(InfoExtractor): """Information extractor for justin.tv and twitch.tv""" # TODO: One broadcast may be split into multiple videos. The key # 'broadcast_id' is the same for all parts, and 'broadcast_part' # starts at 1 and increases. Can we treat all parts as one video? _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/ (?: (?P<channelid>[^/]+)| (?:(?:[^/]+)/b/(?P<videoid>[^/]+))| (?:(?:[^/]+)/c/(?P<chapterid>[^/]+)) ) /?(?:\#.*)?$ """ _JUSTIN_PAGE_LIMIT = 100 IE_NAME = u'justin.tv' def report_download_page(self, channel, offset): """Report attempt to download a single page of videos.""" self.to_screen(u'%s: Downloading video information from %d to %d' % (channel, offset, offset + self._JUSTIN_PAGE_LIMIT)) # Return count of items, list of *valid* items def _parse_page(self, url, video_id): webpage = self._download_webpage(url, video_id, u'Downloading video info JSON', u'unable to download video info JSON') response = json.loads(webpage) if type(response) != list: error_text = response.get('error', 'unknown error') raise ExtractorError(u'Justin.tv API: %s' % error_text) info = [] for clip in response: video_url = clip['video_file_url'] if video_url: video_extension = os.path.splitext(video_url)[1][1:] video_date = re.sub('-', '', clip['start_time'][:10]) video_uploader_id = clip.get('user_id', clip.get('channel_id')) video_id = clip['id'] video_title = clip.get('title', video_id) info.append({ 'id': video_id, 'url': video_url, 'title': video_title, 'uploader': clip.get('channel_name', video_uploader_id), 'uploader_id': video_uploader_id, 'upload_date': video_date, 'ext': video_extension, }) return (len(response), info) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'invalid URL: %s' % url) api_base = 'http://api.justin.tv' paged = False if mobj.group('channelid'): paged = True video_id = mobj.group('channelid') api = api_base + '/channel/archives/%s.json' % video_id elif mobj.group('chapterid'): chapter_id = mobj.group('chapterid') webpage = self._download_webpage(url, chapter_id) m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage) if not m: raise ExtractorError(u'Cannot find archive of a chapter') archive_id = m.group(1) api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id chapter_info_xml = self._download_webpage(api, chapter_id, note=u'Downloading chapter information', errnote=u'Chapter information download failed') doc = xml.etree.ElementTree.fromstring(chapter_info_xml) for a in doc.findall('.//archive'): if archive_id == a.find('./id').text: break else: raise ExtractorError(u'Could not find chapter in chapter information') video_url = a.find('./video_file_url').text video_ext = video_url.rpartition('.')[2] or u'flv' chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id, note='Downloading chapter metadata', errnote='Download of chapter metadata failed') chapter_info = json.loads(chapter_info_json) bracket_start = int(doc.find('.//bracket_start').text) bracket_end = int(doc.find('.//bracket_end').text) # TODO determine start (and probably fix up file) # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457 #video_url += u'?start=' + TODO:start_timestamp # bracket_start is 13290, but we want 51670615 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. ' u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end))) info = { 'id': u'c' + chapter_id, 'url': video_url, 'ext': video_ext, 'title': chapter_info['title'], 'thumbnail': chapter_info['preview'], 'description': chapter_info['description'], 'uploader': chapter_info['channel']['display_name'], 'uploader_id': chapter_info['channel']['name'], } return [info] else: video_id = mobj.group('videoid') api = api_base + '/broadcast/by_archive/%s.json' % video_id self.report_extraction(video_id) info = [] offset = 0 limit = self._JUSTIN_PAGE_LIMIT while True: if paged: self.report_download_page(video_id, offset) page_url = api + ('?offset=%d&limit=%d' % (offset, limit)) page_count, page_info = self._parse_page(page_url, video_id) info.extend(page_info) if not paged or page_count != limit: break offset += limit return info class FunnyOrDieIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'invalid URL: %s' % url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, u'video URL', flags=re.DOTALL) title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", r'<title>(?P<title>[^<]+?)'), webpage, 'title', flags=re.DOTALL) video_description = self._html_search_regex(r'video|app)/ #If the page is only for videos or for a game (?P\d+)/? (?P\d*)(?P\??) #For urltype == video we sometimes get the videoID """ _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/' _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' @classmethod def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def _real_extract(self, url): m = re.match(self._VALID_URL, url, re.VERBOSE) gameID = m.group('gameID') videourl = self._VIDEO_PAGE_TEMPLATE % gameID webpage = self._download_webpage(videourl, gameID) if re.search('

Please enter your birth date to continue:

', webpage) is not None: videourl = self._AGECHECK_TEMPLATE % gameID self.report_age_confirmation() webpage = self._download_webpage(videourl, gameID) self.report_extraction(gameID) game_title = self._html_search_regex(r'

(.*?)

', webpage, 'game title') urlRE = r"'movie_(?P\d+)': \{\s*FILENAME: \"(?P[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P[\w:/\.\?=\+-]+)\")?\s*\}," mweb = re.finditer(urlRE, webpage) namesRE = r'(?P.+?)' titles = re.finditer(namesRE, webpage) thumbsRE = r'

' thumbs = re.finditer(thumbsRE, webpage) videos = [] for vid,vtitle,thumb in zip(mweb,titles,thumbs): video_id = vid.group('videoID') title = vtitle.group('videoName') video_url = vid.group('videoURL') video_thumb = thumb.group('thumbnail') if not video_url: raise ExtractorError(u'Cannot find video url for %s' % video_id) info = { 'id':video_id, 'url':video_url, 'ext': 'flv', 'title': unescapeHTML(title), 'thumbnail': video_thumb } videos.append(info) return [self.playlist_result(videos, gameID, game_title)] class UstreamIE(InfoExtractor): _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P\d+)' IE_NAME = u'ustream' def _real_extract(self, url): m = re.match(self._VALID_URL, url) video_id = m.group('videoID') video_url = u'http://tcdn.ustream.tv/video/%s' % video_id webpage = self._download_webpage(url, video_id) self.report_extraction(video_id) video_title = self._html_search_regex(r'data-title="(?P.+)"', webpage, u'title') uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>', webpage, u'uploader', fatal=False, flags=re.DOTALL) thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage, u'thumbnail', fatal=False) info = { 'id': video_id, 'url': video_url, 'ext': 'flv', 'title': video_title, 'uploader': uploader, 'thumbnail': thumbnail, } return info class WorldStarHipHopIE(InfoExtractor): _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)' IE_NAME = u'WorldStarHipHop' def _real_extract(self, url): m = re.match(self._VALID_URL, url) video_id = m.group('id') webpage_src = self._download_webpage(url, video_id) video_url = self._search_regex(r'so\.addVariable$"file","(.*?)"$', webpage_src, u'video URL') if 'mp4' in video_url: ext = 'mp4' else: ext = 'flv' video_title = self._html_search_regex(r"<title>(.*)", webpage_src, u'title') # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />', webpage_src, u'thumbnail', fatal=False) if not thumbnail: _title = r"""candytitles.*>(.*)""" mobj = re.search(_title, webpage_src) if mobj is not None: video_title = mobj.group(1) results = [{ 'id': video_id, 'url' : video_url, 'title' : video_title, 'thumbnail' : thumbnail, 'ext' : ext, }] return results class RBMARadioIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P[^/]+)$' def _real_extract(self, url): m = re.match(self._VALID_URL, url) video_id = m.group('videoID') webpage = self._download_webpage(url, video_id) json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$', webpage, u'json data', flags=re.MULTILINE) try: data = json.loads(json_data) except ValueError as e: raise ExtractorError(u'Invalid JSON: ' + str(e)) video_url = data['akamai_url'] + '&cbr=256' url_parts = compat_urllib_parse_urlparse(video_url) video_ext = url_parts.path.rpartition('.')[2] info = { 'id': video_id, 'url': video_url, 'ext': video_ext, 'title': data['title'], 'description': data.get('teaser_text'), 'location': data.get('country_of_origin'), 'uploader': data.get('host', {}).get('name'), 'uploader_id': data.get('host', {}).get('slug'), 'thumbnail': data.get('image', {}).get('large_url_2x'), 'duration': data.get('duration'), } return [info] class YouPornIE(InfoExtractor): """Information extractor for youporn.com.""" _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P[0-9]+)/(?P[^/]+)' def _print_formats(self, formats): """Print all available formats""" print(u'Available formats:') print(u'ext\t\tformat') print(u'---------------------------------') for format in formats: print(u'%s\t\t%s' % (format['ext'], format['format'])) def _specific(self, req_format, formats): for x in formats: if(x["format"]==req_format): return x return None def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('videoid') req = compat_urllib_request.Request(url) req.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(req, video_id) # Get JSON parameters json_params = self._search_regex(r'var currentVideo = new Video$(.*)$;', webpage, u'JSON parameters') try: params = json.loads(json_params) except: raise ExtractorError(u'Invalid JSON') self.report_extraction(video_id) try: video_title = params['title'] upload_date = unified_strdate(params['release_date_f']) video_description = params['description'] video_uploader = params['submitted_by'] thumbnail = params['thumbnails'][0]['image'] except KeyError: raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1]) # Get all of the formats available DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>' download_list_html = self._search_regex(DOWNLOAD_LIST_RE, webpage, u'download list').strip() # Get all of the links from the page LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">' links = re.findall(LINK_RE, download_list_html) if(len(links) == 0): raise ExtractorError(u'ERROR: no known formats available for video') self.to_screen(u'Links found: %d' % len(links)) formats = [] for link in links: # A link looks like this: # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0 # A path looks like this: # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4 video_url = unescapeHTML( link ) path = compat_urllib_parse_urlparse( video_url ).path extension = os.path.splitext( path )[1][1:] format = path.split('/')[4].split('_')[:2] size = format[0] bitrate = format[1] format = "-".join( format ) # title = u'%s-%s-%s' % (video_title, size, bitrate) formats.append({ 'id': video_id, 'url': video_url, 'uploader': video_uploader, 'upload_date': upload_date, 'title': video_title, 'ext': extension, 'format': format, 'thumbnail': thumbnail, 'description': video_description }) if self._downloader.params.get('listformats', None): self._print_formats(formats) return req_format = self._downloader.params.get('format', None) self.to_screen(u'Format: %s' % req_format) if req_format is None or req_format == 'best': return [formats[0]] elif req_format == 'worst': return [formats[-1]] elif req_format in ('-1', 'all'): return formats else: format = self._specific( req_format, formats ) if result is None: raise ExtractorError(u'Requested format not available') return [format] class PornotubeIE(InfoExtractor): """Information extractor for pornotube.com.""" _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('videoid') video_title = mobj.group('title') # Get webpage content webpage = self._download_webpage(url, video_id) # Get the video URL VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",' video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url') video_url = compat_urllib_parse.unquote(video_url) #Get the uploaded date VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by' upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False) if upload_date: upload_date = unified_strdate(upload_date) info = {'id': video_id, 'url': video_url, 'uploader': None, 'upload_date': upload_date, 'title': video_title, 'ext': 'flv', 'format': 'flv'} return [info] class YouJizzIE(InfoExtractor): """Information extractor for youjizz.com.""" _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('videoid') # Get webpage content webpage = self._download_webpage(url, video_id) # Get the video title video_title = self._html_search_regex(r'<title>(?P<title>.*)', webpage, u'title').strip() # Get the embed page result = re.search(r'https?://www.youjizz.com/videos/embed/(?P[0-9]+)', webpage) if result is None: raise ExtractorError(u'ERROR: unable to extract embed page') embed_page_url = result.group(0).strip() video_id = result.group('videoid') webpage = self._download_webpage(embed_page_url, video_id) # Get the video URL video_url = self._search_regex(r'so.addVariable$"file",encodeURIComponent\("(?P[^"]+)"$\);', webpage, u'video URL') info = {'id': video_id, 'url': video_url, 'title': video_title, 'ext': 'flv', 'format': 'flv', 'player_url': embed_page_url} return [info] class EightTracksIE(InfoExtractor): IE_NAME = '8tracks' _VALID_URL = r'https?://8tracks.com/(?P[^/]+)/(?P[^/#]+)(?:#.*)?$' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) playlist_id = mobj.group('id') webpage = self._download_webpage(url, playlist_id) json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL) data = json.loads(json_like) session = str(random.randint(0, 1000000000)) mix_id = data['id'] track_count = data['tracks_count'] first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id) next_url = first_url res = [] for i in itertools.count(): api_json = self._download_webpage(next_url, playlist_id, note=u'Downloading song information %s/%s' % (str(i+1), track_count), errnote=u'Failed to download song information') api_data = json.loads(api_json) track_data = api_data[u'set']['track'] info = { 'id': track_data['id'], 'url': track_data['track_file_stream_url'], 'title': track_data['performer'] + u' - ' + track_data['name'], 'raw_title': track_data['name'], 'uploader_id': data['user']['login'], 'ext': 'm4a', } res.append(info) if api_data['set']['at_last_track']: break next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id']) return res class KeekIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P\w+)' IE_NAME = u'keek' def _real_extract(self, url): m = re.match(self._VALID_URL, url) video_id = m.group('videoID') video_url = u'http://cdn.keek.com/keek/video/%s' % video_id thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id webpage = self._download_webpage(url, video_id) video_title = self._html_search_regex(r'[\S\s]+?

(?P.+?)

', webpage, u'uploader', fatal=False) info = { 'id': video_id, 'url': video_url, 'ext': 'mp4', 'title': video_title, 'thumbnail': thumbnail, 'uploader': uploader } return [info] class TEDIE(InfoExtractor): _VALID_URL=r'''http://www\.ted\.com/ ( ((?Pplaylists)/(?P\d+)) # We have a playlist | ((?Ptalks)) # We have a simple talk ) (/lang/(.*?))? # The url may contain the language /(?P\w+) # Here goes the name and then ".html" ''' @classmethod def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def _real_extract(self, url): m=re.match(self._VALID_URL, url, re.VERBOSE) if m.group('type_talk'): return [self._talk_info(url)] else : playlist_id=m.group('playlist_id') name=m.group('name') self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name)) return [self._playlist_videos_info(url,name,playlist_id)] def _playlist_videos_info(self,url,name,playlist_id=0): '''Returns the videos of the playlist''' video_RE=r''' (?P.+?)

' webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage') m_videos=re.finditer(video_RE,webpage,re.VERBOSE) m_names=re.finditer(video_name_RE,webpage) playlist_title = self._html_search_regex(r'div class="headline">\s*?

x-www-form-urlencoded') info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage') if info_response is None: raise ExtractorError(u'Unable to extract the media url') (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&')) return [{ 'id': video_id, 'url': final_url, 'ext': ext, 'title': title, 'thumbnail': thumbnail_url, }] def gen_extractors(): """ Return a list of an instance of every supported extractor. The order does matter; the first extractor matched is the one handling the URL. """ return [ YoutubePlaylistIE(), YoutubeChannelIE(), YoutubeUserIE(), YoutubeSearchIE(), YoutubeIE(), MetacafeIE(), DailymotionIE(), GoogleSearchIE(), PhotobucketIE(), YahooIE(), YahooSearchIE(), DepositFilesIE(), FacebookIE(), BlipTVIE(), BlipTVUserIE(), VimeoIE(), MyVideoIE(), ComedyCentralIE(), EscapistIE(), CollegeHumorIE(), XVideosIE(), SoundcloudSetIE(), SoundcloudIE(), InfoQIE(), MixcloudIE(), StanfordOpenClassroomIE(), MTVIE(), YoukuIE(), XNXXIE(), YouJizzIE(), PornotubeIE(), YouPornIE(), GooglePlusIE(), ArteTvIE(), NBAIE(), WorldStarHipHopIE(), JustinTVIE(), FunnyOrDieIE(), SteamIE(), UstreamIE(), RBMARadioIE(), EightTracksIE(), KeekIE(), TEDIE(), MySpassIE(), SpiegelIE(), LiveLeakIE(), ARDIE(), ZDFIE(), TumblrIE(), BandcampIE(), RedTubeIE(), InaIE(), HowcastIE(), VineIE(), FlickrIE(), TeamcocoIE(), XHamsterIE(), HypemIE(), Vbox7IE(), GametrailersIE(), StatigramIE(), GenericIE() ] def get_info_extractor(ie_name): """Returns the info extractor class with the given ie_name""" return globals()[ie_name+'IE']