diff options
Diffstat (limited to 'youtube_dl')
-rw-r--r-- | youtube_dl/__init__.py | 3 | ||||
-rw-r--r-- | youtube_dl/downloader/__init__.py | 3 | ||||
-rw-r--r-- | youtube_dl/downloader/f4m.py | 315 | ||||
-rw-r--r-- | youtube_dl/extractor/__init__.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/fourtube.py | 95 | ||||
-rw-r--r-- | youtube_dl/extractor/francetv.py | 1 | ||||
-rw-r--r-- | youtube_dl/extractor/ndr.py | 12 | ||||
-rw-r--r-- | youtube_dl/extractor/nfb.py | 3 | ||||
-rw-r--r-- | youtube_dl/extractor/smotri.py | 1 | ||||
-rw-r--r-- | youtube_dl/extractor/sohu.py | 1 | ||||
-rw-r--r-- | youtube_dl/extractor/syfy.py | 27 | ||||
-rw-r--r-- | youtube_dl/extractor/theplatform.py | 66 | ||||
-rw-r--r-- | youtube_dl/extractor/vesti.py | 3 | ||||
-rw-r--r-- | youtube_dl/extractor/vk.py | 85 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube.py | 3 | ||||
-rw-r--r-- | youtube_dl/utils.py | 21 |
16 files changed, 590 insertions, 51 deletions
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index e81366851..57aaff5da 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -41,7 +41,10 @@ __authors__ = ( 'Chris Gahan', 'Saimadhav Heblikar', 'Mike Col', + 'Oleg Prutz', + 'pulpe', 'Andreas Schmitz', + 'Michael Kaiser', ) __license__ = 'Public Domain' diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index aaa92bc75..4ea5811a5 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -5,6 +5,7 @@ from .hls import HlsFD from .http import HttpFD from .mplayer import MplayerFD from .rtmp import RtmpFD +from .f4m import F4mFD from ..utils import ( determine_ext, @@ -22,5 +23,7 @@ def get_suitable_downloader(info_dict): return HlsFD if url.startswith('mms') or url.startswith('rtsp'): return MplayerFD + if determine_ext(url) == 'f4m': + return F4mFD else: return HttpFD diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py new file mode 100644 index 000000000..2a870a758 --- /dev/null +++ b/youtube_dl/downloader/f4m.py @@ -0,0 +1,315 @@ +from __future__ import unicode_literals + +import base64 +import io +import itertools +import os +import time +import xml.etree.ElementTree as etree + +from .common import FileDownloader +from .http import HttpFD +from ..utils import ( + struct_pack, + struct_unpack, + compat_urllib_request, + compat_urlparse, + format_bytes, + encodeFilename, + sanitize_open, +) + + +class FlvReader(io.BytesIO): + """ + Reader for Flv files + The file format is documented in https://www.adobe.com/devnet/f4v.html + """ + + # Utility functions for reading numbers and strings + def read_unsigned_long_long(self): + return struct_unpack('!Q', self.read(8))[0] + + def read_unsigned_int(self): + return struct_unpack('!I', self.read(4))[0] + + def read_unsigned_char(self): + return struct_unpack('!B', self.read(1))[0] + + def read_string(self): + res = b'' + while True: + char = self.read(1) + if char == b'\x00': + break + res += char + return res + + def read_box_info(self): + """ + Read a box and return the info as a tuple: (box_size, box_type, box_data) + """ + real_size = size = self.read_unsigned_int() + box_type = self.read(4) + header_end = 8 + if size == 1: + real_size = self.read_unsigned_long_long() + header_end = 16 + return real_size, box_type, self.read(real_size-header_end) + + def read_asrt(self): + # version + self.read_unsigned_char() + # flags + self.read(3) + quality_entry_count = self.read_unsigned_char() + # QualityEntryCount + for i in range(quality_entry_count): + self.read_string() + + segment_run_count = self.read_unsigned_int() + segments = [] + for i in range(segment_run_count): + first_segment = self.read_unsigned_int() + fragments_per_segment = self.read_unsigned_int() + segments.append((first_segment, fragments_per_segment)) + + return { + 'segment_run': segments, + } + + def read_afrt(self): + # version + self.read_unsigned_char() + # flags + self.read(3) + # time scale + self.read_unsigned_int() + + quality_entry_count = self.read_unsigned_char() + # QualitySegmentUrlModifiers + for i in range(quality_entry_count): + self.read_string() + + fragments_count = self.read_unsigned_int() + fragments = [] + for i in range(fragments_count): + first = self.read_unsigned_int() + first_ts = self.read_unsigned_long_long() + duration = self.read_unsigned_int() + if duration == 0: + discontinuity_indicator = self.read_unsigned_char() + else: + discontinuity_indicator = None + fragments.append({ + 'first': first, + 'ts': first_ts, + 'duration': duration, + 'discontinuity_indicator': discontinuity_indicator, + }) + + return { + 'fragments': fragments, + } + + def read_abst(self): + # version + self.read_unsigned_char() + # flags + self.read(3) + # BootstrapinfoVersion + bootstrap_info_version = self.read_unsigned_int() + # Profile,Live,Update,Reserved + self.read(1) + # time scale + self.read_unsigned_int() + # CurrentMediaTime + self.read_unsigned_long_long() + # SmpteTimeCodeOffset + self.read_unsigned_long_long() + # MovieIdentifier + movie_identifier = self.read_string() + server_count = self.read_unsigned_char() + # ServerEntryTable + for i in range(server_count): + self.read_string() + quality_count = self.read_unsigned_char() + # QualityEntryTable + for i in range(server_count): + self.read_string() + # DrmData + self.read_string() + # MetaData + self.read_string() + + segments_count = self.read_unsigned_char() + segments = [] + for i in range(segments_count): + box_size, box_type, box_data = self.read_box_info() + assert box_type == b'asrt' + segment = FlvReader(box_data).read_asrt() + segments.append(segment) + fragments_run_count = self.read_unsigned_char() + fragments = [] + for i in range(fragments_run_count): + box_size, box_type, box_data = self.read_box_info() + assert box_type == b'afrt' + fragments.append(FlvReader(box_data).read_afrt()) + + return { + 'segments': segments, + 'fragments': fragments, + } + + def read_bootstrap_info(self): + total_size, box_type, box_data = self.read_box_info() + assert box_type == b'abst' + return FlvReader(box_data).read_abst() + + +def read_bootstrap_info(bootstrap_bytes): + return FlvReader(bootstrap_bytes).read_bootstrap_info() + + +def build_fragments_list(boot_info): + """ Return a list of (segment, fragment) for each fragment in the video """ + res = [] + segment_run_table = boot_info['segments'][0] + # I've only found videos with one segment + segment_run_entry = segment_run_table['segment_run'][0] + n_frags = segment_run_entry[1] + fragment_run_entry_table = boot_info['fragments'][0]['fragments'] + first_frag_number = fragment_run_entry_table[0]['first'] + for (i, frag_number) in zip(range(1, n_frags+1), itertools.count(first_frag_number)): + res.append((1, frag_number)) + return res + + +def write_flv_header(stream, metadata): + """Writes the FLV header and the metadata to stream""" + # FLV header + stream.write(b'FLV\x01') + stream.write(b'\x05') + stream.write(b'\x00\x00\x00\x09') + # FLV File body + stream.write(b'\x00\x00\x00\x00') + # FLVTAG + # Script data + stream.write(b'\x12') + # Size of the metadata with 3 bytes + stream.write(struct_pack('!L', len(metadata))[1:]) + stream.write(b'\x00\x00\x00\x00\x00\x00\x00') + stream.write(metadata) + # Magic numbers extracted from the output files produced by AdobeHDS.php + #(https://github.com/K-S-V/Scripts) + stream.write(b'\x00\x00\x01\x73') + + +def _add_ns(prop): + return '{http://ns.adobe.com/f4m/1.0}%s' % prop + + +class HttpQuietDownloader(HttpFD): + def to_screen(self, *args, **kargs): + pass + + +class F4mFD(FileDownloader): + """ + A downloader for f4m manifests or AdobeHDS. + """ + + def real_download(self, filename, info_dict): + man_url = info_dict['url'] + self.to_screen('[download] Downloading f4m manifest') + manifest = self.ydl.urlopen(man_url).read() + self.report_destination(filename) + http_dl = HttpQuietDownloader(self.ydl, + { + 'continuedl': True, + 'quiet': True, + 'noprogress': True, + 'test': self.params.get('test', False), + }) + + doc = etree.fromstring(manifest) + formats = [(int(f.attrib.get('bitrate', -1)), f) for f in doc.findall(_add_ns('media'))] + formats = sorted(formats, key=lambda f: f[0]) + rate, media = formats[-1] + base_url = compat_urlparse.urljoin(man_url, media.attrib['url']) + bootstrap = base64.b64decode(doc.find(_add_ns('bootstrapInfo')).text) + metadata = base64.b64decode(media.find(_add_ns('metadata')).text) + boot_info = read_bootstrap_info(bootstrap) + fragments_list = build_fragments_list(boot_info) + if self.params.get('test', False): + # We only download the first fragment + fragments_list = fragments_list[:1] + total_frags = len(fragments_list) + + tmpfilename = self.temp_name(filename) + (dest_stream, tmpfilename) = sanitize_open(tmpfilename, 'wb') + write_flv_header(dest_stream, metadata) + + # This dict stores the download progress, it's updated by the progress + # hook + state = { + 'downloaded_bytes': 0, + 'frag_counter': 0, + } + start = time.time() + + def frag_progress_hook(status): + frag_total_bytes = status.get('total_bytes', 0) + estimated_size = (state['downloaded_bytes'] + + (total_frags - state['frag_counter']) * frag_total_bytes) + if status['status'] == 'finished': + state['downloaded_bytes'] += frag_total_bytes + state['frag_counter'] += 1 + progress = self.calc_percent(state['frag_counter'], total_frags) + byte_counter = state['downloaded_bytes'] + else: + frag_downloaded_bytes = status['downloaded_bytes'] + byte_counter = state['downloaded_bytes'] + frag_downloaded_bytes + frag_progress = self.calc_percent(frag_downloaded_bytes, + frag_total_bytes) + progress = self.calc_percent(state['frag_counter'], total_frags) + progress += frag_progress / float(total_frags) + + eta = self.calc_eta(start, time.time(), estimated_size, byte_counter) + self.report_progress(progress, format_bytes(estimated_size), + status.get('speed'), eta) + http_dl.add_progress_hook(frag_progress_hook) + + frags_filenames = [] + for (seg_i, frag_i) in fragments_list: + name = 'Seg%d-Frag%d' % (seg_i, frag_i) + url = base_url + name + frag_filename = '%s-%s' % (tmpfilename, name) + success = http_dl.download(frag_filename, {'url': url}) + if not success: + return False + with open(frag_filename, 'rb') as down: + down_data = down.read() + reader = FlvReader(down_data) + while True: + _, box_type, box_data = reader.read_box_info() + if box_type == b'mdat': + dest_stream.write(box_data) + break + frags_filenames.append(frag_filename) + + self.report_finish(format_bytes(state['downloaded_bytes']), time.time() - start) + + self.try_rename(tmpfilename, filename) + for frag_file in frags_filenames: + os.remove(frag_file) + + fsize = os.path.getsize(encodeFilename(filename)) + self._hook_progress({ + 'downloaded_bytes': fsize, + 'total_bytes': fsize, + 'filename': filename, + 'status': 'finished', + }) + + return True diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 9490df0d8..725371883 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -73,6 +73,7 @@ from .fktv import ( FKTVPosteckeIE, ) from .flickr import FlickrIE +from .fourtube import FourTubeIE from .franceinter import FranceInterIE from .francetv import ( PluzzIE, @@ -210,6 +211,7 @@ from .statigram import StatigramIE from .steam import SteamIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE +from .syfy import SyfyIE from .sztvhu import SztvHuIE from .teamcoco import TeamcocoIE from .techtalks import TechTalksIE diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py new file mode 100644 index 000000000..8db7fc6cb --- /dev/null +++ b/youtube_dl/extractor/fourtube.py @@ -0,0 +1,95 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_request, + unified_strdate, + str_to_int, + parse_duration, +) +from youtube_dl.utils import clean_html + + +class FourTubeIE(InfoExtractor): + IE_NAME = '4tube' + _VALID_URL = r'https?://(?:www\.)?4tube\.com/videos/(?P<id>\d+)' + + _TEST = { + 'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black', + 'md5': '6516c8ac63b03de06bc8eac14362db4f', + 'info_dict': { + 'id': '209733', + 'ext': 'mp4', + 'title': 'Hot Babe Holly Michaels gets her ass stuffed by black', + 'uploader': 'WCP Club', + 'uploader_id': 'wcp-club', + 'upload_date': '20131031', + 'duration': 583, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + webpage_url = 'http://www.4tube.com/videos/' + video_id + webpage = self._download_webpage(webpage_url, video_id) + + self.report_extraction(video_id) + + playlist_json = self._html_search_regex(r'var playerConfigPlaylist\s+=\s+([^;]+)', webpage, 'Playlist') + media_id = self._search_regex(r'idMedia:\s*(\d+)', playlist_json, 'Media Id') + sources = self._search_regex(r'sources:\s*\[([^\]]*)\]', playlist_json, 'Sources').split(',') + title = self._search_regex(r'title:\s*"([^"]*)', playlist_json, 'Title') + thumbnail_url = self._search_regex(r'image:\s*"([^"]*)', playlist_json, 'Thumbnail', fatal=False) + + uploader_str = self._search_regex(r'<span>Uploaded by</span>(.*?)<span>', webpage, 'uploader', fatal=False) + mobj = re.search(r'<a href="/sites/(?P<id>[^"]+)"><strong>(?P<name>[^<]+)</strong></a>', uploader_str) + (uploader, uploader_id) = (mobj.group('name'), mobj.group('id')) if mobj else (clean_html(uploader_str), None) + + upload_date = None + view_count = None + duration = None + description = self._html_search_meta('description', webpage, 'description') + if description: + upload_date = self._search_regex(r'Published Date: (\d{2} [a-zA-Z]{3} \d{4})', description, 'upload date', + fatal=False) + if upload_date: + upload_date = unified_strdate(upload_date) + view_count = self._search_regex(r'Views: ([\d,\.]+)', description, 'view count', fatal=False) + if view_count: + view_count = str_to_int(view_count) + duration = parse_duration(self._search_regex(r'Length: (\d+m\d+s)', description, 'duration', fatal=False)) + + token_url = "http://tkn.4tube.com/{0}/desktop/{1}".format(media_id, "+".join(sources)) + headers = { + b'Content-Type': b'application/x-www-form-urlencoded', + b'Origin': b'http://www.4tube.com', + } + token_req = compat_urllib_request.Request(token_url, b'{}', headers) + tokens = self._download_json(token_req, video_id) + + formats = [{ + 'url': tokens[format]['token'], + 'format_id': format + 'p', + 'resolution': format + 'p', + 'quality': int(format), + } for format in sources] + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail_url, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'upload_date': upload_date, + 'view_count': view_count, + 'duration': duration, + 'age_limit': 18, + 'webpage_url': webpage_url, + }
\ No newline at end of file diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index ae342341c..51eb97b2f 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -184,6 +184,7 @@ class GenerationQuoiIE(InfoExtractor): # It uses Dailymotion 'skip_download': True, }, + 'skip': 'Only available from France', } def _real_extract(self, url): diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 811ef5201..0650f9564 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -26,15 +26,15 @@ class NDRIE(InfoExtractor): }, }, { - 'url': 'http://www.ndr.de/903/audio191719.html', - 'md5': '41ed601768534dd18a9ae34d84798129', + 'url': 'http://www.ndr.de/info/audio51535.html', + 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', 'note': 'Audio file', 'info_dict': { - 'id': '191719', + 'id': '51535', 'ext': 'mp3', - 'title': '"Es war schockierend"', - 'description': 'md5:ed7ff8364793545021a6355b97e95f10', - 'duration': 112, + 'title': 'La Valette entgeht der Hinrichtung', + 'description': 'md5:22f9541913a40fe50091d5cdd7c9f536', + 'duration': 884, } } ] diff --git a/youtube_dl/extractor/nfb.py b/youtube_dl/extractor/nfb.py index a8c514f53..e88566c69 100644 --- a/youtube_dl/extractor/nfb.py +++ b/youtube_dl/extractor/nfb.py @@ -74,7 +74,8 @@ class NFBIE(InfoExtractor): description = media.find('description').text # It seems assets always go from lower to better quality, so no need to sort formats = [{ - 'url': x.find('default/streamerURI').text + '/', + 'url': x.find('default/streamerURI').text, + 'app': x.find('default/streamerURI').text.split('/', 3)[3], 'play_path': x.find('default/url').text, 'rtmp_live': False, 'ext': 'mp4', diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index f249f013c..540c55703 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -20,6 +20,7 @@ class SmotriIE(InfoExtractor): IE_DESC = 'Smotri.com' IE_NAME = 'smotri' _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/video/view/\?id=(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4}))' + _NETRC_MACHINE = 'smotri' _TESTS = [ # real video id 2610366 diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index 2b9bf0cb7..bebcafb62 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -17,6 +17,7 @@ class SohuIE(InfoExtractor): u'info_dict': { u'title': u'MV:Far East Movement《The Illest》', }, + u'skip': u'Only available from China', } def _real_extract(self, url): diff --git a/youtube_dl/extractor/syfy.py b/youtube_dl/extractor/syfy.py new file mode 100644 index 000000000..8809a57fe --- /dev/null +++ b/youtube_dl/extractor/syfy.py @@ -0,0 +1,27 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class SyfyIE(InfoExtractor): + _VALID_URL = r'https?://www\.syfy\.com/videos/.+?vid:(?P<id>\d+)' + + _TEST = { + 'url': 'http://www.syfy.com/videos/Robot%20Combat%20League/Behind%20the%20Scenes/vid:2631458', + 'md5': 'e07de1d52c7278adbb9b9b1c93a66849', + 'info_dict': { + 'id': 'NmqMrGnXvmO1', + 'ext': 'flv', + 'title': 'George Lucas has Advice for his Daughter', + 'description': 'Listen to what insights George Lucas give his daughter Amanda.', + }, + 'add_ie': ['ThePlatform'], + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + return self.url_result(self._og_search_video_url(webpage)) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 23172143e..d60702325 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -11,7 +11,10 @@ _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language class ThePlatformIE(InfoExtractor): - _VALID_URL = r'(?:https?://link\.theplatform\.com/s/[^/]+/|theplatform:)(?P<id>[^/\?]+)' + _VALID_URL = r'''(?x) + (?:https?://(?:link|player)\.theplatform\.com/[sp]/[^/]+/ + (?P<config>[^/\?]+/(?:swf|config)/select/)? + |theplatform:)(?P<id>[^/\?&]+)''' _TEST = { # from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/ @@ -29,9 +32,7 @@ class ThePlatformIE(InfoExtractor): }, } - def _get_info(self, video_id): - smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?' - 'format=smil&mbr=true'.format(video_id)) + def _get_info(self, video_id, smil_url): meta = self._download_xml(smil_url, video_id) try: @@ -50,26 +51,34 @@ class ThePlatformIE(InfoExtractor): head = meta.find(_x('smil:head')) body = meta.find(_x('smil:body')) - base_url = head.find(_x('smil:meta')).attrib['base'] - switch = body.find(_x('smil:switch')) - formats = [] - for f in switch.findall(_x('smil:video')): - attr = f.attrib - width = int(attr['width']) - height = int(attr['height']) - vbr = int(attr['system-bitrate']) // 1000 - format_id = '%dx%d_%dk' % (width, height, vbr) - formats.append({ - 'format_id': format_id, - 'url': base_url, - 'play_path': 'mp4:' + attr['src'], - 'ext': 'flv', - 'width': width, - 'height': height, - 'vbr': vbr, - }) - self._sort_formats(formats) + f4m_node = body.find(_x('smil:seq/smil:video')) + if f4m_node is not None: + formats = [{ + 'ext': 'flv', + # the parameters are from syfy.com, other sites may use others + 'url': f4m_node.attrib['src'] + '?g=UXWGVKRWHFSP&hdcore=3.0.3', + }] + else: + base_url = head.find(_x('smil:meta')).attrib['base'] + switch = body.find(_x('smil:switch')) + formats = [] + for f in switch.findall(_x('smil:video')): + attr = f.attrib + width = int(attr['width']) + height = int(attr['height']) + vbr = int(attr['system-bitrate']) // 1000 + format_id = '%dx%d_%dk' % (width, height, vbr) + formats.append({ + 'format_id': format_id, + 'url': base_url, + 'play_path': 'mp4:' + attr['src'], + 'ext': 'flv', + 'width': width, + 'height': height, + 'vbr': vbr, + }) + self._sort_formats(formats) return { 'id': video_id, @@ -83,4 +92,13 @@ class ThePlatformIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - return self._get_info(video_id) + if mobj.group('config'): + config_url = url+ '&form=json' + config_url = config_url.replace('swf/', 'config/') + config_json = self._download_webpage(config_url, video_id, u'Downloading config') + config = json.loads(config_json) + smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4' + else: + smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?' + 'format=smil&mbr=true'.format(video_id)) + return self._get_info(video_id, smil_url) diff --git a/youtube_dl/extractor/vesti.py b/youtube_dl/extractor/vesti.py index 7773cec14..f51d4dcfa 100644 --- a/youtube_dl/extractor/vesti.py +++ b/youtube_dl/extractor/vesti.py @@ -87,8 +87,7 @@ class VestiIE(InfoExtractor): video_id = mobj.group('id') else: mobj = re.search( - r'<div.+?id="current-video-holder".*?>\s*<iframe src="http://player\.rutv\.ru/iframe/(?P<type>[^/]+)/id/(?P<id>\d+)[^"]*"', - page) + r'<iframe.+?src="http://player\.rutv\.ru/iframe/(?P<type>[^/]+)/id/(?P<id>\d+)[^"]*".*?></iframe>', page) if not mobj: raise ExtractorError('No media found') diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index f13ba1c8e..a293b8875 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -6,6 +6,9 @@ import json from .common import InfoExtractor from ..utils import ( + ExtractorError, + compat_urllib_request, + compat_urllib_parse, compat_str, unescapeHTML, ) @@ -14,31 +17,80 @@ from ..utils import ( class VKIE(InfoExtractor): IE_NAME = 'vk.com' _VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P<id>.*?)(?:\?|%2F|$)' + _NETRC_MACHINE = 'vk' - _TESTS = [{ - 'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', - 'file': '162222515.flv', - 'md5': '0deae91935c54e00003c2a00646315f0', - 'info_dict': { - 'title': 'ProtivoGunz - Хуёвая песня', - 'uploader': 'Noize MC', + _TESTS = [ + { + 'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', + 'md5': '0deae91935c54e00003c2a00646315f0', + 'info_dict': { + 'id': '162222515', + 'ext': 'flv', + 'title': 'ProtivoGunz - Хуёвая песня', + 'uploader': 'Noize MC', + 'duration': 195, + }, }, - }, - { - 'url': 'http://vk.com/video4643923_163339118', - 'file': '163339118.mp4', - 'md5': 'f79bccb5cd182b1f43502ca5685b2b36', - 'info_dict': { - 'uploader': 'Elvira Dzhonik', - 'title': 'Dream Theater - Hollow Years Live at Budokan 720*', + { + 'url': 'http://vk.com/video4643923_163339118', + 'md5': 'f79bccb5cd182b1f43502ca5685b2b36', + 'info_dict': { + 'id': '163339118', + 'ext': 'mp4', + 'uploader': 'Elvira Dzhonik', + 'title': 'Dream Theater - Hollow Years Live at Budokan 720*', + 'duration': 558, + } + }, + { + 'url': 'http://vk.com/video-8871596_164049491', + 'md5': 'a590bcaf3d543576c9bd162812387666', + 'note': 'Only available for registered users', + 'info_dict': { + 'id': '164049491', + 'ext': 'mp4', + 'uploader': 'Триллеры', + 'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]\u00a0', + 'duration': 8352, + }, + 'skip': 'Requires vk account credentials', } - }] + ] + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_form = { + 'act': 'login', + 'role': 'al_frame', + 'expire': '1', + 'email': username, + 'pass': password, + } + + request = compat_urllib_request.Request('https://login.vk.com/?act=login', + compat_urllib_parse.urlencode(login_form).encode('utf-8')) + login_page = self._download_webpage(request, None, note='Logging in as %s' % username) + + if re.search(r'onLoginFailed', login_page): + raise ExtractorError('Unable to login, incorrect username and/or password', expected=True) + + def _real_initialize(self): + self._login() def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + info_url = 'http://vk.com/al_video.php?act=show&al=1&video=%s' % video_id info_page = self._download_webpage(info_url, video_id) + + if re.search(r'<!>Please log in or <', info_page): + raise ExtractorError('This video is only available for registered users, ' + 'use --username and --password options to provide account credentials.', expected=True) + m_yt = re.search(r'src="(http://www.youtube.com/.*?)"', info_page) if m_yt is not None: self.to_screen(u'Youtube video detected') @@ -60,4 +112,5 @@ class VKIE(InfoExtractor): 'title': unescapeHTML(data['md_title']), 'thumbnail': data.get('jpg'), 'uploader': data.get('md_author'), + 'duration': data.get('duration') } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8c2c4dfa2..a81036843 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1694,7 +1694,8 @@ class YoutubeSearchIE(SearchInfoExtractor): api_response = data['data'] if 'items' not in api_response: - raise ExtractorError(u'[youtube] No video results') + raise ExtractorError( + u'[youtube] No video results', expected=True) new_ids = list(video['id'] for video in api_response['items']) video_ids += new_ids diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 67c6af507..057cd20d1 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -17,6 +17,7 @@ import platform import re import ssl import socket +import struct import subprocess import sys import traceback @@ -761,6 +762,7 @@ def unified_strdate(date_str): date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str) format_expressions = [ '%d %B %Y', + '%d %b %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', @@ -1143,7 +1145,7 @@ def parse_duration(s): return None m = re.match( - r'(?:(?:(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)$', s) + r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?$', s) if not m: return None res = int(m.group('secs')) @@ -1220,3 +1222,20 @@ def uppercase_escape(s): return re.sub( r'\\U([0-9a-fA-F]{8})', lambda m: compat_chr(int(m.group(1), base=16)), s) + +try: + struct.pack(u'!I', 0) +except TypeError: + # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument + def struct_pack(spec, *args): + if isinstance(spec, compat_str): + spec = spec.encode('ascii') + return struct.pack(spec, *args) + + def struct_unpack(spec, *args): + if isinstance(spec, compat_str): + spec = spec.encode('ascii') + return struct.unpack(spec, *args) +else: + struct_pack = struct.pack + struct_unpack = struct.unpack |