diff options
| author | Philipp Hagemeister <phihag@phihag.de> | 2012-01-08 17:20:01 +0100 | 
|---|---|---|
| committer | Philipp Hagemeister <phihag@phihag.de> | 2012-01-08 17:20:18 +0100 | 
| commit | fefb166c52e00075a776dba3440039c046998eb4 (patch) | |
| tree | aafe711e81a16705fffe7a9a78babe4bc09ec6f2 | |
| parent | dcb3c22e0beedd7e6ee7ca2cc988a5c9b80d5f11 (diff) | |
Leave out characters the filesystem cannot encode (Closes: #264)
| -rw-r--r-- | devscripts/posix-locale.sh | 6 | ||||
| -rwxr-xr-x | youtube_dl/__init__.py | 301 | 
2 files changed, 161 insertions, 146 deletions
| diff --git a/devscripts/posix-locale.sh b/devscripts/posix-locale.sh new file mode 100644 index 000000000..0aa7a592d --- /dev/null +++ b/devscripts/posix-locale.sh @@ -0,0 +1,6 @@ + +# source this file in your shell to get a POSIX locale (which will break many programs, but that's kind of the point) + +export LC_ALL=POSIX +export LANG=POSIX +export LANGUAGE=POSIX diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index f6d6e7515..0f9eeccbf 100755 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -259,14 +259,14 @@ def sanitize_open(filename, open_mode):  				import msvcrt  				msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)  			return (sys.stdout, filename) -		stream = open(filename, open_mode) +		stream = open(_encodeFilename(filename), open_mode)  		return (stream, filename)  	except (IOError, OSError), err:  		# In case of error, try to remove win32 forbidden chars  		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)  		# An exception here should be caught in the caller -		stream = open(filename, open_mode) +		stream = open(_encodeFilename(filename), open_mode)  		return (stream, filename) @@ -291,13 +291,21 @@ def _orderedSet(iterable):  	return res  def _unescapeHTML(s): -    """ -    @param s a string (of type unicode) -    """ -    assert type(s) == type(u'') +	""" +	@param s a string (of type unicode) +	""" +	assert type(s) == type(u'') + +	htmlParser = HTMLParser.HTMLParser() +	return htmlParser.unescape(s) -    htmlParser = HTMLParser.HTMLParser() -    return htmlParser.unescape(s) +def _encodeFilename(s): +	""" +	@param s The name of the file (of type unicode) +	""" + +	assert type(s) == type(u'') +	return s.encode(sys.getfilesystemencoding(), 'ignore')  class DownloadError(Exception):  	"""Download Error exception. @@ -563,16 +571,17 @@ class FileDownloader(object):  		self._pps.append(pp)  		pp.set_downloader(self) -	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False): +	def to_screen(self, message, skip_eol=False):  		"""Print message to stdout if not in quiet mode.""" -		try: -			if not self.params.get('quiet', False): -				terminator = [u'\n', u''][skip_eol] -				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()), +		assert type(message) == type(u'') +		if not self.params.get('quiet', False): +			terminator = [u'\n', u''][skip_eol] +			output = message + terminator + +			if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr +				output = output.encode(preferredencoding(), 'ignore') +			self._screen_file.write(output)  			self._screen_file.flush() -		except (UnicodeEncodeError), err: -			if not ignore_encoding_errors: -				raise  	def to_stderr(self, message):  		"""Print message to stderr.""" @@ -622,7 +631,7 @@ class FileDownloader(object):  	def temp_name(self, filename):  		"""Returns a temporary filename for the given filename."""  		if self.params.get('nopart', False) or filename == u'-' or \ -				(os.path.exists(filename) and not os.path.isfile(filename)): +				(os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):  			return filename  		return filename + u'.part' @@ -635,7 +644,7 @@ class FileDownloader(object):  		try:  			if old_filename == new_filename:  				return -			os.rename(old_filename, new_filename) +			os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))  		except (IOError, OSError), err:  			self.trouble(u'ERROR: unable to rename file') @@ -643,7 +652,7 @@ class FileDownloader(object):  		"""Try to set the last-modified time of the given file."""  		if last_modified_hdr is None:  			return -		if not os.path.isfile(filename): +		if not os.path.isfile(_encodeFilename(filename)):  			return  		timestr = last_modified_hdr  		if timestr is None: @@ -659,15 +668,15 @@ class FileDownloader(object):  	def report_writedescription(self, descfn):  		""" Report that the description file is being written """ -		self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True) +		self.to_screen(u'[info] Writing video description to: ' + descfn)  	def report_writeinfojson(self, infofn):  		""" Report that the metadata file has been written """ -		self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True) +		self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)  	def report_destination(self, filename):  		"""Report destination filename.""" -		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True) +		self.to_screen(u'[download] Destination: ' + filename)  	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):  		"""Report download progress.""" @@ -769,8 +778,8 @@ class FileDownloader(object):  			return  		try: -			dn = os.path.dirname(filename) -			if dn != '' and not os.path.exists(dn): +			dn = os.path.dirname(_encodeFilename(filename)) +			if dn != '' and not os.path.exists(dn): # dn is already encoded  				os.makedirs(dn)  		except (OSError, IOError), err:  			self.trouble(u'ERROR: unable to create directory ' + unicode(err)) @@ -778,9 +787,9 @@ class FileDownloader(object):  		if self.params.get('writedescription', False):  			try: -				descfn = filename + '.description' +				descfn = filename + u'.description'  				self.report_writedescription(descfn) -				descfile = open(descfn, 'wb') +				descfile = open(_encodeFilename(descfn), 'wb')  				try:  					descfile.write(info_dict['description'].encode('utf-8'))  				finally: @@ -790,7 +799,7 @@ class FileDownloader(object):  				return  		if self.params.get('writeinfojson', False): -			infofn = filename + '.info.json' +			infofn = filename + u'.info.json'  			self.report_writeinfojson(infofn)  			try:  				json.dump @@ -798,7 +807,7 @@ class FileDownloader(object):  				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')  				return  			try: -				infof = open(infofn, 'wb') +				infof = open(_encodeFilename(infofn), 'wb')  				try:  					json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))  					json.dump(json_info_dict, infof) @@ -809,7 +818,7 @@ class FileDownloader(object):  				return  		if not self.params.get('skip_download', False): -			if self.params.get('nooverwrites', False) and os.path.exists(filename): +			if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):  				success = True  			else:  				try: @@ -882,11 +891,11 @@ class FileDownloader(object):  		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]  		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])  		while retval == 2 or retval == 1: -			prevsize = os.path.getsize(tmpfilename) +			prevsize = os.path.getsize(_encodeFilename(tmpfilename))  			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)  			time.sleep(5.0) # This seems to be needed  			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1]) -			cursize = os.path.getsize(tmpfilename) +			cursize = os.path.getsize(_encodeFilename(tmpfilename))  			if prevsize == cursize and retval == 1:  				break  			 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those @@ -895,7 +904,7 @@ class FileDownloader(object):  				retval = 0  				break  		if retval == 0: -			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename)) +			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))  			self.try_rename(tmpfilename, filename)  			return True  		else: @@ -907,7 +916,7 @@ class FileDownloader(object):  		player_url = info_dict.get('player_url', None)  		# Check file already present -		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False): +		if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):  			self.report_file_already_downloaded(filename)  			return True @@ -924,8 +933,8 @@ class FileDownloader(object):  		request = urllib2.Request(url, None, headers)  		# Establish possible resume length -		if os.path.isfile(tmpfilename): -			resume_len = os.path.getsize(tmpfilename) +		if os.path.isfile(_encodeFilename(tmpfilename)): +			resume_len = os.path.getsize(_encodeFilename(tmpfilename))  		else:  			resume_len = 0 @@ -3750,17 +3759,17 @@ class MixcloudIE(InfoExtractor):  		try:  			# Process file information  			self._downloader.process_info({ -				'id':		file_id.decode('utf-8'), -				'url':		file_url.decode('utf-8'), +				'id': file_id.decode('utf-8'), +				'url': file_url.decode('utf-8'),  				'uploader':	uploader.decode('utf-8'), -				'upload_date':	u'NA', -				'title':	json_data['name'], -				'stitle':	_simplify_title(json_data['name']), -				'ext':		file_url.split('.')[-1].decode('utf-8'), -				'format':	(format_param is None and u'NA' or format_param.decode('utf-8')), -				'thumbnail':    json_data['thumbnail_url'], -				'description':  json_data['description'], -				'player_url':	player_url.decode('utf-8'), +				'upload_date': u'NA', +				'title': json_data['name'], +				'stitle': _simplify_title(json_data['name']), +				'ext': file_url.split('.')[-1].decode('utf-8'), +				'format': (format_param is None and u'NA' or format_param.decode('utf-8')), +				'thumbnail': json_data['thumbnail_url'], +				'description': json_data['description'], +				'player_url': player_url.decode('utf-8'),  			})  		except UnavailableVideoError, err:  			self._downloader.trouble(u'ERROR: unable to download file') @@ -3885,98 +3894,98 @@ class StanfordOpenClassroomIE(InfoExtractor):  				self.extract(entry['url'])  class MTVIE(InfoExtractor): -    """Information extractor for MTV.com""" - -    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$' -    IE_NAME = u'mtv' - -    def report_webpage(self, video_id): -        """Report information extraction.""" -        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)) - -    def report_extraction(self, video_id): -        """Report information extraction.""" -        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) - -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        if mobj is None: -            self._downloader.trouble(u'ERROR: invalid URL: %s' % url) -            return -        if not mobj.group('proto'): -            url = 'http://' + url -        video_id = mobj.group('videoid') -        self.report_webpage(video_id) - -        request = urllib2.Request(url) -        try: -            webpage = urllib2.urlopen(request).read() -        except (urllib2.URLError, httplib.HTTPException, socket.error), err: -            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) -            return - -        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage) -        if mobj is None: -            self._downloader.trouble(u'ERROR: unable to extract song name') -            return -        song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1')) -        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage) -        if mobj is None: -            self._downloader.trouble(u'ERROR: unable to extract performer') -            return -        performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1')) -        video_title = performer + ' - ' + song_name  - -        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage) -        if mobj is None: -            self._downloader.trouble(u'ERROR: unable to mtvn_uri') -            return -        mtvn_uri = mobj.group(1) - -        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage) -        if mobj is None: -            self._downloader.trouble(u'ERROR: unable to extract content id') -            return -        content_id = mobj.group(1) - -        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri -        self.report_extraction(video_id) -        request = urllib2.Request(videogen_url) -        try: -            metadataXml = urllib2.urlopen(request).read() -        except (urllib2.URLError, httplib.HTTPException, socket.error), err: -            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err)) -            return - -        mdoc = xml.etree.ElementTree.fromstring(metadataXml) -        renditions = mdoc.findall('.//rendition') - -        # For now, always pick the highest quality. -        rendition = renditions[-1] - -        try: -            _,_,ext = rendition.attrib['type'].partition('/') -            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate'] -            video_url = rendition.find('./src').text -        except KeyError: -            self._downloader.trouble('Invalid rendition field.') -            return - -        self._downloader.increment_downloads() -        info = { -            'id': video_id, -            'url': video_url, -            'uploader': performer, -            'title': video_title, -            'stitle': _simplify_title(video_title), -            'ext': ext, -            'format': format, -        } - -        try: -            self._downloader.process_info(info) -        except UnavailableVideoError, err: -            self._downloader.trouble(u'\nERROR: unable to download ' + video_id) +	"""Information extractor for MTV.com""" + +	_VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$' +	IE_NAME = u'mtv' + +	def report_webpage(self, video_id): +		"""Report information extraction.""" +		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)) + +	def report_extraction(self, video_id): +		"""Report information extraction.""" +		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) + +	def _real_extract(self, url): +		mobj = re.match(self._VALID_URL, url) +		if mobj is None: +			self._downloader.trouble(u'ERROR: invalid URL: %s' % url) +			return +		if not mobj.group('proto'): +			url = 'http://' + url +		video_id = mobj.group('videoid') +		self.report_webpage(video_id) + +		request = urllib2.Request(url) +		try: +			webpage = urllib2.urlopen(request).read() +		except (urllib2.URLError, httplib.HTTPException, socket.error), err: +			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) +			return + +		mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage) +		if mobj is None: +			self._downloader.trouble(u'ERROR: unable to extract song name') +			return +		song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1')) +		mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage) +		if mobj is None: +			self._downloader.trouble(u'ERROR: unable to extract performer') +			return +		performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1')) +		video_title = performer + ' - ' + song_name  + +		mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage) +		if mobj is None: +			self._downloader.trouble(u'ERROR: unable to mtvn_uri') +			return +		mtvn_uri = mobj.group(1) + +		mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage) +		if mobj is None: +			self._downloader.trouble(u'ERROR: unable to extract content id') +			return +		content_id = mobj.group(1) + +		videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri +		self.report_extraction(video_id) +		request = urllib2.Request(videogen_url) +		try: +			metadataXml = urllib2.urlopen(request).read() +		except (urllib2.URLError, httplib.HTTPException, socket.error), err: +			self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err)) +			return + +		mdoc = xml.etree.ElementTree.fromstring(metadataXml) +		renditions = mdoc.findall('.//rendition') + +		# For now, always pick the highest quality. +		rendition = renditions[-1] + +		try: +			_,_,ext = rendition.attrib['type'].partition('/') +			format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate'] +			video_url = rendition.find('./src').text +		except KeyError: +			self._downloader.trouble('Invalid rendition field.') +			return + +		self._downloader.increment_downloads() +		info = { +			'id': video_id, +			'url': video_url, +			'uploader': performer, +			'title': video_title, +			'stitle': _simplify_title(video_title), +			'ext': ext, +			'format': format, +		} + +		try: +			self._downloader.process_info(info) +		except UnavailableVideoError, err: +			self._downloader.trouble(u'\nERROR: unable to download ' + video_id)  class PostProcessor(object): @@ -4042,7 +4051,7 @@ class FFmpegExtractAudioPP(PostProcessor):  	@staticmethod  	def get_audio_codec(path):  		try: -			cmd = ['ffprobe', '-show_streams', '--', path] +			cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]  			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)  			output = handle.communicate()[0]  			if handle.wait() != 0: @@ -4063,7 +4072,7 @@ class FFmpegExtractAudioPP(PostProcessor):  			acodec_opts = []  		else:  			acodec_opts = ['-acodec', codec] -		cmd = ['ffmpeg', '-y', '-i', path, '-vn'] + acodec_opts + more_opts + ['--', out_path] +		cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]  		try:  			p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)  			stdout,stderr = p.communicate() @@ -4124,9 +4133,9 @@ class FFmpegExtractAudioPP(PostProcessor):  				extension = 'wav'  				more_opts += ['-f', 'wav'] -		(prefix, ext) = os.path.splitext(path) -		new_path = prefix + '.' + extension -		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path) +		prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups +		new_path = prefix + sep + extension +		self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)  		try:  			self.run_ffmpeg(path, new_path, acodec, more_opts)  		except: @@ -4140,13 +4149,13 @@ class FFmpegExtractAudioPP(PostProcessor):   		# Try to update the date time for extracted audio file.  		if information.get('filetime') is not None:  			try: -				os.utime(new_path, (time.time(), information['filetime'])) +				os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))  			except:  				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')  		if not self._keepvideo:  			try: -				os.remove(path) +				os.remove(_encodeFilename(path))  			except (IOError, OSError):  				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')  				return None @@ -4194,9 +4203,9 @@ def parseOpts():  	import optparse  	import shlex -	def _readOptions(filename): +	def _readOptions(filename_bytes):  		try: -			optionf = open(filename) +			optionf = open(filename_bytes)  		except IOError:  			return [] # silently skip if file is not present  		try: | 
