diff options
25 files changed, 813 insertions, 189 deletions
diff --git a/devscripts/youtube_genalgo.py b/devscripts/youtube_genalgo.py index 31d6ec952..663ccc422 100644 --- a/devscripts/youtube_genalgo.py +++ b/devscripts/youtube_genalgo.py @@ -11,30 +11,36 @@ tests = [ # 90 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`", "mrtyuioplkjhgfdsazxcvbne1234567890QWER[YUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={`]}|"), + # 89 + ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'", + "/?;:|}<[{=+-_)(*&^%$#@!MqBVCXZASDFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuyt"), # 88 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<", "J:|}][{=+-_)(*&;%$#@>MNBVCXZASDFGH^KLPOIUYTREWQ0987654321mnbvcxzasdfghrklpoiuytej"), - # 87 - vflART1Nf 2013/07/24 + # 87 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<", - "tyuioplkjhgfdsazxcv<nm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>"), - # 86 - vflm_D8eE 2013/07/31 + "uioplkjhgfdsazxcvbnm1t34567890QWE2TYUIOPLKJHGFDSAZXCVeNM!@#$^&*()_-+={[]}|:;?/>.<"), + # 86 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<", - ">.1}|[{=+-_)(*&^%$#@!MNBVCXZASDFGHJK<POIUYTREW509876L432/mnbvcxzasdfghjklpoiuytre"), - # 85 - vflSAFCP9 2013/07/19 + "yuioplkjhgfdsazecvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<"), + # 85 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?/>.<", - "ertyuiqplkjhgfdsazx$vbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#<%^&*()_-+={[};?/c"), + ".>/?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWQ0q876543r1mnbvcx9asdfghjklpoiuyt2"), # 84 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<", - "<.>?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWe098765432rmnbvcxzasdfghjklpoiuyt1"), - # 83 - vflTWC9KW 2013/08/01 + "<.>?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWQ09876543q1mnbvcxzasdfghjklpoiuew2"), + # 83 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<", - "qwertyuioplkjhg>dsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/f"), - # 82 + ".>/?;}[{=+_)(*&^%<#!MNBVCXZASPFGHJKLwOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytreq"), + # 82 - vflZK4ZYR 2013/08/23 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<", - "Q>/?;}[{=+-(*<^%$#@!MNBVCXZASDFGHKLPOIUY8REWT0q&7654321mnbvcxzasdfghjklpoiuytrew9"), + "wertyuioplkjhgfdsaqxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&z(-+={[};?/>.<"), # 81 - vflLC8JvQ 2013/07/25 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.", "C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), + # 80 - vflZK4ZYR 2013/08/23 (sporadic) + ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>", + "wertyuioplkjhgfdsaqxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&z(-+={[};?/>"), # 79 - vflLC8JvQ 2013/07/25 (sporadic) ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/", "Z?;}[{=+-(*&^%$#@!MNBVCXRASDFGHKLPOIUYT/EWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), diff --git a/test/test_all_urls.py b/test/test_all_urls.py index c73d0e467..c54faa380 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -50,6 +50,7 @@ class TestAllURLsMatching(unittest.TestCase): self.assertEqual(YoutubeIE()._extract_id('http://www.youtube.com/watch?&v=BaW_jenozKc'), 'BaW_jenozKc') self.assertEqual(YoutubeIE()._extract_id('https://www.youtube.com/watch?&v=BaW_jenozKc'), 'BaW_jenozKc') self.assertEqual(YoutubeIE()._extract_id('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc'), 'BaW_jenozKc') + self.assertEqual(YoutubeIE()._extract_id('https://www.youtube.com/watch_popup?v=BaW_jenozKc'), 'BaW_jenozKc') def test_no_duplicates(self): ies = gen_extractors() diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py index 86e09c9b1..fe0eac680 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -35,47 +35,47 @@ class TestYoutubeSubtitles(unittest.TestCase): DL.params['writesubtitles'] = True IE = YoutubeIE(DL) info_dict = IE.extract('QRS8MkLhQmM') - sub = info_dict[0]['subtitles'][0] - self.assertEqual(md5(sub[2]), '4cd9278a35ba2305f47354ee13472260') + sub = info_dict[0]['subtitles']['en'] + self.assertEqual(md5(sub), '4cd9278a35ba2305f47354ee13472260') def test_youtube_subtitles_it(self): DL = FakeYDL() DL.params['writesubtitles'] = True DL.params['subtitleslang'] = 'it' IE = YoutubeIE(DL) info_dict = IE.extract('QRS8MkLhQmM') - sub = info_dict[0]['subtitles'][0] - self.assertEqual(md5(sub[2]), '164a51f16f260476a05b50fe4c2f161d') + sub = info_dict[0]['subtitles']['it'] + self.assertEqual(md5(sub), '164a51f16f260476a05b50fe4c2f161d') def test_youtube_onlysubtitles(self): DL = FakeYDL() DL.params['writesubtitles'] = True DL.params['onlysubtitles'] = True IE = YoutubeIE(DL) info_dict = IE.extract('QRS8MkLhQmM') - sub = info_dict[0]['subtitles'][0] - self.assertEqual(md5(sub[2]), '4cd9278a35ba2305f47354ee13472260') + sub = info_dict[0]['subtitles']['en'] + self.assertEqual(md5(sub), '4cd9278a35ba2305f47354ee13472260') def test_youtube_allsubtitles(self): DL = FakeYDL() DL.params['allsubtitles'] = True IE = YoutubeIE(DL) info_dict = IE.extract('QRS8MkLhQmM') subtitles = info_dict[0]['subtitles'] - self.assertEqual(len(subtitles), 13) + self.assertEqual(len(subtitles.keys()), 13) def test_youtube_subtitles_sbv_format(self): DL = FakeYDL() DL.params['writesubtitles'] = True DL.params['subtitlesformat'] = 'sbv' IE = YoutubeIE(DL) info_dict = IE.extract('QRS8MkLhQmM') - sub = info_dict[0]['subtitles'][0] - self.assertEqual(md5(sub[2]), '13aeaa0c245a8bed9a451cb643e3ad8b') + sub = info_dict[0]['subtitles']['en'] + self.assertEqual(md5(sub), '13aeaa0c245a8bed9a451cb643e3ad8b') def test_youtube_subtitles_vtt_format(self): DL = FakeYDL() DL.params['writesubtitles'] = True DL.params['subtitlesformat'] = 'vtt' IE = YoutubeIE(DL) info_dict = IE.extract('QRS8MkLhQmM') - sub = info_dict[0]['subtitles'][0] - self.assertEqual(md5(sub[2]), '356cdc577fde0c6783b9b822e7206ff7') + sub = info_dict[0]['subtitles']['en'] + self.assertEqual(md5(sub), '356cdc577fde0c6783b9b822e7206ff7') def test_youtube_list_subtitles(self): DL = FakeYDL() DL.params['listsubtitles'] = True @@ -88,8 +88,8 @@ class TestYoutubeSubtitles(unittest.TestCase): DL.params['subtitleslang'] = 'it' IE = YoutubeIE(DL) info_dict = IE.extract('8YoUxe5ncPo') - sub = info_dict[0]['subtitles'][0] - self.assertTrue(sub[2] is not None) + sub = info_dict[0]['subtitles']['it'] + self.assertTrue(sub is not None) if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index ea6b9d626..217c4a52f 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -79,9 +79,13 @@ class FileDownloader(object): rate = float(current) / dif eta = int((float(total) - float(current)) / rate) (eta_mins, eta_secs) = divmod(eta, 60) - if eta_mins > 99: - return '--:--' - return '%02d:%02d' % (eta_mins, eta_secs) + (eta_hours, eta_mins) = divmod(eta_mins, 60) + if eta_hours > 99: + return '--:--:--' + if eta_hours == 0: + return '%02d:%02d' % (eta_mins, eta_secs) + else: + return '%02d:%02d:%02d' % (eta_hours, eta_mins, eta_secs) @staticmethod def calc_speed(start, now, bytes): diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py index fddf58606..336a42559 100644 --- a/youtube_dl/PostProcessor.py +++ b/youtube_dl/PostProcessor.py @@ -71,12 +71,17 @@ class FFmpegPostProcessor(PostProcessor): programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe'] return dict((program, executable(program)) for program in programs) - def run_ffmpeg(self, path, out_path, opts): + def run_ffmpeg_multiple_files(self, input_paths, out_path, opts): if not self._exes['ffmpeg'] and not self._exes['avconv']: raise FFmpegPostProcessorError(u'ffmpeg or avconv not found. Please install one.') - cmd = ([self._exes['avconv'] or self._exes['ffmpeg'], '-y', '-i', encodeFilename(path)] + + files_cmd = [] + for path in input_paths: + files_cmd.extend(['-i', encodeFilename(path)]) + cmd = ([self._exes['avconv'] or self._exes['ffmpeg'], '-y'] + files_cmd + opts + [encodeFilename(self._ffmpeg_filename_argument(out_path))]) + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout,stderr = p.communicate() if p.returncode != 0: @@ -84,6 +89,9 @@ class FFmpegPostProcessor(PostProcessor): msg = stderr.strip().split('\n')[-1] raise FFmpegPostProcessorError(msg) + def run_ffmpeg(self, path, out_path, opts): + self.run_ffmpeg_multiple_files([path], out_path, opts) + def _ffmpeg_filename_argument(self, fn): # ffmpeg broke --, see https://ffmpeg.org/trac/ffmpeg/ticket/2127 for details if fn.startswith(u'-'): @@ -232,3 +240,226 @@ class FFmpegVideoConvertor(FFmpegPostProcessor): information['format'] = self._preferedformat information['ext'] = self._preferedformat return False,information + + +class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): + # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt + _lang_map = { + 'aa': 'aar', + 'ab': 'abk', + 'ae': 'ave', + 'af': 'afr', + 'ak': 'aka', + 'am': 'amh', + 'an': 'arg', + 'ar': 'ara', + 'as': 'asm', + 'av': 'ava', + 'ay': 'aym', + 'az': 'aze', + 'ba': 'bak', + 'be': 'bel', + 'bg': 'bul', + 'bh': 'bih', + 'bi': 'bis', + 'bm': 'bam', + 'bn': 'ben', + 'bo': 'bod', + 'br': 'bre', + 'bs': 'bos', + 'ca': 'cat', + 'ce': 'che', + 'ch': 'cha', + 'co': 'cos', + 'cr': 'cre', + 'cs': 'ces', + 'cu': 'chu', + 'cv': 'chv', + 'cy': 'cym', + 'da': 'dan', + 'de': 'deu', + 'dv': 'div', + 'dz': 'dzo', + 'ee': 'ewe', + 'el': 'ell', + 'en': 'eng', + 'eo': 'epo', + 'es': 'spa', + 'et': 'est', + 'eu': 'eus', + 'fa': 'fas', + 'ff': 'ful', + 'fi': 'fin', + 'fj': 'fij', + 'fo': 'fao', + 'fr': 'fra', + 'fy': 'fry', + 'ga': 'gle', + 'gd': 'gla', + 'gl': 'glg', + 'gn': 'grn', + 'gu': 'guj', + 'gv': 'glv', + 'ha': 'hau', + 'he': 'heb', + 'hi': 'hin', + 'ho': 'hmo', + 'hr': 'hrv', + 'ht': 'hat', + 'hu': 'hun', + 'hy': 'hye', + 'hz': 'her', + 'ia': 'ina', + 'id': 'ind', + 'ie': 'ile', + 'ig': 'ibo', + 'ii': 'iii', + 'ik': 'ipk', + 'io': 'ido', + 'is': 'isl', + 'it': 'ita', + 'iu': 'iku', + 'ja': 'jpn', + 'jv': 'jav', + 'ka': 'kat', + 'kg': 'kon', + 'ki': 'kik', + 'kj': 'kua', + 'kk': 'kaz', + 'kl': 'kal', + 'km': 'khm', + 'kn': 'kan', + 'ko': 'kor', + 'kr': 'kau', + 'ks': 'kas', + 'ku': 'kur', + 'kv': 'kom', + 'kw': 'cor', + 'ky': 'kir', + 'la': 'lat', + 'lb': 'ltz', + 'lg': 'lug', + 'li': 'lim', + 'ln': 'lin', + 'lo': 'lao', + 'lt': 'lit', + 'lu': 'lub', + 'lv': 'lav', + 'mg': 'mlg', + 'mh': 'mah', + 'mi': 'mri', + 'mk': 'mkd', + 'ml': 'mal', + 'mn': 'mon', + 'mr': 'mar', + 'ms': 'msa', + 'mt': 'mlt', + 'my': 'mya', + 'na': 'nau', + 'nb': 'nob', + 'nd': 'nde', + 'ne': 'nep', + 'ng': 'ndo', + 'nl': 'nld', + 'nn': 'nno', + 'no': 'nor', + 'nr': 'nbl', + 'nv': 'nav', + 'ny': 'nya', + 'oc': 'oci', + 'oj': 'oji', + 'om': 'orm', + 'or': 'ori', + 'os': 'oss', + 'pa': 'pan', + 'pi': 'pli', + 'pl': 'pol', + 'ps': 'pus', + 'pt': 'por', + 'qu': 'que', + 'rm': 'roh', + 'rn': 'run', + 'ro': 'ron', + 'ru': 'rus', + 'rw': 'kin', + 'sa': 'san', + 'sc': 'srd', + 'sd': 'snd', + 'se': 'sme', + 'sg': 'sag', + 'si': 'sin', + 'sk': 'slk', + 'sl': 'slv', + 'sm': 'smo', + 'sn': 'sna', + 'so': 'som', + 'sq': 'sqi', + 'sr': 'srp', + 'ss': 'ssw', + 'st': 'sot', + 'su': 'sun', + 'sv': 'swe', + 'sw': 'swa', + 'ta': 'tam', + 'te': 'tel', + 'tg': 'tgk', + 'th': 'tha', + 'ti': 'tir', + 'tk': 'tuk', + 'tl': 'tgl', + 'tn': 'tsn', + 'to': 'ton', + 'tr': 'tur', + 'ts': 'tso', + 'tt': 'tat', + 'tw': 'twi', + 'ty': 'tah', + 'ug': 'uig', + 'uk': 'ukr', + 'ur': 'urd', + 'uz': 'uzb', + 've': 'ven', + 'vi': 'vie', + 'vo': 'vol', + 'wa': 'wln', + 'wo': 'wol', + 'xh': 'xho', + 'yi': 'yid', + 'yo': 'yor', + 'za': 'zha', + 'zh': 'zho', + 'zu': 'zul', + } + + def __init__(self, downloader=None, subtitlesformat='srt'): + super(FFmpegEmbedSubtitlePP, self).__init__(downloader) + self._subformat = subtitlesformat + + @classmethod + def _conver_lang_code(cls, code): + """Convert language code from ISO 639-1 to ISO 639-2/T""" + return cls._lang_map.get(code[:2]) + + def run(self, information): + if information['ext'] != u'mp4': + self._downloader.to_screen(u'[ffmpeg] Subtitles can only be embedded in mp4 files') + return True, information + sub_langs = [key for key in information['subtitles']] + + filename = information['filepath'] + input_files = [filename] + [subtitles_filename(filename, lang, self._subformat) for lang in sub_langs] + + opts = ['-map', '0:0', '-map', '0:1', '-c:v', 'copy', '-c:a', 'copy'] + for (i, lang) in enumerate(sub_langs): + opts.extend(['-map', '%d:0' % (i+1), '-c:s:%d' % i, 'mov_text']) + lang_code = self._conver_lang_code(lang) + if lang_code is not None: + opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code]) + opts.extend(['-f', 'mp4']) + + temp_filename = filename + u'.temp' + self.run_ffmpeg_multiple_files(input_files, temp_filename, opts) + os.remove(encodeFilename(filename)) + os.rename(encodeFilename(temp_filename), encodeFilename(filename)) + + return True, information diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index d1618da79..1fd610a6e 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -483,41 +483,28 @@ class YoutubeDL(object): self.report_error(u'Cannot write description file ' + descfn) return - if (self.params.get('writesubtitles', False) or self.params.get('writeautomaticsub')) and 'subtitles' in info_dict and info_dict['subtitles']: + subtitles_are_requested = any([self.params.get('writesubtitles', False), + self.params.get('writeautomaticsub'), + self.params.get('allsubtitles', False)]) + + if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']: # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE - subtitle = info_dict['subtitles'][0] - (sub_error, sub_lang, sub) = subtitle + subtitles = info_dict['subtitles'] sub_format = self.params.get('subtitlesformat') - if sub_error: - self.report_warning("Some error while getting the subtitles") - else: + for sub_lang in subtitles.keys(): + sub = subtitles[sub_lang] + if sub is None: + continue try: - sub_filename = filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format + sub_filename = subtitles_filename(filename, sub_lang, sub_format) self.report_writesubtitles(sub_filename) with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: - subfile.write(sub) + subfile.write(sub) except (OSError, IOError): self.report_error(u'Cannot write subtitles file ' + descfn) return - if self.params.get('allsubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']: - subtitles = info_dict['subtitles'] - sub_format = self.params.get('subtitlesformat') - for subtitle in subtitles: - (sub_error, sub_lang, sub) = subtitle - if sub_error: - self.report_warning("Some error while getting the subtitles") - else: - try: - sub_filename = filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format - self.report_writesubtitles(sub_filename) - with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: - subfile.write(sub) - except (OSError, IOError): - self.report_error(u'Cannot write subtitles file ' + descfn) - return - if self.params.get('writeinfojson', False): infofn = filename + u'.info.json' self.report_writeinfojson(infofn) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index bf040aacd..441ca6b6a 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -27,6 +27,7 @@ __authors__ = ( 'Johny Mo Swag', 'Axel Noack', 'Albert Kim', + 'Pierre Rudloff', ) __license__ = 'Public Domain' @@ -119,6 +120,7 @@ def parseOpts(overrideArguments=None): selection = optparse.OptionGroup(parser, 'Video Selection') authentication = optparse.OptionGroup(parser, 'Authentication Options') video_format = optparse.OptionGroup(parser, 'Video Format Options') + subtitles = optparse.OptionGroup(parser, 'Subtitle Options') downloader = optparse.OptionGroup(parser, 'Download Options') postproc = optparse.OptionGroup(parser, 'Post-processing Options') filesystem = optparse.OptionGroup(parser, 'Filesystem Options') @@ -185,25 +187,26 @@ def parseOpts(overrideArguments=None): action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download') video_format.add_option('-F', '--list-formats', action='store_true', dest='listformats', help='list all available formats (currently youtube only)') - video_format.add_option('--write-sub', '--write-srt', + + subtitles.add_option('--write-sub', '--write-srt', action='store_true', dest='writesubtitles', help='write subtitle file (currently youtube only)', default=False) - video_format.add_option('--write-auto-sub', '--write-automatic-sub', + subtitles.add_option('--write-auto-sub', '--write-automatic-sub', action='store_true', dest='writeautomaticsub', help='write automatic subtitle file (currently youtube only)', default=False) - video_format.add_option('--only-sub', + subtitles.add_option('--only-sub', action='store_true', dest='skip_download', help='[deprecated] alias of --skip-download', default=False) - video_format.add_option('--all-subs', + subtitles.add_option('--all-subs', action='store_true', dest='allsubtitles', - help='downloads all the available subtitles of the video (currently youtube only)', default=False) - video_format.add_option('--list-subs', + help='downloads all the available subtitles of the video', default=False) + subtitles.add_option('--list-subs', action='store_true', dest='listsubtitles', - help='lists all available subtitles for the video (currently youtube only)', default=False) - video_format.add_option('--sub-format', + help='lists all available subtitles for the video', default=False) + subtitles.add_option('--sub-format', action='store', dest='subtitlesformat', metavar='FORMAT', - help='subtitle format [srt/sbv/vtt] (default=srt) (currently youtube only)', default='srt') - video_format.add_option('--sub-lang', '--srt-lang', + help='subtitle format (default=srt) ([sbv/vtt] youtube only)', default='srt') + subtitles.add_option('--sub-lang', '--srt-lang', action='store', dest='subtitleslang', metavar='LANG', help='language of the subtitles to download (optional) use IETF language tags like \'en\'') @@ -320,6 +323,8 @@ def parseOpts(overrideArguments=None): help='keeps the video file on disk after the post-processing; the video is erased by default') postproc.add_option('--no-post-overwrites', action='store_true', dest='nopostoverwrites', default=False, help='do not overwrite post-processed files; the post-processed files are overwritten by default') + postproc.add_option('--embed-subs', action='store_true', dest='embedsubtitles', default=False, + help='embed subtitles in the video (only for mp4 videos)') parser.add_option_group(general) @@ -328,6 +333,7 @@ def parseOpts(overrideArguments=None): parser.add_option_group(filesystem) parser.add_option_group(verbosity) parser.add_option_group(video_format) + parser.add_option_group(subtitles) parser.add_option_group(authentication) parser.add_option_group(postproc) @@ -343,7 +349,7 @@ def parseOpts(overrideArguments=None): userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf') systemConf = _readOptions('/etc/youtube-dl.conf') userConf = _readOptions(userConfFile) - commandLineConf = sys.argv[1:] + commandLineConf = sys.argv[1:] argv = systemConf + userConf + commandLineConf opts, args = parser.parse_args(argv) if opts.verbose: @@ -377,7 +383,7 @@ def _real_main(argv=None): # Set user agent if opts.user_agent is not None: std_headers['User-Agent'] = opts.user_agent - + # Set referer if opts.referer is not None: std_headers['Referer'] = opts.referer @@ -607,6 +613,8 @@ def _real_main(argv=None): ydl.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, nopostoverwrites=opts.nopostoverwrites)) if opts.recodevideo: ydl.add_post_processor(FFmpegVideoConvertor(preferedformat=opts.recodevideo)) + if opts.embedsubtitles: + ydl.add_post_processor(FFmpegEmbedSubtitlePP(subtitlesformat=opts.subtitlesformat)) # Update version if opts.update_self: diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 84c02c2ed..b4db8f0bf 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -36,6 +36,7 @@ from .ign import IGNIE, OneUPIE from .ina import InaIE from .infoq import InfoQIE from .instagram import InstagramIE +from .jeuxvideo import JeuxVideoIE from .jukebox import JukeboxIE from .justintv import JustinTVIE from .kankan import KankanIE @@ -50,13 +51,16 @@ from .myspass import MySpassIE from .myvideo import MyVideoIE from .nba import NBAIE from .ooyala import OoyalaIE +from .pbs import PBSIE from .photobucket import PhotobucketIE from .pornotube import PornotubeIE from .rbmaradio import RBMARadioIE from .redtube import RedTubeIE from .ringtv import RingTVIE from .roxwel import RoxwelIE +from .rtlnow import RTLnowIE from .sina import SinaIE +from .slashdot import SlashdotIE from .soundcloud import SoundcloudIE, SoundcloudSetIE from .spiegel import SpiegelIE from .stanfordoc import StanfordOpenClassroomIE @@ -71,6 +75,7 @@ from .tudou import TudouIE from .tumblr import TumblrIE from .tutv import TutvIE from .ustream import UstreamIE +from .unistra import UnistraIE from .vbox7 import Vbox7IE from .veoh import VeohIE from .vevo import VevoIE diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py index 30b9c7549..8d4c93d6d 100644 --- a/youtube_dl/extractor/collegehumor.py +++ b/youtube_dl/extractor/collegehumor.py @@ -4,6 +4,7 @@ import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( compat_urllib_parse_urlparse, + determine_ext, ExtractorError, ) @@ -12,7 +13,7 @@ from ..utils import ( class CollegeHumorIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P<videoid>[0-9]+)/?(?P<shorttitle>.*)$' - _TEST = { + _TESTS = [{ u'url': u'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe', u'file': u'6902724.mp4', u'md5': u'1264c12ad95dca142a9f0bf7968105a0', @@ -20,7 +21,16 @@ class CollegeHumorIE(InfoExtractor): u'title': u'Comic-Con Cosplay Catastrophe', u'description': u'Fans get creative this year at San Diego. Too creative. And yes, that\'s really Joss Whedon.', }, - } + }, + { + u'url': u'http://www.collegehumor.com/video/3505939/font-conference', + u'file': u'3505939.mp4', + u'md5': u'c51ca16b82bb456a4397987791a835f5', + u'info_dict': { + u'title': u'Font Conference', + u'description': u'This video wasn\'t long enough, so we made it double-spaced.', + }, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -49,25 +59,29 @@ class CollegeHumorIE(InfoExtractor): info['description'] = videoNode.findall('./description')[0].text info['title'] = videoNode.findall('./caption')[0].text info['thumbnail'] = videoNode.findall('./thumbnail')[0].text - manifest_url = videoNode.findall('./file')[0].text + next_url = videoNode.findall('./file')[0].text except IndexError: raise ExtractorError(u'Invalid metadata XML file') - manifest_url += '?hdcore=2.10.3' - manifestXml = self._download_webpage(manifest_url, video_id, - u'Downloading XML manifest', - u'Unable to download video info XML') - - adoc = xml.etree.ElementTree.fromstring(manifestXml) - try: - media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0] - node_id = media_node.attrib['url'] - video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text - except IndexError as err: - raise ExtractorError(u'Invalid manifest file') + if next_url.endswith(u'manifest.f4m'): + manifest_url = next_url + '?hdcore=2.10.3' + manifestXml = self._download_webpage(manifest_url, video_id, + u'Downloading XML manifest', + u'Unable to download video info XML') - url_pr = compat_urllib_parse_urlparse(info['thumbnail']) + adoc = xml.etree.ElementTree.fromstring(manifestXml) + try: + media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0] + node_id = media_node.attrib['url'] + video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text + except IndexError as err: + raise ExtractorError(u'Invalid manifest file') + url_pr = compat_urllib_parse_urlparse(info['thumbnail']) + info['url'] = url_pr.scheme + '://' + url_pr.netloc + video_id[:-2].replace('.csmil','').replace(',','') + info['ext'] = 'mp4' + else: + # Old-style direct links + info['url'] = next_url + info['ext'] = determine_ext(info['url']) - info['url'] = url_pr.scheme + '://' + url_pr.netloc + video_id[:-2].replace('.csmil','').replace(',','') - info['ext'] = 'mp4' - return [info] + return info diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index da50abfc1..52c4483c9 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -47,7 +47,8 @@ class InfoExtractor(object): uploader_id: Nickname or id of the video uploader. location: Physical location of the video. player_url: SWF Player URL (used for rtmpdump). - subtitles: The subtitle file contents. + subtitles: The subtitle file contents as a dictionary in the format + {language: subtitles}. view_count: How many users have watched the video on the platform. urlhandle: [internal] The urlHandle to be used to download the file, like returned by urllib.request.urlopen @@ -77,7 +78,13 @@ class InfoExtractor(object): @classmethod def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(cls._VALID_URL, url) is not None + + # This does not use has/getattr intentionally - we want to know whether + # we have cached the regexp for *this* class, whereas getattr would also + # match the superclass + if '_VALID_URL_RE' not in cls.__dict__: + cls._VALID_URL_RE = re.compile(cls._VALID_URL) + return cls._VALID_URL_RE.match(url) is not None @classmethod def working(cls): diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 67a7e5f76..4508f0dfa 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -21,17 +21,14 @@ class FunnyOrDieIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', + video_url = self._search_regex(r'type: "video/mp4", src: "(.*?)"', webpage, u'video URL', flags=re.DOTALL) - title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", - r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL) - info = { 'id': video_id, 'url': video_url, 'ext': 'mp4', - 'title': title, + 'title': self._og_search_title(webpage), 'description': self._og_search_description(webpage), } return [info] diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index b633e896c..da016f7ee 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -107,8 +107,13 @@ class GenericIE(InfoExtractor): return new_url def _real_extract(self, url): - new_url = self._test_redirect(url) - if new_url: return [self.url_result(new_url)] + try: + new_url = self._test_redirect(url) + if new_url: + return [self.url_result(new_url)] + except compat_urllib_error.HTTPError: + # This may be a stupid server that doesn't like HEAD, our UA, or so + pass video_id = url.split('/')[-1] try: @@ -145,6 +150,9 @@ class GenericIE(InfoExtractor): if m_video_type is not None: mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage) if mobj is None: + # HTML5 video + mobj = re.search(r'<video[^<]*>.*?<source .*?src="([^"]+)"', webpage, flags=re.DOTALL) + if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) # It's possible that one of the regexes diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py new file mode 100644 index 000000000..4327bc13d --- /dev/null +++ b/youtube_dl/extractor/jeuxvideo.py @@ -0,0 +1,47 @@ +# coding: utf-8 + +import json +import re +import xml.etree.ElementTree + +from .common import InfoExtractor + +class JeuxVideoIE(InfoExtractor): + _VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)-\d+\.htm' + + _TEST = { + u'url': u'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm', + u'file': u'5182.mp4', + u'md5': u'e0fdb0cd3ce98713ef9c1e1e025779d0', + u'info_dict': { + u'title': u'GC 2013 : Tearaway nous présente ses papiers d\'identité', + u'description': u'Lorsque les développeurs de LittleBigPlanet proposent un nouveau titre, on ne peut que s\'attendre à un résultat original et fort attrayant.\n', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + title = re.match(self._VALID_URL, url).group(1) + webpage = self._download_webpage(url, title) + m_download = re.search(r'<param name="flashvars" value="config=(.*?)" />', webpage) + + xml_link = m_download.group(1) + + id = re.search(r'http://www.jeuxvideo.com/config/\w+/0011/(.*?)/\d+_player\.xml', xml_link).group(1) + + xml_config = self._download_webpage(xml_link, title, + 'Downloading XML config') + config = xml.etree.ElementTree.fromstring(xml_config.encode('utf-8')) + info = re.search(r'<format\.json>(.*?)</format\.json>', + xml_config, re.MULTILINE|re.DOTALL).group(1) + info = json.loads(info)['versions'][0] + + video_url = 'http://video720.jeuxvideo.com/' + info['file'] + + return {'id': id, + 'title' : config.find('titre_video').text, + 'ext' : 'mp4', + 'url' : video_url, + 'description': self._og_search_description(webpage), + 'thumbnail': config.find('image').text, + } diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py new file mode 100644 index 000000000..65462d867 --- /dev/null +++ b/youtube_dl/extractor/pbs.py @@ -0,0 +1,34 @@ +import re +import json + +from .common import InfoExtractor + + +class PBSIE(InfoExtractor): + _VALID_URL = r'https?://video.pbs.org/video/(?P<id>\d+)/?' + + _TEST = { + u'url': u'http://video.pbs.org/video/2365006249/', + u'file': u'2365006249.mp4', + u'md5': 'ce1888486f0908d555a8093cac9a7362', + u'info_dict': { + u'title': u'A More Perfect Union', + u'description': u'md5:ba0c207295339c8d6eced00b7c363c6a', + u'duration': 3190, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id + info_page = self._download_webpage(info_url, video_id) + info =json.loads(info_page) + return {'id': video_id, + 'title': info['title'], + 'url': info['alternate_encoding']['url'], + 'ext': 'mp4', + 'description': info['program'].get('description'), + 'thumbnail': info.get('image_url'), + 'duration': info.get('duration'), + } diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py new file mode 100644 index 000000000..2f134e6a7 --- /dev/null +++ b/youtube_dl/extractor/rtlnow.py @@ -0,0 +1,113 @@ +# encoding: utf-8 +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + ExtractorError, +) + +class RTLnowIE(InfoExtractor): + """Information Extractor for RTLnow, RTL2now and VOXnow""" + _VALID_URL = r'(?:http://)?(?P<url>(?P<base_url>rtl(?:(?P<is_rtl2>2)|-)now\.rtl(?(is_rtl2)2|)\.de/|(?:www\.)?voxnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' + _TESTS = [{ + u'url': u'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1', + u'file': u'90419.flv', + u'info_dict': { + u'upload_date': u'20070416', + u'title': u'Ahornallee - Folge 1 - Der Einzug', + u'description': u'Folge 1 - Der Einzug', + }, + u'params': { + u'skip_download': True, + }, + u'skip': u'Only works from Germany', + }, + { + u'url': u'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5', + u'file': u'69756.flv', + u'info_dict': { + u'upload_date': u'20120519', + u'title': u'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit...', + u'description': u'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.', + u'thumbnail': u'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg', + }, + u'params': { + u'skip_download': True, + }, + u'skip': u'Only works from Germany', + }, + { + u'url': u'www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17', + u'file': u'13883.flv', + u'info_dict': { + u'upload_date': u'20090627', + u'title': u'Voxtours - Südafrika-Reporter II', + u'description': u'Südafrika-Reporter II', + }, + u'params': { + u'skip_download': True, + }, + }] + + def _real_extract(self,url): + mobj = re.match(self._VALID_URL, url) + + webpage_url = u'http://' + mobj.group('url') + video_page_url = u'http://' + mobj.group('base_url') + video_id = mobj.group(u'video_id') + + webpage = self._download_webpage(webpage_url, video_id) + + note_m = re.search(r'''(?sx) + <div[ ]style="margin-left:[ ]20px;[ ]font-size:[ ]13px;">(.*?) + <div[ ]id="playerteaser">''', webpage) + if note_m: + msg = clean_html(note_m.group(1)) + raise ExtractorError(msg) + + video_title = self._html_search_regex(r'<title>(?P<title>[^<]+)</title>', + webpage, u'title') + playerdata_url = self._html_search_regex(r'\'playerdata\': \'(?P<playerdata_url>[^\']+)\'', + webpage, u'playerdata_url') + + playerdata = self._download_webpage(playerdata_url, video_id) + mobj = re.search(r'<title><!\[CDATA\[(?P<description>.+?)\s+- (?:Sendung )?vom (?P<upload_date_d>[0-9]{2})\.(?P<upload_date_m>[0-9]{2})\.(?:(?P<upload_date_Y>[0-9]{4})|(?P<upload_date_y>[0-9]{2})) [0-9]{2}:[0-9]{2} Uhr\]\]></title>', playerdata) + if mobj: + video_description = mobj.group(u'description') + if mobj.group('upload_date_Y'): + video_upload_date = mobj.group('upload_date_Y') + else: + video_upload_date = u'20' + mobj.group('upload_date_y') + video_upload_date += mobj.group('upload_date_m')+mobj.group('upload_date_d') + else: + video_description = None + video_upload_date = None + self._downloader.report_warning(u'Unable to extract description and upload date') + + # Thumbnail: not every video has an thumbnail + mobj = re.search(r'<meta property="og:image" content="(?P<thumbnail>[^"]+)">', webpage) + if mobj: + video_thumbnail = mobj.group(u'thumbnail') + else: + video_thumbnail = None + + mobj = re.search(r'<filename [^>]+><!\[CDATA\[(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>[^\]]+)\]\]></filename>', playerdata) + if mobj is None: + raise ExtractorError(u'Unable to extract media URL') + video_url = mobj.group(u'url') + video_play_path = u'mp4:' + mobj.group(u'play_path') + video_player_url = video_page_url + u'includes/vodplayer.swf' + + return [{ + 'id': video_id, + 'url': video_url, + 'play_path': video_play_path, + 'page_url': video_page_url, + 'player_url': video_player_url, + 'ext': 'flv', + 'title': video_title, + 'description': video_description, + 'upload_date': video_upload_date, + 'thumbnail': video_thumbnail, + }] diff --git a/youtube_dl/extractor/slashdot.py b/youtube_dl/extractor/slashdot.py new file mode 100644 index 000000000..2cba53076 --- /dev/null +++ b/youtube_dl/extractor/slashdot.py @@ -0,0 +1,23 @@ +import re + +from .common import InfoExtractor + + +class SlashdotIE(InfoExtractor): + _VALID_URL = r'https?://tv.slashdot.org/video/\?embed=(?P<id>.*?)(&|$)' + + _TEST = { + u'url': u'http://tv.slashdot.org/video/?embed=JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz', + u'file': u'JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz.mp4', + u'md5': u'd2222e7a4a4c1541b3e0cf732fb26735', + u'info_dict': { + u'title': u' Meet the Stampede Supercomputing Cluster\'s Administrator', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + ooyala_url = self._search_regex(r'<script src="(.*?)"', webpage, 'ooyala url') + return self.url_result(ooyala_url, 'Ooyala') diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 7c9f1c6b6..5f3a5540d 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -4,6 +4,7 @@ import re from .common import InfoExtractor from ..utils import ( compat_str, + compat_urlparse, ExtractorError, unified_strdate, @@ -22,6 +23,7 @@ class SoundcloudIE(InfoExtractor): _VALID_URL = r'''^(?:https?://)? (?:(?:(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)/?(?:[?].*)?$) |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)) + |(?P<widget>w.soundcloud.com/player/?.*?url=.*) ) ''' IE_NAME = u'soundcloud' @@ -79,6 +81,9 @@ class SoundcloudIE(InfoExtractor): if track_id is not None: info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID full_title = track_id + elif mobj.group('widget'): + query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + return self.url_result(query['url'][0], ie='Soundcloud') else: # extract uploader (which is in the url) uploader = mobj.group(1) diff --git a/youtube_dl/extractor/statigram.py b/youtube_dl/extractor/statigram.py index b8e6b3bf9..1ea4a9f2f 100644 --- a/youtube_dl/extractor/statigram.py +++ b/youtube_dl/extractor/statigram.py @@ -5,13 +5,13 @@ from .common import InfoExtractor class StatigramIE(InfoExtractor): _VALID_URL = r'(?:http://)?(?:www\.)?statigr\.am/p/([^/]+)' _TEST = { - u'url': u'http://statigr.am/p/484091715184808010_284179915', - u'file': u'484091715184808010_284179915.mp4', - u'md5': u'deda4ff333abe2e118740321e992605b', + u'url': u'http://statigr.am/p/522207370455279102_24101272', + u'file': u'522207370455279102_24101272.mp4', + u'md5': u'6eb93b882a3ded7c378ee1d6884b1814', u'info_dict': { - u"uploader_id": u"videoseconds", - u"title": u"Instagram photo by @videoseconds" - } + u'uploader_id': u'aguynamedpatrick', + u'title': u'Instagram photo by @aguynamedpatrick (Patrick Janelle)', + }, } def _real_extract(self, url): diff --git a/youtube_dl/extractor/unistra.py b/youtube_dl/extractor/unistra.py new file mode 100644 index 000000000..5ba0a9061 --- /dev/null +++ b/youtube_dl/extractor/unistra.py @@ -0,0 +1,32 @@ +import re + +from .common import InfoExtractor + +class UnistraIE(InfoExtractor): + _VALID_URL = r'http://utv.unistra.fr/(?:index|video).php\?id_video\=(\d+)' + + _TEST = { + u'url': u'http://utv.unistra.fr/video.php?id_video=154', + u'file': u'154.mp4', + u'md5': u'736f605cfdc96724d55bb543ab3ced24', + u'info_dict': { + u'title': u'M!ss Yella', + u'description': u'md5:75e8439a3e2981cd5d4b6db232e8fdfc', + }, + } + + def _real_extract(self, url): + id = re.match(self._VALID_URL, url).group(1) + webpage = self._download_webpage(url, id) + file = re.search(r'file: "(.*?)",', webpage).group(1) + title = self._html_search_regex(r'<title>UTV - (.*?)</', webpage, u'title') + + video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file + + return {'id': id, + 'title': title, + 'ext': 'mp4', + 'url': video_url, + 'description': self._html_search_regex(r'<meta name="Description" content="(.*?)"', webpage, u'description', flags=re.DOTALL), + 'thumbnail': self._search_regex(r'image: "(.*?)"', webpage, u'thumbnail'), + } diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 67537eae5..70408c4f0 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -8,18 +8,18 @@ from ..utils import ( class VevoIE(InfoExtractor): """ - Accecps urls from vevo.com or in the format 'vevo:{id}' + Accepts urls from vevo.com or in the format 'vevo:{id}' (currently used by MTVIE) """ - _VALID_URL = r'((http://www.vevo.com/watch/.*?/.*?/)|(vevo:))(?P<id>.*)$' + _VALID_URL = r'((http://www.vevo.com/watch/.*?/.*?/)|(vevo:))(?P<id>.*?)(\?|$)' _TEST = { u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280', u'file': u'GB1101300280.mp4', u'md5': u'06bea460acb744eab74a9d7dcb4bfd61', u'info_dict': { - u"upload_date": u"20130624", - u"uploader": u"Hurts", - u"title": u"Somebody To Die For" + u"upload_date": u"20130624", + u"uploader": u"Hurts", + u"title": u"Somebody to Die For" } } diff --git a/youtube_dl/extractor/videofyme.py b/youtube_dl/extractor/videofyme.py index 04106672b..94f64ffa5 100644 --- a/youtube_dl/extractor/videofyme.py +++ b/youtube_dl/extractor/videofyme.py @@ -14,7 +14,7 @@ class VideofyMeIE(InfoExtractor): _TEST = { u'url': u'http://www.videofy.me/thisisvideofyme/1100701', u'file': u'1100701.mp4', - u'md5': u'2046dd5758541d630bfa93e741e2fd79', + u'md5': u'c77d700bdc16ae2e9f3c26019bd96143', u'info_dict': { u'title': u'This is VideofyMe', u'description': None, @@ -32,9 +32,8 @@ class VideofyMeIE(InfoExtractor): config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) video = config.find('video') sources = video.find('sources') - url_node = find_xpath_attr(sources, 'source', 'id', 'HQ on') - if url_node is None: - url_node = find_xpath_attr(sources, 'source', 'id', 'HQ off') + url_node = next(node for node in [find_xpath_attr(sources, 'source', 'id', 'HQ %s' % key) + for key in ['on', 'av', 'off']] if node is not None) video_url = url_node.find('url').text return {'id': video_id, diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index cc9c8d018..512e06e2a 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -20,18 +20,31 @@ class VimeoIE(InfoExtractor): _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)(?:[?].*)?$' _NETRC_MACHINE = 'vimeo' IE_NAME = u'vimeo' - _TEST = { - u'url': u'http://vimeo.com/56015672', - u'file': u'56015672.mp4', - u'md5': u'8879b6cc097e987f02484baf890129e5', - u'info_dict': { - u"upload_date": u"20121220", - u"description": u"This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", - u"uploader_id": u"user7108434", - u"uploader": u"Filippo Valsorda", - u"title": u"youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550" - } - } + _TESTS = [ + { + u'url': u'http://vimeo.com/56015672', + u'file': u'56015672.mp4', + u'md5': u'8879b6cc097e987f02484baf890129e5', + u'info_dict': { + u"upload_date": u"20121220", + u"description": u"This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", + u"uploader_id": u"user7108434", + u"uploader": u"Filippo Valsorda", + u"title": u"youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", + }, + }, + { + u'url': u'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876', + u'file': u'68093876.mp4', + u'md5': u'3b5ca6aa22b60dfeeadf50b72e44ed82', + u'note': u'Vimeo Pro video (#1197)', + u'info_dict': { + u'uploader_id': u'openstreetmapus', + u'uploader': u'OpenStreetMap US', + u'title': u'Andy Allan - Putting the Carto into OpenStreetMap Cartography', + }, + }, + ] def _login(self): (username, password) = self._get_login_info() @@ -83,7 +96,9 @@ class VimeoIE(InfoExtractor): video_id = mobj.group('id') if not mobj.group('proto'): url = 'https://' + url - if mobj.group('direct_link') or mobj.group('pro'): + elif mobj.group('pro'): + url = 'http://player.vimeo.com/video/' + video_id + elif mobj.group('direct_link'): url = 'https://vimeo.com/' + video_id # Retrieve video webpage to extract further information diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b191021db..446d53f64 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -141,7 +141,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?: # the various things that can precede the ID: (?:(?:v|embed|e)/) # v/ or embed/ or e/ |(?: # or the v= param in all its forms - (?:watch|movie(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) + (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) (?:\?|\#!?) # the params delimiter ? or # or #! (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx) v= @@ -155,11 +155,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Listed in order of quality _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13', '95', '94', '93', '92', '132', '151', + # 3D '85', '84', '102', '83', '101', '82', '100', + # Dash video + '138', '137', '248', '136', '247', '135', '246', + '245', '244', '134', '243', '133', '242', '160', + # Dash audio + '141', '172', '140', '171', '139', ] _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13', '95', '94', '93', '92', '132', '151', '85', '102', '84', '101', '83', '100', '82', + # Dash video + '138', '248', '137', '247', '136', '246', '245', + '244', '135', '243', '134', '242', '133', '160', + # Dash audio + '172', '141', '171', '140', '139', ] _video_extensions = { '13': '3gp', @@ -181,7 +192,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '100': 'webm', '101': 'webm', '102': 'webm', - + # videos that use m3u8 '92': 'mp4', '93': 'mp4', @@ -190,6 +201,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '96': 'mp4', '132': 'mp4', '151': 'mp4', + + # Dash mp4 + '133': 'mp4', + '134': 'mp4', + '135': 'mp4', + '136': 'mp4', + '137': 'mp4', + '138': 'mp4', + '139': 'mp4', + '140': 'mp4', + '141': 'mp4', + '160': 'mp4', + + # Dash webm + '171': 'webm', + '172': 'webm', + '242': 'webm', + '243': 'webm', + '244': 'webm', + '245': 'webm', + '246': 'webm', + '247': 'webm', + '248': 'webm', } _video_dimensions = { '5': '240x400', @@ -217,11 +251,58 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '96': '1080p', '100': '360p', '101': '480p', - '102': '720p', + '102': '720p', '132': '240p', '151': '72p', + '133': '240p', + '134': '360p', + '135': '480p', + '136': '720p', + '137': '1080p', + '138': '>1080p', + '139': '48k', + '140': '128k', + '141': '256k', + '160': '192p', + '171': '128k', + '172': '256k', + '242': '240p', + '243': '360p', + '244': '480p', + '245': '480p', + '246': '480p', + '247': '720p', + '248': '1080p', + } + _special_itags = { + '82': '3D', + '83': '3D', + '84': '3D', + '85': '3D', + '100': '3D', + '101': '3D', + '102': '3D', + '133': 'DASH Video', + '134': 'DASH Video', + '135': 'DASH Video', + '136': 'DASH Video', + '137': 'DASH Video', + '138': 'DASH Video', + '139': 'DASH Audio', + '140': 'DASH Audio', + '141': 'DASH Audio', + '160': 'DASH Video', + '171': 'DASH Audio', + '172': 'DASH Audio', + '242': 'DASH Video', + '243': 'DASH Video', + '244': 'DASH Video', + '245': 'DASH Video', + '246': 'DASH Video', + '247': 'DASH Video', + '248': 'DASH Video', } - _3d_itags = ['85', '84', '102', '83', '101', '82', '100'] + IE_NAME = u'youtube' _TESTS = [ { @@ -255,7 +336,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): u"upload_date": u"20120506", u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]", u"description": u"md5:b085c9804f5ab69f4adea963a2dceb3c", - u"uploader": u"IconaPop", + u"uploader": u"Icona Pop", u"uploader_id": u"IconaPop" } }, @@ -335,22 +416,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83] elif len(s) == 90: return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81] + elif len(s) == 89: + return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1] elif len(s) == 88: return s[48] + s[81:67:-1] + s[82] + s[66:62:-1] + s[85] + s[61:48:-1] + s[67] + s[47:12:-1] + s[3] + s[11:3:-1] + s[2] + s[12] elif len(s) == 87: - return s[83:53:-1] + s[3] + s[52:40:-1] + s[86] + s[39:10:-1] + s[0] + s[9:3:-1] + s[53] + return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:] elif len(s) == 86: - return s[83:85] + s[26] + s[79:46:-1] + s[85] + s[45:36:-1] + s[30] + s[35:30:-1] + s[46] + s[29:26:-1] + s[82] + s[25:1:-1] + return s[5:20] + s[2] + s[21:] elif len(s) == 85: - return s[2:8] + s[0] + s[9:21] + s[65] + s[22:65] + s[84] + s[66:82] + s[21] + return s[83:34:-1] + s[0] + s[33:27:-1] + s[3] + s[26:19:-1] + s[34] + s[18:3:-1] + s[27] elif len(s) == 84: return s[83:27:-1] + s[0] + s[26:5:-1] + s[2:0:-1] + s[27] elif len(s) == 83: - return s[:15] + s[80] + s[16:80] + s[15] + return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0] elif len(s) == 82: - return s[36] + s[79:67:-1] + s[81] + s[66:40:-1] + s[33] + s[39:36:-1] + s[40] + s[35] + s[0] + s[67] + s[32:0:-1] + s[34] + return s[1:19] + s[0] + s[20:68] + s[19] + s[69:82] elif len(s) == 81: return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9] + elif len(s) == 80: + return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80] elif len(s) == 79: return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9] @@ -373,11 +458,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): try: sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - return (u'unable to download video subtitles: %s' % compat_str(err), None) + self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err)) + return {} sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list) if not sub_lang_list: - return (u'video doesn\'t have subtitles', None) + self._downloader.report_warning(u'video doesn\'t have subtitles') + return {} return sub_lang_list def _list_available_subtitles(self, video_id): @@ -386,8 +473,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _request_subtitle(self, sub_lang, sub_name, video_id, format): """ - Return tuple: - (error_message, sub_lang, sub) + Return the subtitle as a string or None if they are not found """ self.report_video_subtitles_request(video_id, sub_lang, format) params = compat_urllib_parse.urlencode({ @@ -400,10 +486,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): try: sub = compat_urllib_request.urlopen(url).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - return (u'unable to download video subtitles: %s' % compat_str(err), None, None) + self._downloader.report_warning(u'unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err))) + return if not sub: - return (u'Did not fetch video subtitles', None, None) - return (None, sub_lang, sub) + self._downloader.report_warning(u'Did not fetch video subtitles') + return + return sub def _request_automatic_caption(self, video_id, webpage): """We need the webpage for getting the captions url, pass it as an @@ -414,7 +502,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): mobj = re.search(r';ytplayer.config = ({.*?});', webpage) err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang if mobj is None: - return [(err_msg, None, None)] + self._downloader.report_warning(err_msg) + return {} player_config = json.loads(mobj.group(1)) try: args = player_config[u'args'] @@ -429,40 +518,40 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }) subtitles_url = caption_url + '&' + params sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions') - return [(None, sub_lang, sub)] - except KeyError: - return [(err_msg, None, None)] - - def _extract_subtitle(self, video_id): + return {sub_lang: sub} + # An extractor error can be raise by the download process if there are + # no automatic captions but there are subtitles + except (KeyError, ExtractorError): + self._downloader.report_warning(err_msg) + return {} + + def _extract_subtitles(self, video_id): """ - Return a list with a tuple: - [(error_message, sub_lang, sub)] + Return a dictionary: {language: subtitles} or {} if the subtitles + couldn't be found """ sub_lang_list = self._get_available_subtitles(video_id) sub_format = self._downloader.params.get('subtitlesformat') - if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles - return [(sub_lang_list[0], None, None)] - if self._downloader.params.get('subtitleslang', False): - sub_lang = self._downloader.params.get('subtitleslang') - elif 'en' in sub_lang_list: - sub_lang = 'en' + if not sub_lang_list: #There was some error, it didn't get the available subtitles + return {} + if self._downloader.params.get('allsubtitles', False): + pass else: - sub_lang = list(sub_lang_list.keys())[0] - if not sub_lang in sub_lang_list: - return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)] - - subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) - return [subtitle] - - def _extract_all_subtitles(self, video_id): - sub_lang_list = self._get_available_subtitles(video_id) - sub_format = self._downloader.params.get('subtitlesformat') - if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles - return [(sub_lang_list[0], None, None)] - subtitles = [] + if self._downloader.params.get('subtitleslang', False): + sub_lang = self._downloader.params.get('subtitleslang') + elif 'en' in sub_lang_list: + sub_lang = 'en' + else: + sub_lang = list(sub_lang_list.keys())[0] + if not sub_lang in sub_lang_list: + self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang) + return {} + sub_lang_list = {sub_lang: sub_lang_list[sub_lang]} + subtitles = {} for sub_lang in sub_lang_list: subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) - subtitles.append(subtitle) + if subtitle: + subtitles[sub_lang] = subtitle return subtitles def _print_formats(self, formats): @@ -470,7 +559,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): for x in formats: print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'), - ' (3D)' if x in self._3d_itags else '')) + ' ('+self._special_itags[x]+')' if x in self._special_itags else '')) def _extract_id(self, url): mobj = re.match(self._VALID_URL, url, re.VERBOSE) @@ -653,25 +742,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # subtitles video_subtitles = None - if self._downloader.params.get('writesubtitles', False): - video_subtitles = self._extract_subtitle(video_id) - if video_subtitles: - (sub_error, sub_lang, sub) = video_subtitles[0] - if sub_error: - self._downloader.report_warning(sub_error) - - if self._downloader.params.get('writeautomaticsub', False): + if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False): + video_subtitles = self._extract_subtitles(video_id) + elif self._downloader.params.get('writeautomaticsub', False): video_subtitles = self._request_automatic_caption(video_id, video_webpage) - (sub_error, sub_lang, sub) = video_subtitles[0] - if sub_error: - self._downloader.report_warning(sub_error) - - if self._downloader.params.get('allsubtitles', False): - video_subtitles = self._extract_all_subtitles(video_id) - for video_subtitle in video_subtitles: - (sub_error, sub_lang, sub) = video_subtitle - if sub_error: - self._downloader.report_warning(sub_error) if self._downloader.params.get('listsubtitles', False): self._list_available_subtitles(video_id) @@ -697,6 +771,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if m_s is not None: self.to_screen(u'%s: Encrypted signatures detected.' % video_id) video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']] + m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u'')) + if m_s is not None: + if 'url_encoded_fmt_stream_map' in video_info: + video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts'] + else: + video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']] + elif 'adaptive_fmts' in video_info: + if 'url_encoded_fmt_stream_map' in video_info: + video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0] + else: + video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts'] except ValueError: pass @@ -756,7 +841,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension, self._video_dimensions.get(format_param, '???'), - ' (3D)' if format_param in self._3d_itags else '') + ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '') results.append({ 'id': video_id, diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 59eeaf4a8..52cfb8a6d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -497,7 +497,7 @@ class ExtractorError(Exception): if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError): expected = True if not expected: - msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output.' + msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.' super(ExtractorError, self).__init__(msg) self.traceback = tb @@ -657,6 +657,9 @@ def determine_ext(url, default_ext=u'unknown_video'): else: return default_ext +def subtitles_filename(filename, sub_lang, sub_format): + return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format + def date_from_str(date_str): """ Return a datetime object from a string in the format YYYYMMDD or diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 82c24646c..3536e923f 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.08.08' +__version__ = '2013.08.22' |