diff options
author | Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com> | 2015-04-17 19:02:49 +0200 |
---|---|---|
committer | Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com> | 2015-04-17 19:02:49 +0200 |
commit | c5826a491b7b214a7e81030ad53103c4aca04dc2 (patch) | |
tree | 290b8a7bfdc2685969b680fb8706f96b1dd59d89 | |
parent | d8e7ef04dcb583f3271a6b6a099a3da2e650fb45 (diff) |
[mixcloud] Simplify url extraction
On the tracks I tested, the server number in the URL from the webpage is valid
for the mp3 or the m4a file and any other number is invalid, so it's a
waste of time to check them.
-rw-r--r-- | youtube_dl/extractor/mixcloud.py | 72 |
1 files changed, 15 insertions, 57 deletions
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 84f291558..425a4ccf1 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -1,7 +1,6 @@ from __future__ import unicode_literals import re -import itertools from .common import InfoExtractor from ..compat import ( @@ -46,20 +45,16 @@ class MixcloudIE(InfoExtractor): }, }] - def _get_url(self, track_id, template_url, server_number): - boundaries = (1, 30) - for nr in server_numbers(server_number, boundaries): - url = template_url % nr - try: - # We only want to know if the request succeed - # don't download the whole file - self._request_webpage( - HEADRequest(url), track_id, - 'Checking URL %d/%d ...' % (nr, boundaries[-1])) - return url - except ExtractorError: - pass - return None + def _check_url(self, url, track_id, ext): + try: + # We only want to know if the request succeed + # don't download the whole file + self._request_webpage( + HEADRequest(url), track_id, + 'Trying %s URL' % ext) + return True + except ExtractorError: + return False def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -72,15 +67,10 @@ class MixcloudIE(InfoExtractor): preview_url = self._search_regex( r'\s(?:data-preview-url|m-preview)="([^"]+)"', webpage, 'preview url') song_url = preview_url.replace('/previews/', '/c/originals/') - server_number = int(self._search_regex(r'stream(\d+)', song_url, 'server number')) - template_url = re.sub(r'(stream\d*)', 'stream%d', song_url) - final_song_url = self._get_url(track_id, template_url, server_number) - if final_song_url is None: - self.to_screen('Trying with m4a extension') - template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/') - final_song_url = self._get_url(track_id, template_url, server_number) - if final_song_url is None: - raise ExtractorError('Unable to extract track url') + if not self._check_url(song_url, track_id, 'mp3'): + song_url = song_url.replace('.mp3', 
'.m4a').replace('originals/', 'm4a/64/') + if not self._check_url(song_url, track_id, 'm4a'): + raise ExtractorError('Unable to extract track url') PREFIX = ( r'm-play-on-spacebar[^>]+' @@ -107,7 +97,7 @@ class MixcloudIE(InfoExtractor): return { 'id': track_id, 'title': title, - 'url': final_song_url, + 'url': song_url, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, @@ -115,35 +105,3 @@ class MixcloudIE(InfoExtractor): 'view_count': view_count, 'like_count': like_count, } - - -def server_numbers(first, boundaries): - """ Server numbers to try in descending order of probable availability. - Starting from first (i.e. the number of the server hosting the preview file) - and going further and further up to the higher boundary and down to the - lower one in an alternating fashion. Namely: - - server_numbers(2, (1, 5)) - - # Where the preview server is 2, min number is 1 and max is 5. - # Yields: 2, 3, 1, 4, 5 - - Why not random numbers or increasing sequences? Since from what I've seen, - full length files seem to be hosted on servers whose number is closer to - that of the preview; to be confirmed. - """ - zip_longest = getattr(itertools, 'zip_longest', None) - if zip_longest is None: - # python 2.x - zip_longest = itertools.izip_longest - - if len(boundaries) != 2: - raise ValueError("boundaries should be a two-element tuple") - min, max = boundaries - highs = range(first + 1, max + 1) - lows = range(first - 1, min - 1, -1) - rest = filter( - None, itertools.chain.from_iterable(zip_longest(highs, lows))) - yield first - for n in rest: - yield n |