diff options
| author | Philipp Hagemeister <phihag@phihag.de> | 2014-01-23 04:04:35 +0100 | 
|---|---|---|
| committer | Philipp Hagemeister <phihag@phihag.de> | 2014-01-23 04:04:39 +0100 | 
| commit | 22ff1c4a9374b45359db5d9bab6abf26aaaf0f73 (patch) | |
| tree | 58d8201c1448df6e35c9a3f9d7e7cbeebad394f9 | |
| parent | 35409e11014da3fddd3df405e1503dba91f1f208 (diff) | |
[xhamster] Further simplification
| -rw-r--r-- | youtube_dl/extractor/xhamster.py | 52 | 
1 files changed, 25 insertions, 27 deletions
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 833d2d1b4..d317f29f2 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -6,7 +6,6 @@ from .common import InfoExtractor  from ..utils import (      compat_urllib_parse,      unescapeHTML, -    determine_ext,      ExtractorError,  ) @@ -16,11 +15,11 @@ class XHamsterIE(InfoExtractor):      _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'      _TESTS = [{          'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', -        'file': '1509445.flv', -        'md5': '9f48e0e8d58e3076bb236ff412ab62fa', +        'file': '1509445.mp4', +        'md5': '8281348b8d3c53d39fffb377d24eac4e',          'info_dict': { -            "upload_date": "20121014",  -            "uploader_id": "Ruseful2011",  +            "upload_date": "20121014", +            "uploader_id": "Ruseful2011",              "title": "FemaleAgent Shy beauty takes the bait",              "age_limit": 18,          } @@ -41,7 +40,7 @@ class XHamsterIE(InfoExtractor):          def extract_video_url(webpage):              mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)              if mobj is None: -                raise ExtractorError(u'Unable to extract media URL') +                raise ExtractorError('Unable to extract media URL')              if len(mobj.group('server')) == 0:                  return compat_urllib_parse.unquote(mobj.group('file'))              else: @@ -55,7 +54,7 @@ class XHamsterIE(InfoExtractor):                  return mp4.group(1)          def is_hd(webpage): -            return webpage.find('<div class=\'icon iconHD\'') != -1 +            return '<div class=\'icon iconHD\'' in webpage          mobj = re.match(self._VALID_URL, url) @@ -64,61 +63,60 @@ class XHamsterIE(InfoExtractor):          mrss_url = 
'http://xhamster.com/movies/%s/%s.html' % (video_id, seo)          webpage = self._download_webpage(mrss_url, video_id) -        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>', -            webpage, 'title') +        video_title = self._html_search_regex( +            r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage, 'title')          # Only a few videos have an description -        mobj = re.search('<span>Description: </span>(?P<description>[^<]+)', webpage) -        if mobj: -            video_description = unescapeHTML(mobj.group('description')) -        else: -            video_description = None +        mobj = re.search(r'<span>Description: </span>([^<]+)', webpage) +        video_description = mobj.group(1) if mobj else None          mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)          if mobj:              video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')          else:              video_upload_date = None -            self._downloader.report_warning(u'Unable to extract upload date') +            self._downloader.report_warning('Unable to extract upload date') -        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)', -            webpage, 'uploader id', default=u'anonymous') +        video_uploader_id = self._html_search_regex( +            r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)', +            webpage, 'uploader id', default='anonymous') -        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'', +        video_thumbnail = self._search_regex( +            r'\'image\':\'(?P<thumbnail>[^\']+)\'',              webpage, 'thumbnail', fatal=False)          age_limit = self._rta_search(webpage)          hd = is_hd(webpage) -          video_url = 
extract_video_url(webpage)          formats = [{              'url': video_url, -            'ext': determine_ext(video_url), -            'format': 'hd' if hd else 'sd',              'format_id': 'hd' if hd else 'sd', +            'preference': 0,          }]          video_mp4_url = extract_mp4_video_url(webpage) -        if (not video_mp4_url is None) and (formats[0]['ext'] != 'mp4'): +        if video_mp4_url is not None:              formats.append({                  'url': video_mp4_url,                  'ext': 'mp4', -                'format': 'hd' if hd else 'sd', -                'format_id': 'hd' if hd else 'sd', +                'format_id': 'mp4-hd' if hd else 'mp4-sd', +                'preference': 1,              })          if not hd: -            webpage = self._download_webpage(mrss_url+'?hd', video_id) +            webpage = self._download_webpage( +                mrss_url + '?hd', video_id, note='Downloading HD webpage')              if is_hd(webpage):                  video_url = extract_video_url(webpage)                  formats.append({                      'url': video_url, -                    'ext': determine_ext(video_url), -                    'format': 'hd',                      'format_id': 'hd', +                    'preference': 2,                  }) +        self._sort_formats(formats) +          return {              'id': video_id,              'title': video_title,  | 
