diff options
| author | Sergey M․ <dstftw@gmail.com> | 2014-07-16 20:40:28 +0700 | 
|---|---|---|
| committer | Sergey M․ <dstftw@gmail.com> | 2014-07-16 20:40:28 +0700 | 
| commit | 7bb49d1057527776c3b4b4ccf2bd995a291c883d (patch) | |
| tree | d0331d889378c26de15e649c983735cd66f51d60 | |
| parent | 1aa42fedee7df58ccd4e39b04a2a0438c5630f03 (diff) | |
[mlb] Extract more metadata and all formats, provide more tests
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/mlb.py | 132 | 
2 files changed, 81 insertions, 53 deletions
```diff
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 14133c315..c5961cab9 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -170,7 +170,7 @@ from .metacafe import MetacafeIE
 from .metacritic import MetacriticIE
 from .mit import TechTVMITIE, MITIE, OCWMITIE
 from .mixcloud import MixcloudIE
-from .mlb import MlbIE
+from .mlb import MLBIE
 from .mpora import MporaIE
 from .mofosex import MofosexIE
 from .mooshare import MooshareIE
diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py
index 61ba58843..18ab2c135 100644
--- a/youtube_dl/extractor/mlb.py
+++ b/youtube_dl/extractor/mlb.py
@@ -3,72 +3,100 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..utils import (
+    parse_duration,
+    parse_iso8601,
+    find_xpath_attr,
+)
 
 
-class MlbIE(InfoExtractor):
-    _VALID_URL = r'http?://m\.mlb\.com/video/topic/[0-9]+/v(?P<id>n?\d+)/.*$'
-    _TEST = {
-        'url': 'http://m.mlb.com/video/topic/81536970/v34496663/mianym-stanton-practices-for-the-home-run-derby',
-        'md5': u'd9c022c10d21f849f49c05ae12a8a7e9',
-        'info_dict': {
-            'id': '34496663',
-            'ext': 'mp4',
-            'format': 'mp4',
-            'description': "7/11/14: Giancarlo Stanton practices for the Home Run Derby prior to the game against the Mets",
-            'title': "Stanton prepares for Derby",
+class MLBIE(InfoExtractor):
+    _VALID_URL = r'http?://m\.mlb\.com/video/(?:topic/[\da-z_-]+/)?v(?P<id>n?\d+)'
+    _TESTS = [
+        {
+            'url': 'http://m.mlb.com/video/topic/81536970/v34496663/mianym-stanton-practices-for-the-home-run-derby',
+            'md5': 'd9c022c10d21f849f49c05ae12a8a7e9',
+            'info_dict': {
+                'id': '34496663',
+                'ext': 'mp4',
+                'title': 'Stanton prepares for Derby',
+                'description': 'md5:d00ce1e5fd9c9069e9c13ab4faedfa57',
+                'duration': 46,
+                'timestamp': 1405105800,
+                'upload_date': '20140711',
+                'thumbnail': 're:^https?://.*\.jpg$',
+            },
         },
-    }
+        {
+            'url': 'http://m.mlb.com/video/topic/vtp_hrd_sponsor/v34578115/hrd-cespedes-wins-2014-gillette-home-run-derby',
+            'md5': '0e6e73d509321e142409b695eadd541f',
+            'info_dict': {
+                'id': '34578115',
+                'ext': 'mp4',
+                'title': 'Cespedes repeats as Derby champ',
+                'description': 'md5:08df253ce265d4cf6fb09f581fafad07',
+                'duration': 488,
+                'timestamp': 1405399936,
+                'upload_date': '20140715',
+                'thumbnail': 're:^https?://.*\.jpg$',
+            },
+        },
+        {
+            'url': 'http://m.mlb.com/video/v34577915/bautista-on-derby-captaining-duties-his-performance',
+            'md5': 'b8fd237347b844365d74ea61d4245967',
+            'info_dict': {
+                'id': '34577915',
+                'ext': 'mp4',
+                'title': 'Bautista on Home Run Derby',
+                'description': 'md5:b80b34031143d0986dddc64a8839f0fb',
+                'duration': 52,
+                'timestamp': 1405390722,
+                'upload_date': '20140715',
+                'thumbnail': 're:^https?://.*\.jpg$',
+            },
+        },
+    ]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
 
-        webpage = self._download_webpage(url, video_id)
+        detail = self._download_xml(
+            'http://m.mlb.com/gen/multimedia/detail/%s/%s/%s/%s.xml'
+            % (video_id[-3], video_id[-2], video_id[-1], video_id), video_id)
+
+        title = detail.find('./headline').text
+        description = detail.find('./big-blurb').text
+        duration = parse_duration(detail.find('./duration').text)
+        timestamp = parse_iso8601(detail.attrib['date'][:-5])
+
+        thumbnail = find_xpath_attr(
+            detail, './thumbnailScenarios/thumbnailScenario', 'type', '45').text
 
-        title = self._og_search_title(webpage, default=video_id)
-        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)"/>', webpage, 'description', fatal=False)
-        thumbnail = self._html_search_regex(r'<meta itemprop="image" (?:content|value)="(.*?)" />', webpage, 'image', fatal=False)
+        formats = []
+        for media_url in detail.findall('./url'):
+            playback_scenario = media_url.attrib['playback_scenario']
+            fmt = {
+                'url': media_url.text,
+                'format_id': playback_scenario,
+            }
+            m = re.search(r'(?P<vbr>\d+)K_(?P<width>\d+)X(?P<height>\d+)', playback_scenario)
+            if m:
+                fmt.update({
+                    'vbr': int(m.group('vbr')) * 1000,
+                    'width': int(m.group('width')),
+                    'height': int(m.group('height')),
+                })
+            formats.append(fmt)
 
-        # use the video_id to find the Media detail XML
-        id_len = len(video_id)
-        _mediadetail_url = 'http://m.mlb.com/gen/multimedia/detail/'+video_id[id_len-3]+'/'+video_id[id_len-2]+'/'+video_id[id_len-1]+'/'+video_id+'.xml'
-        
-        mediadetails = self._download_xml(_mediadetail_url, video_id, "Downloading media detail...")
-        has1500K = 0
-        has1200K = 0
-        has600K = 0
-        # loop through the list of url's and only get the highest quality MP4 content
-        for element in mediadetails.findall('url'):
-            scenario = element.attrib['playback_scenario']
-            if scenario.startswith(u'FLASH'):
-                if scenario.startswith(u'FLASH_1800K'):
-                    video_url = element.text
-                    # 1800K is the current highest quality video on MLB.com
-                    break
-                else:
-                    if scenario.startswith(u'FLASH_1500K'):
-                        video_url = element.text
-                        has1500K = 1
-                    else:
-                        if (scenario.startswith(u'FLASH_1200K') and not has1500K):
-                            video_url = element.text
-                            has1200K = 1
-                        else:
-                            if (scenario.startswith(u'FLASH_600K') and not has1200K):
-                                video_url = element.text
-                                has600K = 1
-                            else:
-                                if (scenario.startswith(u'FLASH_300K') and not has600K):
-                                    video_url = element.text
+        self._sort_formats(formats)
 
         return {
             'id': video_id,
-            'url': video_url,
-            'extractor': 'mlb',
-            'webpage_url': url,
             'title': title,
-            'ext': 'mp4',
-            'format': 'mp4',
             'description': description,
+            'duration': duration,
+            'timestamp': timestamp,
+            'formats': formats,
             'thumbnail': thumbnail,
         }
```
