diff options
| author | Philipp Hagemeister <phihag@phihag.de> | 2014-08-21 11:57:52 +0200 | 
|---|---|---|
| committer | Philipp Hagemeister <phihag@phihag.de> | 2014-08-21 11:57:52 +0200 | 
| commit | 35f76e0061373ad344b3cbea30422c586abc16b5 (patch) | |
| tree | c74508b28e396c9c81d0764315ed9263afa1ab24 | |
| parent | 3f338cd6de1e198e810ca8e0c85a346c9537a47f (diff) | |
| parent | f83dda12ad37d1b83142e2821e72f8e6c0b4405e (diff) | |
Merge remote-tracking branch 'origin/master'
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 4 | ||||
| -rw-r--r-- | youtube_dl/extractor/dfb.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/generic.py | 7 | ||||
| -rw-r--r-- | youtube_dl/extractor/howstuffworks.py | 134 | ||||
| -rw-r--r-- | youtube_dl/extractor/jove.py | 80 | ||||
| -rw-r--r-- | youtube_dl/extractor/mitele.py | 60 | ||||
| -rw-r--r-- | youtube_dl/extractor/pbs.py | 53 | ||||
| -rw-r--r-- | youtube_dl/extractor/rtlnl.py | 52 | ||||
| -rw-r--r-- | youtube_dl/extractor/teamcoco.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/yahoo.py | 15 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 1 | 
11 files changed, 393 insertions, 17 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 381a5d999..be7616edc 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -130,6 +130,7 @@ from .helsinki import HelsinkiIE  from .hentaistigma import HentaiStigmaIE  from .hotnewhiphop import HotNewHipHopIE  from .howcast import HowcastIE +from .howstuffworks import HowStuffWorksIE  from .huffpost import HuffPostIE  from .hypem import HypemIE  from .iconosquare import IconosquareIE @@ -150,6 +151,7 @@ from .ivi import (  from .izlesene import IzleseneIE  from .jadorecettepub import JadoreCettePubIE  from .jeuxvideo import JeuxVideoIE +from .jove import JoveIE  from .jukebox import JukeboxIE  from .justintv import JustinTVIE  from .jpopsukitv import JpopsukiIE @@ -181,6 +183,7 @@ from .mdr import MDRIE  from .metacafe import MetacafeIE  from .metacritic import MetacriticIE  from .mit import TechTVMITIE, MITIE, OCWMITIE +from .mitele import MiTeleIE  from .mixcloud import MixcloudIE  from .mlb import MLBIE  from .mpora import MporaIE @@ -255,6 +258,7 @@ from .ro220 import Ro220IE  from .rottentomatoes import RottenTomatoesIE  from .roxwel import RoxwelIE  from .rtbf import RTBFIE +from .rtlnl import RtlXlIE  from .rtlnow import RTLnowIE  from .rts import RTSIE  from .rtve import RTVEALaCartaIE diff --git a/youtube_dl/extractor/dfb.py b/youtube_dl/extractor/dfb.py index cb8e06822..8049779b0 100644 --- a/youtube_dl/extractor/dfb.py +++ b/youtube_dl/extractor/dfb.py @@ -30,7 +30,7 @@ class DFBIE(InfoExtractor):              video_id)          video_info = player_info.find('video') -        f4m_info = self._download_xml(video_info.find('url').text, video_id) +        f4m_info = self._download_xml(self._proto_relative_url(video_info.find('url').text.strip()), video_id)          token_el = f4m_info.find('token')          manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0' diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index bcb076594..8e915735e 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -706,6 +706,13 @@ class GenericIE(InfoExtractor):              url = unescapeHTML(mobj.group('url'))              return self.url_result(url, ie='MTVServicesEmbedded') +        # Look for embedded yahoo player +        mobj = re.search( +            r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1', +            webpage) +        if mobj is not None: +            return self.url_result(mobj.group('url'), 'Yahoo') +          # Start with something easy: JW Player in SWFObject          found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)          if not found: diff --git a/youtube_dl/extractor/howstuffworks.py b/youtube_dl/extractor/howstuffworks.py new file mode 100644 index 000000000..68684b997 --- /dev/null +++ b/youtube_dl/extractor/howstuffworks.py @@ -0,0 +1,134 @@ +from __future__ import unicode_literals + +import re +import json +import random +import string + +from .common import InfoExtractor +from ..utils import find_xpath_attr + + +class HowStuffWorksIE(InfoExtractor): +    _VALID_URL = r'https?://[\da-z-]+\.howstuffworks\.com/(?:[^/]+/)*\d+-(?P<id>.+?)-video\.htm' +    _TESTS = [ +        { +            'url': 'http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm', +            'info_dict': { +                'id': '450221', +                'display_id': 'cool-jobs-iditarod-musher', +                'ext': 'flv', +                'title': 'Cool Jobs - Iditarod Musher', +                'description': 'md5:82bb58438a88027b8186a1fccb365f90', +                'thumbnail': 're:^https?://.*\.jpg$', +            }, +            'params': { +                # md5 is not consistent +                'skip_download': True +            } +        }, +        { +            'url': 'http://adventure.howstuffworks.com/39516-deadliest-catch-jakes-farewell-pots-video.htm', +            'info_dict': { +                'id': '553470', +                'display_id': 'deadliest-catch-jakes-farewell-pots', +                'ext': 'mp4', +                'title': 'Deadliest Catch: Jake\'s Farewell Pots', +                'description': 'md5:9632c346d5e43ee238028c9cefd8dbbc', +                'thumbnail': 're:^https?://.*\.jpg$', +            }, +            'params': { +                # md5 is not consistent +                'skip_download': True +            } +        }, +        { +            'url': 'http://entertainment.howstuffworks.com/arts/2706-sword-swallowing-1-by-dan-meyer-video.htm', +            'info_dict': { +                'id': '440011', +                'display_id': 'sword-swallowing-1-by-dan-meyer', +                'ext': 'flv', +                'title': 'Sword Swallowing #1 by Dan Meyer', +                'description': 'md5:b2409e88172913e2e7d3d1159b0ef735', +                'thumbnail': 're:^https?://.*\.jpg$', +            }, +            'params': { +                # md5 is not consistent +                'skip_download': True +            } +        }, +    ] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        display_id = mobj.group('id') +        webpage = self._download_webpage(url, display_id) + +        content_id = self._search_regex(r'var siteSectionId="(\d+)";', webpage, 'content id') + +        mp4 = self._search_regex( +            r'''(?xs)var\s+clip\s*=\s*{\s* +                .+?\s* +                content_id\s*:\s*%s\s*,\s* +                .+?\s* +                mp4\s*:\s*\[(.*?),?\]\s* +                };\s* +                videoData\.push\(clip\);''' % content_id, +            webpage, 'mp4', fatal=False, default=None) + +        smil = self._download_xml( +            'http://services.media.howstuffworks.com/videos/%s/smil-service.smil' % content_id, +            content_id, 'Downloading video SMIL') + +        http_base = find_xpath_attr( +            smil, +            './{0}head/{0}meta'.format('{http://www.w3.org/2001/SMIL20/Language}'), +            'name', +            'httpBase').get('content') + +        def random_string(str_len=0): +            return ''.join([random.choice(string.ascii_uppercase) for _ in range(str_len)]) + +        URL_SUFFIX = '?v=2.11.3&fp=LNX 11,2,202,356&r=%s&g=%s' % (random_string(5), random_string(12)) + +        formats = [] + +        if mp4: +            for video in json.loads('[%s]' % mp4): +                bitrate = video['bitrate'] +                fmt = { +                    'url': video['src'].replace('http://pmd.video.howstuffworks.com', http_base) + URL_SUFFIX, +                    'format_id': bitrate, +                } +                m = re.search(r'(?P<vbr>\d+)[Kk]', bitrate) +                if m: +                    fmt['vbr'] = int(m.group('vbr')) +                formats.append(fmt) +        else: +            for video in smil.findall( +                    './/{0}body/{0}switch/{0}video'.format('{http://www.w3.org/2001/SMIL20/Language}')): +                vbr = int(video.attrib['system-bitrate']) / 1000 +                formats.append({ +                    'url': '%s/%s%s' % (http_base, video.attrib['src'], URL_SUFFIX), +                    'format_id': '%dk' % vbr, +                    'vbr': vbr, +                }) + +        self._sort_formats(formats) + +        title = self._og_search_title(webpage) +        TITLE_SUFFIX = ' : HowStuffWorks' +        if title.endswith(TITLE_SUFFIX): +            title = title[:-len(TITLE_SUFFIX)] + +        description = self._og_search_description(webpage) +        thumbnail = self._og_search_thumbnail(webpage) + +        return { +            'id': content_id, +            'display_id': display_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/jove.py b/youtube_dl/extractor/jove.py new file mode 100644 index 000000000..cf73cd753 --- /dev/null +++ b/youtube_dl/extractor/jove.py @@ -0,0 +1,80 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    unified_strdate +) + + +class JoveIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?jove\.com/video/(?P<id>[0-9]+)' +    _CHAPTERS_URL = 'http://www.jove.com/video-chapters?videoid={video_id:}' +    _TESTS = [ +        { +            'url': 'http://www.jove.com/video/2744/electrode-positioning-montage-transcranial-direct-current', +            'md5': '93723888d82dbd6ba8b3d7d0cd65dd2b', +            'info_dict': { +                'id': '2744', +                'ext': 'mp4', +                'title': 'Electrode Positioning and Montage in Transcranial Direct Current Stimulation', +                'description': 'md5:015dd4509649c0908bc27f049e0262c6', +                'thumbnail': 're:^https?://.*\.png$', +                'upload_date': '20110523', +            } +        }, +        { +            'url': 'http://www.jove.com/video/51796/culturing-caenorhabditis-elegans-axenic-liquid-media-creation', +            'md5': '914aeb356f416811d911996434811beb', +            'info_dict': { +                'id': '51796', +                'ext': 'mp4', +                'title': 'Culturing Caenorhabditis elegans in Axenic Liquid Media and Creation of Transgenic Worms by Microparticle Bombardment', +                'description': 'md5:35ff029261900583970c4023b70f1dc9', +                'thumbnail': 're:^https?://.*\.png$', +                'upload_date': '20140802', +            } +        }, + +    ] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) + +        chapters_id = self._html_search_regex( +            r'/video-chapters\?videoid=([0-9]+)', webpage, 'chapters id') + +        chapters_xml = self._download_xml( +            self._CHAPTERS_URL.format(video_id=chapters_id), +            video_id, note='Downloading chapters XML', +            errnote='Failed to download chapters XML') + +        video_url = chapters_xml.attrib.get('video') +        if not video_url: +            raise ExtractorError('Failed to get the video URL') + +        title = self._html_search_meta('citation_title', webpage, 'title') +        thumbnail = self._og_search_thumbnail(webpage) +        description = self._html_search_regex( +            r'<div id="section_body_summary"><p class="jove_content">(.+?)</p>', +            webpage, 'description', fatal=False) +        publish_date = unified_strdate(self._html_search_meta( +            'citation_publication_date', webpage, 'publish date', fatal=False)) +        comment_count = self._html_search_regex( +            r'<meta name="num_comments" content="(\d+) Comments?"', +            webpage, 'comment count', fatal=False) + +        return { +            'id': video_id, +            'title': title, +            'url': video_url, +            'thumbnail': thumbnail, +            'description': description, +            'upload_date': publish_date, +            'comment_count': comment_count, +        } diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py new file mode 100644 index 000000000..979f3d692 --- /dev/null +++ b/youtube_dl/extractor/mitele.py @@ -0,0 +1,60 @@ +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..utils import ( +    compat_urllib_parse, +    get_element_by_attribute, +    parse_duration, +    strip_jsonp, +) + + +class MiTeleIE(InfoExtractor): +    IE_NAME = 'mitele.es' +    _VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<episode>[^/]+)/' + +    _TEST = { +        'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', +        'md5': '6a75fe9d0d3275bead0cb683c616fddb', +        'info_dict': { +            'id': '0fce117d', +            'ext': 'mp4', +            'title': 'Programa 144 - Tor, la web invisible', +            'description': 'md5:3b6fce7eaa41b2d97358726378d9369f', +            'display_id': 'programa-144', +            'duration': 2913, +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        episode = mobj.group('episode') +        webpage = self._download_webpage(url, episode) +        embed_data_json = self._search_regex( +            r'MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data', +            flags=re.DOTALL +        ).replace('\'', '"') +        embed_data = json.loads(embed_data_json) + +        info_url = embed_data['flashvars']['host'] +        info_el = self._download_xml(info_url, episode).find('./video/info') + +        video_link = info_el.find('videoUrl/link').text +        token_query = compat_urllib_parse.urlencode({'id': video_link}) +        token_info = self._download_json( +            'http://token.mitele.es/?' + token_query, episode, +            transform_source=strip_jsonp +        ) + +        return { +            'id': embed_data['videoId'], +            'display_id': episode, +            'title': info_el.find('title').text, +            'url': token_info['tokenizedUrl'], +            'description': get_element_by_attribute('class', 'text', webpage), +            'thumbnail': info_el.find('thumb').text, +            'duration': parse_duration(info_el.find('duration').text), +        } diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index ec95d0704..dee4af6f1 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -20,17 +20,41 @@ class PBSIE(InfoExtractor):          )      ''' -    _TEST = { -        'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/', -        'md5': 'ce1888486f0908d555a8093cac9a7362', -        'info_dict': { -            'id': '2365006249', -            'ext': 'mp4', -            'title': 'A More Perfect Union', -            'description': 'md5:ba0c207295339c8d6eced00b7c363c6a', -            'duration': 3190, +    _TESTS = [ +        { +            'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/', +            'md5': 'ce1888486f0908d555a8093cac9a7362', +            'info_dict': { +                'id': '2365006249', +                'ext': 'mp4', +                'title': 'A More Perfect Union', +                'description': 'md5:ba0c207295339c8d6eced00b7c363c6a', +                'duration': 3190, +            }, +        }, +        { +            'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/', +            'md5': '143c98aa54a346738a3d78f54c925321', +            'info_dict': { +                'id': '2365297690', +                'ext': 'mp4', +                'title': 'Losing Iraq', +                'description': 'md5:f5bfbefadf421e8bb8647602011caf8e', +                'duration': 5050, +            },          }, -    } +        { +            'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/', +            'md5': 'b19856d7f5351b17a5ab1dc6a64be633', +            'info_dict': { +                'id': '2201174722', +                'ext': 'mp4', +                'title': 'Cyber Schools Gain Popularity, but Quality Questions Persist', +                'description': 'md5:5871c15cba347c1b3d28ac47a73c7c28', +                'duration': 801, +            }, +        }, +    ]      def _extract_ids(self, url):          mobj = re.match(self._VALID_URL, url) @@ -40,10 +64,13 @@ class PBSIE(InfoExtractor):          if presumptive_id:              webpage = self._download_webpage(url, display_id) -            # frontline video embed +            MEDIA_ID_REGEXES = [ +                r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'",  # frontline video embed +                r'class="coveplayerid">([^<]+)<',                       # coveplayer +            ] +              media_id = self._search_regex( -                r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", -                webpage, 'frontline video ID', fatal=False, default=None) +                MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None)              if media_id:                  return media_id, presumptive_id diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py new file mode 100644 index 000000000..14928cd62 --- /dev/null +++ b/youtube_dl/extractor/rtlnl.py @@ -0,0 +1,52 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class RtlXlIE(InfoExtractor): +    IE_NAME = 'rtlxl.nl' +    _VALID_URL = r'https?://www\.rtlxl\.nl/#!/[^/]+/(?P<uuid>[^/?]+)' + +    _TEST = { +        'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677', +        'info_dict': { +            'id': '6e4203a6-0a5e-3596-8424-c599a59e0677', +            'ext': 'flv', +            'title': 'RTL Nieuws - Laat', +            'description': 'Dagelijks het laatste nieuws uit binnen- en ' +                'buitenland. Voor nog meer nieuws kunt u ook gebruikmaken van ' +                'onze mobiele apps.', +            'timestamp': 1408051800, +            'upload_date': '20140814', +        }, +        'params': { +            # We download the first bytes of the first fragment, it can't be +            # processed by the f4m downloader beacuse it isn't complete +            'skip_download': True, +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        uuid = mobj.group('uuid') + +        info = self._download_json( +            'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=flash/' % uuid, +            uuid) +        meta = info['meta'] +        material = info['material'][0] +        episode_info = info['episodes'][0] + +        f4m_url = 'http://manifest.us.rtl.nl' + material['videopath'] +        progname = info['abstracts'][0]['name'] +        subtitle = material['title'] or info['episodes'][0]['name'] + +        return { +            'id': uuid, +            'title': '%s - %s' % (progname, subtitle),  +            'formats': self._extract_f4m_formats(f4m_url, uuid), +            'timestamp': material['original_date'], +            'description': episode_info['synopsis'], +        } diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index f8dd7e955..fa796ce72 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -37,7 +37,7 @@ class TeamcocoIE(InfoExtractor):          video_id = mobj.group("video_id")          if not video_id:              video_id = self._html_search_regex( -                r'<article class="video" data-id="(\d+?)"', +                r'data-node-id="(\d+?)"',                  webpage, 'video id')          data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index d84be2562..0e3b33b16 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -15,7 +15,7 @@ from ..utils import (  class YahooIE(InfoExtractor):      IE_DESC = 'Yahoo screen and movies' -    _VALID_URL = r'https?://(?:screen|movies)\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html' +    _VALID_URL = r'(?P<url>https?://(?:screen|movies)\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)'      _TESTS = [          {              'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', @@ -46,12 +46,23 @@ class YahooIE(InfoExtractor):                  'title': 'The World Loves Spider-Man',                  'description': '''People all over the world are celebrating the release of \"The Amazing Spider-Man 2.\" We're taking a look at the enthusiastic response Spider-Man has received from viewers all over the world.''',              } -        } +        }, +        { +            'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed', +            'md5': '60e8ac193d8fb71997caa8fce54c6460', +            'info_dict': { +                'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb', +                'ext': 'mp4', +                'title': "Yahoo Saves 'Community'", +                'description': 'md5:4d4145af2fd3de00cbb6c1d664105053', +            } +        },      ]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id') +        url = mobj.group('url')          webpage = self._download_webpage(url, video_id)          items_json = self._search_regex( diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 65b492fb3..1081a9368 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -827,6 +827,7 @@ def unified_strdate(date_str):          '%b %dnd %Y %I:%M%p',          '%b %dth %Y %I:%M%p',          '%Y-%m-%d', +        '%Y/%m/%d',          '%d.%m.%Y',          '%d/%m/%Y',          '%Y/%m/%d %H:%M:%S',  | 
