diff options
| author | Philipp Hagemeister <phihag@phihag.de> | 2014-12-12 02:57:36 +0100 | 
|---|---|---|
| committer | Philipp Hagemeister <phihag@phihag.de> | 2014-12-12 02:57:36 +0100 | 
| commit | 42bdd9d0516be5b71c89c9cccde16a880a14b0b1 (patch) | |
| tree | 5bb4344cfe824b6de71c7e03893c9bd491554747 | |
| parent | 4e40de6e2a62b56044a98828bc2df9b93e3dc665 (diff) | |
[cinchcast] Add new extractor (Fixes #4428)
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | test/test_utils.py | 3 |
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 1 |
| -rw-r--r-- | youtube_dl/extractor/cinchcast.py | 53 |
| -rw-r--r-- | youtube_dl/extractor/generic.py | 20 |
| -rw-r--r-- | youtube_dl/utils.py | 17 |
5 files changed, 88 insertions, 6 deletions
| diff --git a/test/test_utils.py b/test/test_utils.py index aaa293ff8..d42df6d96 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -144,6 +144,9 @@ class TestUtil(unittest.TestCase):          self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011')          self.assertEqual(unified_strdate('1968-12-10'), '19681210')          self.assertEqual(unified_strdate('28/01/2014 21:00:00 +0100'), '20140128') +        self.assertEqual( +            unified_strdate('11/26/2014 11:30:00 AM PST', day_first=False), +            '20141126')      def test_find_xpath_attr(self):          testxml = '''<root> diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 982a134bf..746ee69e4 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -51,6 +51,7 @@ from .cbsnews import CBSNewsIE  from .ceskatelevize import CeskaTelevizeIE  from .channel9 import Channel9IE  from .chilloutzone import ChilloutzoneIE +from .cinchcast import CinchcastIE  from .clipfish import ClipfishIE  from .cliphunter import CliphunterIE  from .clipsyndicate import ClipsyndicateIE diff --git a/youtube_dl/extractor/cinchcast.py b/youtube_dl/extractor/cinchcast.py new file mode 100644 index 000000000..8ce8b3128 --- /dev/null +++ b/youtube_dl/extractor/cinchcast.py @@ -0,0 +1,53 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    int_or_none, +    unified_strdate, +    xpath_text, +) + + +class CinchcastIE(InfoExtractor): +    _VALID_URL = r'https?://player\.cinchcast\.com/.*?assetId=(?P<id>[0-9]+)' +    _TEST = { +        # Actual test is run in generic, look for undergroundwellness +        'url': 'http://player.cinchcast.com/?platformId=1&assetType=single&assetId=7141703', +        'only_matching': True, +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        doc = self._download_xml( +            
'http://www.blogtalkradio.com/playerasset/mrss?assetType=single&assetId=%s' % video_id, +            video_id) + +        item = doc.find('.//item') +        title = xpath_text(item, './title', fatal=True) +        date_str = xpath_text( +            item, './{http://developer.longtailvideo.com/trac/}date') +        upload_date = unified_strdate(date_str, day_first=False) +        # duration is present but wrong +        formats = [] +        formats.append({ +            'format_id': 'main', +            'url': item.find( +                './{http://search.yahoo.com/mrss/}content').attrib['url'], +        }) +        backup_url = xpath_text( +            item, './{http://developer.longtailvideo.com/trac/}backupContent') +        if backup_url: +            formats.append({ +                'preference': 2,  # seems to be more reliable +                'format_id': 'backup', +                'url': backup_url, +            }) +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': title, +            'upload_date': upload_date, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 328301de3..2b4d8c62f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -467,8 +467,17 @@ class GenericIE(InfoExtractor):              'expected_warnings': [                  'URL could be a direct video link, returning it as such.'              
] -        } - +        }, +        # Cinchcast embed +        { +            'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/', +            'info_dict': { +                'id': '7141703', +                'ext': 'mp3', +                'upload_date': '20141126', +                'title': 'Jack Tips: 5 Steps to Permanent Gut Healing', +            } +        },      ]      def report_following_redirect(self, new_url): @@ -962,6 +971,13 @@ class GenericIE(InfoExtractor):          if mobj is not None:              return self.url_result(mobj.group('url'), 'SBS') +        # Look for embedded Cinchcast player +        mobj = re.search( +            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1', +            webpage) +        if mobj is not None: +            return self.url_result(mobj.group('url'), 'Cinchcast') +          mobj = re.search(              r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',              webpage) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 75f9594e6..4b0567c93 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -166,7 +166,7 @@ def xpath_text(node, xpath, name=None, fatal=False):          xpath = xpath.encode('ascii')      n = node.find(xpath) -    if n is None: +    if n is None or n.text is None:          if fatal:              name = xpath if name is None else name              raise ExtractorError('Could not find XML element %s' % name) @@ -644,17 +644,19 @@ def parse_iso8601(date_str, delimiter='T'):      return calendar.timegm(dt.timetuple()) -def unified_strdate(date_str): +def unified_strdate(date_str, day_first=True):      """Return a string with the date in the format YYYYMMDD"""      if date_str is None:          return None -      upload_date = None      # Replace commas      date_str = date_str.replace(',', ' ')      # %z (UTC offset) is only supported in python>=3.2      date_str = 
re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str) +    # Remove AM/PM + timezone +    date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str) +      format_expressions = [          '%d %B %Y',          '%d %b %Y', @@ -669,7 +671,6 @@ def unified_strdate(date_str):          '%d/%m/%Y',          '%d/%m/%y',          '%Y/%m/%d %H:%M:%S', -        '%d/%m/%Y %H:%M:%S',          '%Y-%m-%d %H:%M:%S',          '%Y-%m-%d %H:%M:%S.%f',          '%d.%m.%Y %H:%M', @@ -681,6 +682,14 @@ def unified_strdate(date_str):          '%Y-%m-%dT%H:%M:%S.%f',          '%Y-%m-%dT%H:%M',      ] +    if day_first: +        format_expressions.extend([ +            '%d/%m/%Y %H:%M:%S', +        ]) +    else: +        format_expressions.extend([ +            '%m/%d/%Y %H:%M:%S', +        ])      for expression in format_expressions:          try:              upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') | 
