diff options
Diffstat (limited to 'youtube_dl/extractor')
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 3 | ||||
| -rw-r--r-- | youtube_dl/extractor/dailymotion.py | 8 | ||||
| -rw-r--r-- | youtube_dl/extractor/dumpert.py | 56 | ||||
| -rw-r--r-- | youtube_dl/extractor/generic.py | 16 | ||||
| -rw-r--r-- | youtube_dl/extractor/nbc.py | 51 | ||||
| -rw-r--r-- | youtube_dl/extractor/phoenix.py | 40 | ||||
| -rw-r--r-- | youtube_dl/extractor/pornhub.py | 17 | ||||
| -rw-r--r-- | youtube_dl/extractor/soundcloud.py | 4 | ||||
| -rw-r--r-- | youtube_dl/extractor/theplatform.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/xuite.py | 14 | ||||
| -rw-r--r-- | youtube_dl/extractor/yahoo.py | 15 | 
11 files changed, 195 insertions, 31 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a65c0c25b..9fddb8e32 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -115,6 +115,7 @@ from .drtuber import DrTuberIE  from .drtv import DRTVIE  from .dvtv import DVTVIE  from .dump import DumpIE +from .dumpert import DumpertIE  from .defense import DefenseGouvFrIE  from .discovery import DiscoveryIE  from .divxstage import DivxStageIE @@ -310,6 +311,8 @@ from .nba import NBAIE  from .nbc import (      NBCIE,      NBCNewsIE, +    NBCSportsIE, +    NBCSportsVPlayerIE,  )  from .ndr import NDRIE  from .ndtv import NDTVIE diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 4f67c3aac..47d58330b 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -25,8 +25,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor):      def _build_request(url):          """Build a request with the family filter disabled"""          request = compat_urllib_request.Request(url) -        request.add_header('Cookie', 'family_filter=off') -        request.add_header('Cookie', 'ff=off') +        request.add_header('Cookie', 'family_filter=off; ff=off')          return request @@ -112,8 +111,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor):              video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)          embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id -        embed_page = self._download_webpage(embed_url, video_id, -                                            'Downloading embed page') +        embed_request = self._build_request(embed_url) +        embed_page = self._download_webpage( +            embed_request, video_id, 'Downloading embed page')          info = self._search_regex(r'var info = ({.*?}),$', embed_page,                                    'video info', flags=re.MULTILINE)          info = json.loads(info) diff --git a/youtube_dl/extractor/dumpert.py b/youtube_dl/extractor/dumpert.py new file mode 100644 index 000000000..e43bc81b2 --- /dev/null +++ b/youtube_dl/extractor/dumpert.py @@ -0,0 +1,56 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 + +from .common import InfoExtractor +from ..utils import qualities + + +class DumpertIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?dumpert\.nl/mediabase/(?P<id>[0-9]+/[0-9a-zA-Z]+)' +    _TEST = { +        'url': 'http://www.dumpert.nl/mediabase/6646981/951bc60f/', +        'md5': '1b9318d7d5054e7dcb9dc7654f21d643', +        'info_dict': { +            'id': '6646981/951bc60f', +            'ext': 'mp4', +            'title': 'Ik heb nieuws voor je', +            'description': 'Niet schrikken hoor', +            'thumbnail': 're:^https?://.*\.jpg$', +        } +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) + +        files_base64 = self._search_regex( +            r'data-files="([^"]+)"', webpage, 'data files') + +        files = self._parse_json( +            base64.b64decode(files_base64.encode('utf-8')).decode('utf-8'), +            video_id) + +        quality = qualities(['flv', 'mobile', 'tablet', '720p']) + +        formats = [{ +            'url': video_url, +            'format_id': format_id, +            'quality': quality(format_id), +        } for format_id, video_url in files.items() if format_id != 'still'] +        self._sort_formats(formats) + +        title = self._html_search_meta( +            'title', webpage) or self._og_search_title(webpage) +        description = self._html_search_meta( +            'description', webpage) or self._og_search_description(webpage) +        thumbnail = files.get('still') or self._og_search_thumbnail(webpage) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'formats': formats +        } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 042d23a13..2ff002643 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -29,6 +29,7 @@ from ..utils import (      xpath_text,  )  from .brightcove import BrightcoveIE +from .nbc import NBCSportsVPlayerIE  from .ooyala import OoyalaIE  from .rutv import RUTVIE  from .smotri import SmotriIE @@ -639,6 +640,16 @@ class GenericIE(InfoExtractor):                  'upload_date': '20150228',                  'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',              } +        }, +        # NBC Sports vplayer embed +        { +            'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a', +            'info_dict': { +                'id': 'ln7x1qSThw4k', +                'ext': 'flv', +                'title': "PFT Live: New leader in the 'new-look' defense", +                'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e', +            },          }      ] @@ -1252,6 +1263,11 @@ class GenericIE(InfoExtractor):          if mobj is not None:              return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin') +        # Look for NBC Sports VPlayer embeds +        nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) +        if nbc_sports_url: +            return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') +          def check_video(vurl):              if YoutubeIE.suitable(vurl):                  return True diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 80a01c778..ecd0ac8b1 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -50,6 +50,57 @@ class NBCIE(InfoExtractor):          return self.url_result(theplatform_url) +class NBCSportsVPlayerIE(InfoExtractor): +    _VALID_URL = r'https?://vplayer\.nbcsports\.com/(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)' + +    _TESTS = [{ +        'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_share/select/9CsDKds0kvHI', +        'info_dict': { +            'id': '9CsDKds0kvHI', +            'ext': 'flv', +            'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', +            'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', +        } +    }, { +        'url': 'http://vplayer.nbcsports.com/p/BxmELC/nbc_embedshare/select/_hqLjQ95yx8Z', +        'only_matching': True, +    }] + +    @staticmethod +    def _extract_url(webpage): +        iframe_m = re.search( +            r'<iframe[^>]+src="(?P<url>https?://vplayer\.nbcsports\.com/[^"]+)"', webpage) +        if iframe_m: +            return iframe_m.group('url') + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) +        theplatform_url = self._og_search_video_url(webpage) +        return self.url_result(theplatform_url, 'ThePlatform') + + +class NBCSportsIE(InfoExtractor): +    # Does not include https becuase its certificate is invalid +    _VALID_URL = r'http://www\.nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)' + +    _TEST = { +        'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke', +        'info_dict': { +            'id': 'PHJSaFWbrTY9', +            'ext': 'flv', +            'title': 'Tom Izzo, Michigan St. has \'so much respect\' for Duke', +            'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113', +        } +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) +        return self.url_result( +            NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer') + +  class NBCNewsIE(InfoExtractor):      _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/          (?:video/.+?/(?P<id>\d+)| diff --git a/youtube_dl/extractor/phoenix.py b/youtube_dl/extractor/phoenix.py index a20672c0c..46cebc0d7 100644 --- a/youtube_dl/extractor/phoenix.py +++ b/youtube_dl/extractor/phoenix.py @@ -5,19 +5,33 @@ from .zdf import extract_from_xml_url  class PhoenixIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?phoenix\.de/content/(?P<id>[0-9]+)' -    _TEST = { -        'url': 'http://www.phoenix.de/content/884301', -        'md5': 'ed249f045256150c92e72dbb70eadec6', -        'info_dict': { -            'id': '884301', -            'ext': 'mp4', -            'title': 'Michael Krons mit Hans-Werner Sinn', -            'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr', -            'upload_date': '20141025', -            'uploader': 'Im Dialog', -        } -    } +    _VALID_URL = r'''(?x)https?://(?:www\.)?phoenix\.de/content/ +        (?: +            phoenix/die_sendungen/(?:[^/]+/)? +        )? +        (?P<id>[0-9]+)''' +    _TESTS = [ +        { +            'url': 'http://www.phoenix.de/content/884301', +            'md5': 'ed249f045256150c92e72dbb70eadec6', +            'info_dict': { +                'id': '884301', +                'ext': 'mp4', +                'title': 'Michael Krons mit Hans-Werner Sinn', +                'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr', +                'upload_date': '20141025', +                'uploader': 'Im Dialog', +            } +        }, +        { +            'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/869815', +            'only_matching': True, +        }, +        { +            'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/diskussionen/928234', +            'only_matching': True, +        }, +    ]      def _real_extract(self, url):          video_id = self._match_id(url) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 3a27e3789..0c8b731cf 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -33,10 +33,8 @@ class PornHubIE(InfoExtractor):      }      def _extract_count(self, pattern, webpage, name): -        count = self._html_search_regex(pattern, webpage, '%s count' % name, fatal=False) -        if count: -            count = str_to_int(count) -        return count +        return str_to_int(self._search_regex( +            pattern, webpage, '%s count' % name, fatal=False))      def _real_extract(self, url):          video_id = self._match_id(url) @@ -62,11 +60,14 @@ class PornHubIE(InfoExtractor):          if thumbnail:              thumbnail = compat_urllib_parse.unquote(thumbnail) -        view_count = self._extract_count(r'<span class="count">([\d,\.]+)</span> views', webpage, 'view') -        like_count = self._extract_count(r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like') -        dislike_count = self._extract_count(r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike') +        view_count = self._extract_count( +            r'<span class="count">([\d,\.]+)</span> views', webpage, 'view') +        like_count = self._extract_count( +            r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like') +        dislike_count = self._extract_count( +            r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')          comment_count = self._extract_count( -            r'All comments \(<var class="videoCommentCount">([\d,\.]+)</var>', webpage, 'comment') +            r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')          video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))          if webpage.find('"encrypted":true') != -1: diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 9d4505972..316b2c90f 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -242,7 +242,7 @@ class SoundcloudIE(InfoExtractor):  class SoundcloudSetIE(SoundcloudIE): -    _VALID_URL = r'https?://(?:www\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?' +    _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?'      IE_NAME = 'soundcloud:set'      _TESTS = [{          'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep', @@ -287,7 +287,7 @@ class SoundcloudSetIE(SoundcloudIE):  class SoundcloudUserIE(SoundcloudIE): -    _VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$' +    _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$'      IE_NAME = 'soundcloud:user'      _TESTS = [{          'url': 'https://soundcloud.com/the-concept-band', diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index feac666f7..0e3e627f4 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -92,7 +92,7 @@ class ThePlatformIE(InfoExtractor):              error_msg = next(                  n.attrib['abstract']                  for n in meta.findall(_x('.//smil:ref')) -                if n.attrib.get('title') == 'Geographic Restriction') +                if n.attrib.get('title') == 'Geographic Restriction' or n.attrib.get('title') == 'Expired')          except StopIteration:              pass          else: diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py index 4971965f9..81d885fdc 100644 --- a/youtube_dl/extractor/xuite.py +++ b/youtube_dl/extractor/xuite.py @@ -69,18 +69,26 @@ class XuiteIE(InfoExtractor):          'only_matching': True,      }] +    @staticmethod +    def base64_decode_utf8(data): +        return base64.b64decode(data.encode('utf-8')).decode('utf-8') + +    @staticmethod +    def base64_encode_utf8(data): +        return base64.b64encode(data.encode('utf-8')).decode('utf-8') +      def _extract_flv_config(self, media_id): -        base64_media_id = base64.b64encode(media_id.encode('utf-8')).decode('utf-8') +        base64_media_id = self.base64_encode_utf8(media_id)          flv_config = self._download_xml(              'http://vlog.xuite.net/flash/player?media=%s' % base64_media_id,              'flv config')          prop_dict = {}          for prop in flv_config.findall('./property'): -            prop_id = base64.b64decode(prop.attrib['id']).decode('utf-8') +            prop_id = self.base64_decode_utf8(prop.attrib['id'])              # CDATA may be empty in flv config              if not prop.text:                  continue -            encoded_content = base64.b64decode(prop.text).decode('utf-8') +            encoded_content = self.base64_decode_utf8(prop.text)              prop_dict[prop_id] = compat_urllib_parse_unquote(encoded_content)          return prop_dict diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 97dbac4cc..b777159c5 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -17,6 +17,8 @@ from ..utils import (      int_or_none,  ) +from .nbc import NBCSportsVPlayerIE +  class YahooIE(InfoExtractor):      IE_DESC = 'Yahoo screen and movies' @@ -129,6 +131,15 @@ class YahooIE(InfoExtractor):          }, {              'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html',              'only_matching': True, +        }, { +            'note': 'NBC Sports embeds', +            'url': 'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313', +            'info_dict': { +                'id': '9CsDKds0kvHI', +                'ext': 'flv', +                'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', +                'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', +            }          }      ] @@ -151,6 +162,10 @@ class YahooIE(InfoExtractor):                  items = json.loads(items_json)                  video_id = items[0]['id']                  return self._get_info(video_id, display_id, webpage) +        # Look for NBCSports iframes +        nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) +        if nbc_sports_url: +            return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')          items_json = self._search_regex(              r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE,  | 
