diff options
Diffstat (limited to 'youtube_dl')
-rw-r--r-- | youtube_dl/extractor/__init__.py | 3 | ||||
-rw-r--r-- | youtube_dl/extractor/dailymotion.py | 8 | ||||
-rw-r--r-- | youtube_dl/extractor/dumpert.py | 56 | ||||
-rw-r--r-- | youtube_dl/extractor/generic.py | 16 | ||||
-rw-r--r-- | youtube_dl/extractor/nbc.py | 51 | ||||
-rw-r--r-- | youtube_dl/extractor/phoenix.py | 40 | ||||
-rw-r--r-- | youtube_dl/extractor/pornhub.py | 17 | ||||
-rw-r--r-- | youtube_dl/extractor/soundcloud.py | 4 | ||||
-rw-r--r-- | youtube_dl/extractor/theplatform.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/xuite.py | 14 | ||||
-rw-r--r-- | youtube_dl/extractor/yahoo.py | 15 |
11 files changed, 195 insertions, 31 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a65c0c25b..9fddb8e32 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -115,6 +115,7 @@ from .drtuber import DrTuberIE from .drtv import DRTVIE from .dvtv import DVTVIE from .dump import DumpIE +from .dumpert import DumpertIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE from .divxstage import DivxStageIE @@ -310,6 +311,8 @@ from .nba import NBAIE from .nbc import ( NBCIE, NBCNewsIE, + NBCSportsIE, + NBCSportsVPlayerIE, ) from .ndr import NDRIE from .ndtv import NDTVIE diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 4f67c3aac..47d58330b 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -25,8 +25,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor): def _build_request(url): """Build a request with the family filter disabled""" request = compat_urllib_request.Request(url) - request.add_header('Cookie', 'family_filter=off') - request.add_header('Cookie', 'ff=off') + request.add_header('Cookie', 'family_filter=off; ff=off') return request @@ -112,8 +111,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor): video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1) embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id - embed_page = self._download_webpage(embed_url, video_id, - 'Downloading embed page') + embed_request = self._build_request(embed_url) + embed_page = self._download_webpage( + embed_request, video_id, 'Downloading embed page') info = self._search_regex(r'var info = ({.*?}),$', embed_page, 'video info', flags=re.MULTILINE) info = json.loads(info) diff --git a/youtube_dl/extractor/dumpert.py b/youtube_dl/extractor/dumpert.py new file mode 100644 index 000000000..e43bc81b2 --- /dev/null +++ b/youtube_dl/extractor/dumpert.py @@ -0,0 +1,56 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 + +from .common import InfoExtractor +from ..utils import qualities + + +class DumpertIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?dumpert\.nl/mediabase/(?P<id>[0-9]+/[0-9a-zA-Z]+)' + _TEST = { + 'url': 'http://www.dumpert.nl/mediabase/6646981/951bc60f/', + 'md5': '1b9318d7d5054e7dcb9dc7654f21d643', + 'info_dict': { + 'id': '6646981/951bc60f', + 'ext': 'mp4', + 'title': 'Ik heb nieuws voor je', + 'description': 'Niet schrikken hoor', + 'thumbnail': 're:^https?://.*\.jpg$', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + files_base64 = self._search_regex( + r'data-files="([^"]+)"', webpage, 'data files') + + files = self._parse_json( + base64.b64decode(files_base64.encode('utf-8')).decode('utf-8'), + video_id) + + quality = qualities(['flv', 'mobile', 'tablet', '720p']) + + formats = [{ + 'url': video_url, + 'format_id': format_id, + 'quality': quality(format_id), + } for format_id, video_url in files.items() if format_id != 'still'] + self._sort_formats(formats) + + title = self._html_search_meta( + 'title', webpage) or self._og_search_title(webpage) + description = self._html_search_meta( + 'description', webpage) or self._og_search_description(webpage) + thumbnail = files.get('still') or self._og_search_thumbnail(webpage) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'formats': formats + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 042d23a13..2ff002643 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -29,6 +29,7 @@ from ..utils import ( xpath_text, ) from .brightcove import BrightcoveIE +from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE from .smotri import SmotriIE @@ -639,6 +640,16 @@ class GenericIE(InfoExtractor): 'upload_date': '20150228', 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', } + }, + # NBC Sports vplayer embed + { + 'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a', + 'info_dict': { + 'id': 'ln7x1qSThw4k', + 'ext': 'flv', + 'title': "PFT Live: New leader in the 'new-look' defense", + 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e', + }, } ] @@ -1252,6 +1263,11 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin') + # Look for NBC Sports VPlayer embeds + nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) + if nbc_sports_url: + return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') + def check_video(vurl): if YoutubeIE.suitable(vurl): return True diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 80a01c778..ecd0ac8b1 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -50,6 +50,57 @@ class NBCIE(InfoExtractor): return self.url_result(theplatform_url) +class NBCSportsVPlayerIE(InfoExtractor): + _VALID_URL = r'https?://vplayer\.nbcsports\.com/(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)' + + _TESTS = [{ + 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_share/select/9CsDKds0kvHI', + 'info_dict': { + 'id': '9CsDKds0kvHI', + 'ext': 'flv', + 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', + 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', + } + }, { + 'url': 'http://vplayer.nbcsports.com/p/BxmELC/nbc_embedshare/select/_hqLjQ95yx8Z', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + iframe_m = re.search( + r'<iframe[^>]+src="(?P<url>https?://vplayer\.nbcsports\.com/[^"]+)"', webpage) + if iframe_m: + return iframe_m.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + theplatform_url = self._og_search_video_url(webpage) + return self.url_result(theplatform_url, 'ThePlatform') + + +class NBCSportsIE(InfoExtractor): + # Does not include https becuase its certificate is invalid + _VALID_URL = r'http://www\.nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)' + + _TEST = { + 'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke', + 'info_dict': { + 'id': 'PHJSaFWbrTY9', + 'ext': 'flv', + 'title': 'Tom Izzo, Michigan St. has \'so much respect\' for Duke', + 'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + return self.url_result( + NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer') + + class NBCNewsIE(InfoExtractor): _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/ (?:video/.+?/(?P<id>\d+)| diff --git a/youtube_dl/extractor/phoenix.py b/youtube_dl/extractor/phoenix.py index a20672c0c..46cebc0d7 100644 --- a/youtube_dl/extractor/phoenix.py +++ b/youtube_dl/extractor/phoenix.py @@ -5,19 +5,33 @@ from .zdf import extract_from_xml_url class PhoenixIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?phoenix\.de/content/(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://www.phoenix.de/content/884301', - 'md5': 'ed249f045256150c92e72dbb70eadec6', - 'info_dict': { - 'id': '884301', - 'ext': 'mp4', - 'title': 'Michael Krons mit Hans-Werner Sinn', - 'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr', - 'upload_date': '20141025', - 'uploader': 'Im Dialog', - } - } + _VALID_URL = r'''(?x)https?://(?:www\.)?phoenix\.de/content/ + (?: + phoenix/die_sendungen/(?:[^/]+/)? + )? + (?P<id>[0-9]+)''' + _TESTS = [ + { + 'url': 'http://www.phoenix.de/content/884301', + 'md5': 'ed249f045256150c92e72dbb70eadec6', + 'info_dict': { + 'id': '884301', + 'ext': 'mp4', + 'title': 'Michael Krons mit Hans-Werner Sinn', + 'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr', + 'upload_date': '20141025', + 'uploader': 'Im Dialog', + } + }, + { + 'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/869815', + 'only_matching': True, + }, + { + 'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/diskussionen/928234', + 'only_matching': True, + }, + ] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 3a27e3789..0c8b731cf 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -33,10 +33,8 @@ class PornHubIE(InfoExtractor): } def _extract_count(self, pattern, webpage, name): - count = self._html_search_regex(pattern, webpage, '%s count' % name, fatal=False) - if count: - count = str_to_int(count) - return count + return str_to_int(self._search_regex( + pattern, webpage, '%s count' % name, fatal=False)) def _real_extract(self, url): video_id = self._match_id(url) @@ -62,11 +60,14 @@ class PornHubIE(InfoExtractor): if thumbnail: thumbnail = compat_urllib_parse.unquote(thumbnail) - view_count = self._extract_count(r'<span class="count">([\d,\.]+)</span> views', webpage, 'view') - like_count = self._extract_count(r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like') - dislike_count = self._extract_count(r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike') + view_count = self._extract_count( + r'<span class="count">([\d,\.]+)</span> views', webpage, 'view') + like_count = self._extract_count( + r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like') + dislike_count = self._extract_count( + r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike') comment_count = self._extract_count( - r'All comments \(<var class="videoCommentCount">([\d,\.]+)</var>', webpage, 'comment') + r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment') video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage))) if webpage.find('"encrypted":true') != -1: diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 9d4505972..316b2c90f 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -242,7 +242,7 @@ class SoundcloudIE(InfoExtractor): class SoundcloudSetIE(SoundcloudIE): - _VALID_URL = r'https?://(?:www\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?' + _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?' IE_NAME = 'soundcloud:set' _TESTS = [{ 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep', @@ -287,7 +287,7 @@ class SoundcloudSetIE(SoundcloudIE): class SoundcloudUserIE(SoundcloudIE): - _VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$' + _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$' IE_NAME = 'soundcloud:user' _TESTS = [{ 'url': 'https://soundcloud.com/the-concept-band', diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index feac666f7..0e3e627f4 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -92,7 +92,7 @@ class ThePlatformIE(InfoExtractor): error_msg = next( n.attrib['abstract'] for n in meta.findall(_x('.//smil:ref')) - if n.attrib.get('title') == 'Geographic Restriction') + if n.attrib.get('title') == 'Geographic Restriction' or n.attrib.get('title') == 'Expired') except StopIteration: pass else: diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py index 4971965f9..81d885fdc 100644 --- a/youtube_dl/extractor/xuite.py +++ b/youtube_dl/extractor/xuite.py @@ -69,18 +69,26 @@ class XuiteIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def base64_decode_utf8(data): + return base64.b64decode(data.encode('utf-8')).decode('utf-8') + + @staticmethod + def base64_encode_utf8(data): + return base64.b64encode(data.encode('utf-8')).decode('utf-8') + def _extract_flv_config(self, media_id): - base64_media_id = base64.b64encode(media_id.encode('utf-8')).decode('utf-8') + base64_media_id = self.base64_encode_utf8(media_id) flv_config = self._download_xml( 'http://vlog.xuite.net/flash/player?media=%s' % base64_media_id, 'flv config') prop_dict = {} for prop in flv_config.findall('./property'): - prop_id = base64.b64decode(prop.attrib['id']).decode('utf-8') + prop_id = self.base64_decode_utf8(prop.attrib['id']) # CDATA may be empty in flv config if not prop.text: continue - encoded_content = base64.b64decode(prop.text).decode('utf-8') + encoded_content = self.base64_decode_utf8(prop.text) prop_dict[prop_id] = compat_urllib_parse_unquote(encoded_content) return prop_dict diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 97dbac4cc..b777159c5 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -17,6 +17,8 @@ from ..utils import ( int_or_none, ) +from .nbc import NBCSportsVPlayerIE + class YahooIE(InfoExtractor): IE_DESC = 'Yahoo screen and movies' @@ -129,6 +131,15 @@ class YahooIE(InfoExtractor): }, { 'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html', 'only_matching': True, + }, { + 'note': 'NBC Sports embeds', + 'url': 'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313', + 'info_dict': { + 'id': '9CsDKds0kvHI', + 'ext': 'flv', + 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', + 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', + } } ] @@ -151,6 +162,10 @@ class YahooIE(InfoExtractor): items = json.loads(items_json) video_id = items[0]['id'] return self._get_info(video_id, display_id, webpage) + # Look for NBCSports iframes + nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) + if nbc_sports_url: + return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') items_json = self._search_regex( r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE, |