aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl')
-rw-r--r--youtube_dl/extractor/__init__.py3
-rw-r--r--youtube_dl/extractor/dailymotion.py8
-rw-r--r--youtube_dl/extractor/dumpert.py56
-rw-r--r--youtube_dl/extractor/generic.py16
-rw-r--r--youtube_dl/extractor/nbc.py51
-rw-r--r--youtube_dl/extractor/phoenix.py40
-rw-r--r--youtube_dl/extractor/pornhub.py17
-rw-r--r--youtube_dl/extractor/soundcloud.py4
-rw-r--r--youtube_dl/extractor/theplatform.py2
-rw-r--r--youtube_dl/extractor/xuite.py14
-rw-r--r--youtube_dl/extractor/yahoo.py15
11 files changed, 195 insertions, 31 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index a65c0c25b..9fddb8e32 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -115,6 +115,7 @@ from .drtuber import DrTuberIE
from .drtv import DRTVIE
from .dvtv import DVTVIE
from .dump import DumpIE
+from .dumpert import DumpertIE
from .defense import DefenseGouvFrIE
from .discovery import DiscoveryIE
from .divxstage import DivxStageIE
@@ -310,6 +311,8 @@ from .nba import NBAIE
from .nbc import (
NBCIE,
NBCNewsIE,
+ NBCSportsIE,
+ NBCSportsVPlayerIE,
)
from .ndr import NDRIE
from .ndtv import NDTVIE
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 4f67c3aac..47d58330b 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -25,8 +25,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor):
def _build_request(url):
"""Build a request with the family filter disabled"""
request = compat_urllib_request.Request(url)
- request.add_header('Cookie', 'family_filter=off')
- request.add_header('Cookie', 'ff=off')
+ request.add_header('Cookie', 'family_filter=off; ff=off')
return request
@@ -112,8 +111,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id
- embed_page = self._download_webpage(embed_url, video_id,
- 'Downloading embed page')
+ embed_request = self._build_request(embed_url)
+ embed_page = self._download_webpage(
+ embed_request, video_id, 'Downloading embed page')
info = self._search_regex(r'var info = ({.*?}),$', embed_page,
'video info', flags=re.MULTILINE)
info = json.loads(info)
diff --git a/youtube_dl/extractor/dumpert.py b/youtube_dl/extractor/dumpert.py
new file mode 100644
index 000000000..e43bc81b2
--- /dev/null
+++ b/youtube_dl/extractor/dumpert.py
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+
+from .common import InfoExtractor
+from ..utils import qualities
+
+
+class DumpertIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?dumpert\.nl/mediabase/(?P<id>[0-9]+/[0-9a-zA-Z]+)'
+ _TEST = {
+ 'url': 'http://www.dumpert.nl/mediabase/6646981/951bc60f/',
+ 'md5': '1b9318d7d5054e7dcb9dc7654f21d643',
+ 'info_dict': {
+ 'id': '6646981/951bc60f',
+ 'ext': 'mp4',
+ 'title': 'Ik heb nieuws voor je',
+ 'description': 'Niet schrikken hoor',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ files_base64 = self._search_regex(
+ r'data-files="([^"]+)"', webpage, 'data files')
+
+ files = self._parse_json(
+ base64.b64decode(files_base64.encode('utf-8')).decode('utf-8'),
+ video_id)
+
+ quality = qualities(['flv', 'mobile', 'tablet', '720p'])
+
+ formats = [{
+ 'url': video_url,
+ 'format_id': format_id,
+ 'quality': quality(format_id),
+ } for format_id, video_url in files.items() if format_id != 'still']
+ self._sort_formats(formats)
+
+ title = self._html_search_meta(
+ 'title', webpage) or self._og_search_title(webpage)
+ description = self._html_search_meta(
+ 'description', webpage) or self._og_search_description(webpage)
+ thumbnail = files.get('still') or self._og_search_thumbnail(webpage)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'formats': formats
+ }
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 042d23a13..2ff002643 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -29,6 +29,7 @@ from ..utils import (
xpath_text,
)
from .brightcove import BrightcoveIE
+from .nbc import NBCSportsVPlayerIE
from .ooyala import OoyalaIE
from .rutv import RUTVIE
from .smotri import SmotriIE
@@ -639,6 +640,16 @@ class GenericIE(InfoExtractor):
'upload_date': '20150228',
'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
}
+ },
+ # NBC Sports vplayer embed
+ {
+ 'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
+ 'info_dict': {
+ 'id': 'ln7x1qSThw4k',
+ 'ext': 'flv',
+ 'title': "PFT Live: New leader in the 'new-look' defense",
+ 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
+ },
}
]
@@ -1252,6 +1263,11 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
+ # Look for NBC Sports VPlayer embeds
+ nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
+ if nbc_sports_url:
+ return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
+
def check_video(vurl):
if YoutubeIE.suitable(vurl):
return True
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index 80a01c778..ecd0ac8b1 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -50,6 +50,57 @@ class NBCIE(InfoExtractor):
return self.url_result(theplatform_url)
+class NBCSportsVPlayerIE(InfoExtractor):
+ _VALID_URL = r'https?://vplayer\.nbcsports\.com/(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)'
+
+ _TESTS = [{
+ 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_share/select/9CsDKds0kvHI',
+ 'info_dict': {
+ 'id': '9CsDKds0kvHI',
+ 'ext': 'flv',
+ 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d',
+ 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson',
+ }
+ }, {
+ 'url': 'http://vplayer.nbcsports.com/p/BxmELC/nbc_embedshare/select/_hqLjQ95yx8Z',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _extract_url(webpage):
+ iframe_m = re.search(
+ r'<iframe[^>]+src="(?P<url>https?://vplayer\.nbcsports\.com/[^"]+)"', webpage)
+ if iframe_m:
+ return iframe_m.group('url')
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ theplatform_url = self._og_search_video_url(webpage)
+ return self.url_result(theplatform_url, 'ThePlatform')
+
+
+class NBCSportsIE(InfoExtractor):
+ # Does not include https becuase its certificate is invalid
+ _VALID_URL = r'http://www\.nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)'
+
+ _TEST = {
+ 'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke',
+ 'info_dict': {
+ 'id': 'PHJSaFWbrTY9',
+ 'ext': 'flv',
+ 'title': 'Tom Izzo, Michigan St. has \'so much respect\' for Duke',
+ 'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ return self.url_result(
+ NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer')
+
+
class NBCNewsIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/
(?:video/.+?/(?P<id>\d+)|
diff --git a/youtube_dl/extractor/phoenix.py b/youtube_dl/extractor/phoenix.py
index a20672c0c..46cebc0d7 100644
--- a/youtube_dl/extractor/phoenix.py
+++ b/youtube_dl/extractor/phoenix.py
@@ -5,19 +5,33 @@ from .zdf import extract_from_xml_url
class PhoenixIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?phoenix\.de/content/(?P<id>[0-9]+)'
- _TEST = {
- 'url': 'http://www.phoenix.de/content/884301',
- 'md5': 'ed249f045256150c92e72dbb70eadec6',
- 'info_dict': {
- 'id': '884301',
- 'ext': 'mp4',
- 'title': 'Michael Krons mit Hans-Werner Sinn',
- 'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr',
- 'upload_date': '20141025',
- 'uploader': 'Im Dialog',
- }
- }
+ _VALID_URL = r'''(?x)https?://(?:www\.)?phoenix\.de/content/
+ (?:
+ phoenix/die_sendungen/(?:[^/]+/)?
+ )?
+ (?P<id>[0-9]+)'''
+ _TESTS = [
+ {
+ 'url': 'http://www.phoenix.de/content/884301',
+ 'md5': 'ed249f045256150c92e72dbb70eadec6',
+ 'info_dict': {
+ 'id': '884301',
+ 'ext': 'mp4',
+ 'title': 'Michael Krons mit Hans-Werner Sinn',
+ 'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr',
+ 'upload_date': '20141025',
+ 'uploader': 'Im Dialog',
+ }
+ },
+ {
+ 'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/869815',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/diskussionen/928234',
+ 'only_matching': True,
+ },
+ ]
def _real_extract(self, url):
video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index 3a27e3789..0c8b731cf 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -33,10 +33,8 @@ class PornHubIE(InfoExtractor):
}
def _extract_count(self, pattern, webpage, name):
- count = self._html_search_regex(pattern, webpage, '%s count' % name, fatal=False)
- if count:
- count = str_to_int(count)
- return count
+ return str_to_int(self._search_regex(
+ pattern, webpage, '%s count' % name, fatal=False))
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -62,11 +60,14 @@ class PornHubIE(InfoExtractor):
if thumbnail:
thumbnail = compat_urllib_parse.unquote(thumbnail)
- view_count = self._extract_count(r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
- like_count = self._extract_count(r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
- dislike_count = self._extract_count(r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
+ view_count = self._extract_count(
+ r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
+ like_count = self._extract_count(
+ r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
+ dislike_count = self._extract_count(
+ r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
comment_count = self._extract_count(
- r'All comments \(<var class="videoCommentCount">([\d,\.]+)</var>', webpage, 'comment')
+ r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
if webpage.find('"encrypted":true') != -1:
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index 9d4505972..316b2c90f 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -242,7 +242,7 @@ class SoundcloudIE(InfoExtractor):
class SoundcloudSetIE(SoundcloudIE):
- _VALID_URL = r'https?://(?:www\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?'
+ _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?'
IE_NAME = 'soundcloud:set'
_TESTS = [{
'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep',
@@ -287,7 +287,7 @@ class SoundcloudSetIE(SoundcloudIE):
class SoundcloudUserIE(SoundcloudIE):
- _VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$'
+ _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$'
IE_NAME = 'soundcloud:user'
_TESTS = [{
'url': 'https://soundcloud.com/the-concept-band',
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index feac666f7..0e3e627f4 100644
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -92,7 +92,7 @@ class ThePlatformIE(InfoExtractor):
error_msg = next(
n.attrib['abstract']
for n in meta.findall(_x('.//smil:ref'))
- if n.attrib.get('title') == 'Geographic Restriction')
+ if n.attrib.get('title') == 'Geographic Restriction' or n.attrib.get('title') == 'Expired')
except StopIteration:
pass
else:
diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py
index 4971965f9..81d885fdc 100644
--- a/youtube_dl/extractor/xuite.py
+++ b/youtube_dl/extractor/xuite.py
@@ -69,18 +69,26 @@ class XuiteIE(InfoExtractor):
'only_matching': True,
}]
+ @staticmethod
+ def base64_decode_utf8(data):
+ return base64.b64decode(data.encode('utf-8')).decode('utf-8')
+
+ @staticmethod
+ def base64_encode_utf8(data):
+ return base64.b64encode(data.encode('utf-8')).decode('utf-8')
+
def _extract_flv_config(self, media_id):
- base64_media_id = base64.b64encode(media_id.encode('utf-8')).decode('utf-8')
+ base64_media_id = self.base64_encode_utf8(media_id)
flv_config = self._download_xml(
'http://vlog.xuite.net/flash/player?media=%s' % base64_media_id,
'flv config')
prop_dict = {}
for prop in flv_config.findall('./property'):
- prop_id = base64.b64decode(prop.attrib['id']).decode('utf-8')
+ prop_id = self.base64_decode_utf8(prop.attrib['id'])
# CDATA may be empty in flv config
if not prop.text:
continue
- encoded_content = base64.b64decode(prop.text).decode('utf-8')
+ encoded_content = self.base64_decode_utf8(prop.text)
prop_dict[prop_id] = compat_urllib_parse_unquote(encoded_content)
return prop_dict
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index 97dbac4cc..b777159c5 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -17,6 +17,8 @@ from ..utils import (
int_or_none,
)
+from .nbc import NBCSportsVPlayerIE
+
class YahooIE(InfoExtractor):
IE_DESC = 'Yahoo screen and movies'
@@ -129,6 +131,15 @@ class YahooIE(InfoExtractor):
}, {
'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html',
'only_matching': True,
+ }, {
+ 'note': 'NBC Sports embeds',
+ 'url': 'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313',
+ 'info_dict': {
+ 'id': '9CsDKds0kvHI',
+ 'ext': 'flv',
+ 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d',
+ 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson',
+ }
}
]
@@ -151,6 +162,10 @@ class YahooIE(InfoExtractor):
items = json.loads(items_json)
video_id = items[0]['id']
return self._get_info(video_id, display_id, webpage)
+ # Look for NBCSports iframes
+ nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
+ if nbc_sports_url:
+ return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
items_json = self._search_regex(
r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE,