about | summary | refs | log | tree | commit | diff
path: root/youtube_dl/extractor/generic.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor/generic.py')
-rw-r--r-- youtube_dl/extractor/generic.py 41
1 files changed, 30 insertions, 11 deletions
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 7a3a7f66b..34d930a2d 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -4,10 +4,12 @@ from __future__ import unicode_literals
import os
import re
+import sys
from .common import InfoExtractor
from .youtube import YoutubeIE
from ..compat import (
+ compat_etree_fromstring,
compat_urllib_parse_unquote,
compat_urllib_request,
compat_urlparse,
@@ -20,7 +22,6 @@ from ..utils import (
HEADRequest,
is_html,
orderedSet,
- parse_xml,
smuggle_url,
unescapeHTML,
unified_strdate,
@@ -52,6 +53,7 @@ from .dailymotion import DailymotionCloudIE
from .onionstudios import OnionStudiosIE
from .snagfilms import SnagFilmsEmbedIE
from .screenwavemedia import ScreenwaveMediaIE
+from .mtv import MTVServicesEmbeddedIE
class GenericIE(InfoExtractor):
@@ -142,6 +144,7 @@ class GenericIE(InfoExtractor):
'ext': 'mp4',
'title': 'Automatics, robotics and biocybernetics',
'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
+ 'upload_date': '20130627',
'formats': 'mincount:16',
'subtitles': 'mincount:1',
},
@@ -234,6 +237,22 @@ class GenericIE(InfoExtractor):
}
},
{
+ # redirect in Refresh HTTP header
+ 'url': 'https://www.facebook.com/l.php?u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&h=TAQHsoToz&enc=AZN16h-b6o4Zq9pZkCCdOLNKMN96BbGMNtcFwHSaazus4JHT_MFYkAA-WARTX2kvsCIdlAIyHZjl6d33ILIJU7Jzwk_K3mcenAXoAzBNoZDI_Q7EXGDJnIhrGkLXo_LJ_pAa2Jzbx17UHMd3jAs--6j2zaeto5w9RTn8T_1kKg3fdC5WPX9Dbb18vzH7YFX0eSJmoa6SP114rvlkw6pkS1-T&s=1',
+ 'info_dict': {
+ 'id': 'pO8h3EaFRdo',
+ 'ext': 'mp4',
+ 'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set',
+ 'description': 'md5:6294cc1af09c4049e0652b51a2df10d5',
+ 'upload_date': '20150917',
+ 'uploader_id': 'brtvofficial',
+ 'uploader': 'Boiler Room',
+ },
+ 'params': {
+ 'skip_download': False,
+ },
+ },
+ {
'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
'info_dict': {
@@ -1233,7 +1252,7 @@ class GenericIE(InfoExtractor):
# Is it an RSS feed, a SMIL file or a XSPF playlist?
try:
- doc = parse_xml(webpage)
+ doc = compat_etree_fromstring(webpage.encode('utf-8'))
if doc.tag == 'rss':
return self._extract_rss(url, video_id, doc)
elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
@@ -1613,12 +1632,9 @@ class GenericIE(InfoExtractor):
return self.url_result(url, ie='Vulture')
# Look for embedded mtvservices player
- mobj = re.search(
- r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
- webpage)
- if mobj is not None:
- url = unescapeHTML(mobj.group('url'))
- return self.url_result(url, ie='MTVServicesEmbedded')
+ mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage)
+ if mtvservices_url:
+ return self.url_result(mtvservices_url, ie='MTVServicesEmbedded')
# Look for embedded yahoo player
mobj = re.search(
@@ -1657,7 +1673,7 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'), 'MLB')
mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
+ r'<(?:iframe|script)[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
webpage)
if mobj is not None:
return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
@@ -1675,8 +1691,8 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'), 'Zapiks')
# Look for Kaltura embeds
- mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage) or
- re.search(r'(?s)(["\'])(?:https?:)?//cdnapisec\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?\1.*?entry_id\s*:\s*(["\'])(?P<id>[^\2]+?)\2', webpage))
+ mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_?[Ii]d'\s*:\s*'(?P<id>[^']+)',", webpage) or
+ re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', webpage))
if mobj is not None:
return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
@@ -1827,6 +1843,9 @@ class GenericIE(InfoExtractor):
# Look also in Refresh HTTP header
refresh_header = head_response.headers.get('Refresh')
if refresh_header:
+ # In python 2 response HTTP headers are bytestrings
+ if sys.version_info < (3, 0) and isinstance(refresh_header, str):
+ refresh_header = refresh_header.decode('iso-8859-1')
found = re.search(REDIRECT_REGEX, refresh_header)
if found:
new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1)))