about | summary | refs | log | tree | commit | diff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- youtube_dl/extractor/__init__.py 7
-rw-r--r-- youtube_dl/extractor/firedrive.py 83
-rw-r--r-- youtube_dl/extractor/mtv.py 3
-rw-r--r-- youtube_dl/extractor/ndr.py 12
-rw-r--r-- youtube_dl/extractor/pyvideo.py 2
-rw-r--r-- youtube_dl/extractor/reverbnation.py 45
-rw-r--r-- youtube_dl/extractor/ruhd.py 46
-rw-r--r-- youtube_dl/extractor/soundcloud.py 16
-rw-r--r-- youtube_dl/extractor/southpark.py (renamed from youtube_dl/extractor/southparkstudios.py) 12
-rw-r--r-- youtube_dl/extractor/tlc.py 5
-rw-r--r-- youtube_dl/extractor/tutv.py 21
11 files changed, 218 insertions, 34 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index f75939a05..14133c315 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -83,6 +83,7 @@ from .extremetube import ExtremeTubeIE
from .facebook import FacebookIE
from .faz import FazIE
from .fc2 import FC2IE
+from .firedrive import FiredriveIE
from .firstpost import FirstpostIE
from .firsttv import FirstTVIE
from .fivemin import FiveMinIE
@@ -232,6 +233,7 @@ from .radiofrance import RadioFranceIE
from .rai import RaiIE
from .rbmaradio import RBMARadioIE
from .redtube import RedTubeIE
+from .reverbnation import ReverbNationIE
from .ringtv import RingTVIE
from .ro220 import Ro220IE
from .rottentomatoes import RottenTomatoesIE
@@ -240,6 +242,7 @@ from .rtbf import RTBFIE
from .rtlnow import RTLnowIE
from .rts import RTSIE
from .rtve import RTVEALaCartaIE
+from .ruhd import RUHDIE
from .rutube import (
RutubeIE,
RutubeChannelIE,
@@ -268,8 +271,8 @@ from .soundcloud import (
SoundcloudPlaylistIE
)
from .soundgasm import SoundgasmIE
-from .southparkstudios import (
- SouthParkStudiosIE,
+from .southpark import (
+ SouthParkIE,
SouthparkDeIE,
)
from .space import SpaceIE
diff --git a/youtube_dl/extractor/firedrive.py b/youtube_dl/extractor/firedrive.py
new file mode 100644
index 000000000..d26145db1
--- /dev/null
+++ b/youtube_dl/extractor/firedrive.py
@@ -0,0 +1,83 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ compat_urllib_parse,
+ compat_urllib_request,
+ determine_ext,
+)
+
+
+class FiredriveIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?firedrive\.com/' + \
+ '(?:file|embed)/(?P<id>[0-9a-zA-Z]+)'
+ _FILE_DELETED_REGEX = r'<div class="removed_file_image">'
+
+ _TESTS = [{
+ 'url': 'https://www.firedrive.com/file/FEB892FA160EBD01',
+ 'md5': 'd5d4252f80ebeab4dc2d5ceaed1b7970',
+ 'info_dict': {
+ 'id': 'FEB892FA160EBD01',
+ 'ext': 'flv',
+ 'title': 'bbb_theora_486kbit.flv',
+ 'thumbnail': 're:^http://.*\.jpg$',
+ },
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ url = 'http://firedrive.com/file/%s' % video_id
+
+ webpage = self._download_webpage(url, video_id)
+
+ if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
+ raise ExtractorError('Video %s does not exist' % video_id,
+ expected=True)
+
+ fields = dict(re.findall(r'''(?x)<input\s+
+ type="hidden"\s+
+ name="([^"]+)"\s+
+ (?:id="[^"]+"\s+)?
+ value="([^"]*)"
+ ''', webpage))
+
+ post = compat_urllib_parse.urlencode(fields)
+ req = compat_urllib_request.Request(url, post)
+ req.add_header('Content-type', 'application/x-www-form-urlencoded')
+
+ # Apparently, this header is required for confirmation to work.
+ req.add_header('Host', 'www.firedrive.com')
+
+ webpage = self._download_webpage(req, video_id,
+ 'Downloading video page')
+
+ title = self._search_regex(r'class="external_title_left">(.+)</div>',
+ webpage, 'title')
+ thumbnail = self._search_regex(r'image:\s?"(//[^\"]+)', webpage,
+ 'thumbnail', fatal=False)
+ if thumbnail is not None:
+ thumbnail = 'http:' + thumbnail
+
+ ext = self._search_regex(r'type:\s?\'([^\']+)\',',
+ webpage, 'extension', fatal=False)
+ video_url = self._search_regex(
+ r'file:\s?\'(http[^\']+)\',', webpage, 'file url')
+
+ formats = [{
+ 'format_id': 'sd',
+ 'url': video_url,
+ 'ext': ext,
+ }]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index af9490ccc..228b42d2b 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -158,6 +158,9 @@ class MTVServicesInfoExtractor(InfoExtractor):
if mgid.endswith('.swf'):
mgid = mgid[:-4]
except RegexNotFoundError:
+ mgid = None
+
+ if mgid is None or ':' not in mgid:
mgid = self._search_regex(
[r'data-mgid="(.*?)"', r'swfobject.embedSWF\(".*?(mgid:.*?)"'],
webpage, u'mgid')
diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py
index 3d6096e46..94d5ba982 100644
--- a/youtube_dl/extractor/ndr.py
+++ b/youtube_dl/extractor/ndr.py
@@ -18,15 +18,15 @@ class NDRIE(InfoExtractor):
_TESTS = [
{
- 'url': 'http://www.ndr.de/fernsehen/sendungen/markt/markt7959.html',
- 'md5': 'e7a6079ca39d3568f4996cb858dd6708',
+ 'url': 'http://www.ndr.de/fernsehen/media/dienordreportage325.html',
+ 'md5': '4a4eeafd17c3058b65f0c8f091355855',
'note': 'Video file',
'info_dict': {
- 'id': '7959',
+ 'id': '325',
'ext': 'mp4',
- 'title': 'Markt - die ganze Sendung',
- 'description': 'md5:af9179cf07f67c5c12dc6d9997e05725',
- 'duration': 2655,
+ 'title': 'Blaue Bohnen aus Blocken',
+ 'description': 'md5:190d71ba2ccddc805ed01547718963bc',
+ 'duration': 1715,
},
},
{
diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py
index 0bc0859b4..6d5732d45 100644
--- a/youtube_dl/extractor/pyvideo.py
+++ b/youtube_dl/extractor/pyvideo.py
@@ -46,7 +46,7 @@ class PyvideoIE(InfoExtractor):
return self.url_result(m_youtube.group(1), 'Youtube')
title = self._html_search_regex(
- r'<div class="section">.*?<h3(?:\s+class="[^"]*")?>([^>]+?)</h3>',
+ r'<div class="section">\s*<h3(?:\s+class="[^"]*"[^>]*)?>([^>]+?)</h3>',
webpage, 'title', flags=re.DOTALL)
video_url = self._search_regex(
[r'<source src="(.*?)"', r'<dt>Download</dt>.*?<a href="(.+?)"'],
diff --git a/youtube_dl/extractor/reverbnation.py b/youtube_dl/extractor/reverbnation.py
new file mode 100644
index 000000000..49cf427a1
--- /dev/null
+++ b/youtube_dl/extractor/reverbnation.py
@@ -0,0 +1,45 @@
+from __future__ import unicode_literals
+
+import re
+import time
+
+from .common import InfoExtractor
+from ..utils import strip_jsonp
+
+
+class ReverbNationIE(InfoExtractor):
+ _VALID_URL = r'^https?://(?:www\.)?reverbnation\.com/.*?/song/(?P<id>\d+).*?$'
+ _TESTS = [{
+ 'url': 'http://www.reverbnation.com/alkilados/song/16965047-mona-lisa',
+ 'file': '16965047.mp3',
+ 'md5': '3da12ebca28c67c111a7f8b262d3f7a7',
+ 'info_dict': {
+ "title": "MONA LISA",
+ "uploader": "ALKILADOS",
+ "uploader_id": 216429,
+ "thumbnail": "//gp1.wac.edgecastcdn.net/802892/production_public/Photo/13761700/image/1366002176_AVATAR_MONA_LISA.jpg"
+ },
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ song_id = mobj.group('id')
+
+ api_res = self._download_json(
+ 'https://api.reverbnation.com/song/%s?callback=api_response_5&_=%d'
+ % (song_id, int(time.time() * 1000)),
+ song_id,
+ transform_source=strip_jsonp,
+ note='Downloading information of song %s' % song_id
+ )
+
+ return {
+ 'id': song_id,
+ 'title': api_res.get('name'),
+ 'url': api_res.get('url'),
+ 'uploader': api_res.get('artist', {}).get('name'),
+ 'uploader_id': api_res.get('artist', {}).get('id'),
+ 'thumbnail': api_res.get('image', api_res.get('thumbnail')),
+ 'ext': 'mp3',
+ 'vcodec': 'none',
+ }
diff --git a/youtube_dl/extractor/ruhd.py b/youtube_dl/extractor/ruhd.py
new file mode 100644
index 000000000..55b58e5e6
--- /dev/null
+++ b/youtube_dl/extractor/ruhd.py
@@ -0,0 +1,46 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class RUHDIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?ruhd\.ru/play\.php\?vid=(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://www.ruhd.ru/play.php?vid=207',
+ 'md5': 'd1a9ec4edf8598e3fbd92bb16072ba83',
+ 'info_dict': {
+ 'id': '207',
+ 'ext': 'divx',
+ 'title': 'КОТ бааааам',
+ 'description': 'классный кот)',
+ 'thumbnail': 're:^http://.*\.jpg$',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_url = self._html_search_regex(
+ r'<param name="src" value="([^"]+)"', webpage, 'video url')
+ title = self._html_search_regex(
+ r'<title>([^<]+)&nbsp;&nbsp; RUHD.ru - Видео Высокого качества №1 в России!</title>', webpage, 'title')
+ description = self._html_search_regex(
+ r'(?s)<div id="longdesc">(.+?)<span id="showlink">', webpage, 'description', fatal=False)
+ thumbnail = self._html_search_regex(
+ r'<param name="previewImage" value="([^"]+)"', webpage, 'thumbnail', fatal=False)
+ if thumbnail:
+ thumbnail = 'http://www.ruhd.ru' + thumbnail
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ }
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index 14ec9452d..8a77c1370 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -81,16 +81,16 @@ class SoundcloudIE(InfoExtractor):
},
# downloadable song
{
- 'url': 'https://soundcloud.com/simgretina/just-your-problem-baby-1',
- 'md5': '56a8b69568acaa967b4c49f9d1d52d19',
+ 'url': 'https://soundcloud.com/oddsamples/bus-brakes',
+ 'md5': 'fee7b8747b09bb755cefd4b853e7249a',
'info_dict': {
- 'id': '105614606',
+ 'id': '128590877',
'ext': 'wav',
- 'title': 'Just Your Problem Baby (Acapella)',
- 'description': 'Vocals',
- 'uploader': 'Sim Gretina',
- 'upload_date': '20130815',
- #'duration': 42,
+ 'title': 'Bus Brakes',
+ 'description': 'md5:0170be75dd395c96025d210d261c784e',
+ 'uploader': 'oddsamples',
+ 'upload_date': '20140109',
+ 'duration': 17,
},
},
]
diff --git a/youtube_dl/extractor/southparkstudios.py b/youtube_dl/extractor/southpark.py
index aea8e6439..c20397b3d 100644
--- a/youtube_dl/extractor/southparkstudios.py
+++ b/youtube_dl/extractor/southpark.py
@@ -3,24 +3,24 @@ from __future__ import unicode_literals
from .mtv import MTVServicesInfoExtractor
-class SouthParkStudiosIE(MTVServicesInfoExtractor):
- IE_NAME = 'southparkstudios.com'
- _VALID_URL = r'https?://(www\.)?(?P<url>southparkstudios\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
+class SouthParkIE(MTVServicesInfoExtractor):
+ IE_NAME = 'southpark.cc.com'
+ _VALID_URL = r'https?://(www\.)?(?P<url>southpark\.cc\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
_FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss'
_TESTS = [{
- 'url': 'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured',
+ 'url': 'http://southpark.cc.com/clips/104437/bat-daded#tab=featured',
'info_dict': {
'id': 'a7bff6c2-ed00-11e0-aca6-0026b9414f30',
'ext': 'mp4',
- 'title': 'Bat Daded',
+ 'title': 'South Park|Bat Daded',
'description': 'Randy disqualifies South Park by getting into a fight with Bat Dad.',
},
}]
-class SouthparkDeIE(SouthParkStudiosIE):
+class SouthparkDeIE(SouthParkIE):
IE_NAME = 'southpark.de'
_VALID_URL = r'https?://(www\.)?(?P<url>southpark\.de/(clips|alle-episoden)/(?P<id>.+?)(\?|#|$))'
_FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/'
diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py
index ad175b83e..d848ee186 100644
--- a/youtube_dl/extractor/tlc.py
+++ b/youtube_dl/extractor/tlc.py
@@ -5,6 +5,7 @@ import re
from .common import InfoExtractor
from .brightcove import BrightcoveIE
from .discovery import DiscoveryIE
+from ..utils import compat_urlparse
class TlcIE(DiscoveryIE):
@@ -51,6 +52,10 @@ class TlcDeIE(InfoExtractor):
# Otherwise we don't get the correct 'BrightcoveExperience' element,
# example: http://www.tlc.de/sendungen/cake-boss/videos/cake-boss-cannoli-drama/
iframe_url = iframe_url.replace('.htm?', '.php?')
+ url_fragment = compat_urlparse.urlparse(url).fragment
+ if url_fragment:
+ # Since the fragment is not sent to the server, we always get the same iframe
+ iframe_url = re.sub(r'playlist=(\d+)', 'playlist=%s' % url_fragment, iframe_url)
iframe = self._download_webpage(iframe_url, title)
return {
diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py
index c980153ec..d516b6427 100644
--- a/youtube_dl/extractor/tutv.py
+++ b/youtube_dl/extractor/tutv.py
@@ -1,21 +1,21 @@
from __future__ import unicode_literals
+
import base64
import re
from .common import InfoExtractor
-from ..utils import (
- compat_parse_qs,
-)
+from ..utils import compat_parse_qs
class TutvIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?tu\.tv/videos/(?P<id>[^/?]+)'
_TEST = {
- 'url': 'http://tu.tv/videos/noah-en-pabellon-cuahutemoc',
- 'file': '2742556.flv',
- 'md5': '5eb766671f69b82e528dc1e7769c5cb2',
+ 'url': 'http://tu.tv/videos/robots-futbolistas',
+ 'md5': '627c7c124ac2a9b5ab6addb94e0e65f7',
'info_dict': {
- 'title': 'Noah en pabellon cuahutemoc',
+ 'id': '2973058',
+ 'ext': 'flv',
+ 'title': 'Robots futbolistas',
},
}
@@ -26,10 +26,9 @@ class TutvIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, 'internal video ID')
- data_url = 'http://tu.tv/flvurl.php?codVideo=' + str(internal_id)
- data_content = self._download_webpage(data_url, video_id, note='Downloading video info')
- data = compat_parse_qs(data_content)
- video_url = base64.b64decode(data['kpt'][0]).decode('utf-8')
+ data_content = self._download_webpage(
+ 'http://tu.tv/flvurl.php?codVideo=%s' % internal_id, video_id, 'Downloading video info')
+ video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0]).decode('utf-8')
return {
'id': internal_id,