aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl')
-rwxr-xr-xyoutube_dl/YoutubeDL.py134
-rw-r--r--youtube_dl/__init__.py95
-rw-r--r--youtube_dl/compat.py6
-rw-r--r--youtube_dl/downloader/common.py30
-rw-r--r--youtube_dl/downloader/f4m.py50
-rw-r--r--youtube_dl/downloader/hls.py14
-rw-r--r--youtube_dl/downloader/http.py34
-rw-r--r--youtube_dl/downloader/mplayer.py8
-rw-r--r--youtube_dl/downloader/rtmp.py4
-rw-r--r--youtube_dl/extractor/__init__.py55
-rw-r--r--youtube_dl/extractor/adobetv.py70
-rw-r--r--youtube_dl/extractor/adultswim.py201
-rw-r--r--youtube_dl/extractor/aljazeera.py35
-rw-r--r--youtube_dl/extractor/allocine.py10
-rw-r--r--youtube_dl/extractor/alphaporno.py77
-rw-r--r--youtube_dl/extractor/aol.py48
-rw-r--r--youtube_dl/extractor/appletrailers.py2
-rw-r--r--youtube_dl/extractor/archiveorg.py52
-rw-r--r--youtube_dl/extractor/arte.py4
-rw-r--r--youtube_dl/extractor/atresplayer.py114
-rw-r--r--youtube_dl/extractor/audiomack.py134
-rw-r--r--youtube_dl/extractor/auengine.py26
-rw-r--r--youtube_dl/extractor/azubu.py93
-rw-r--r--youtube_dl/extractor/bambuser.py2
-rw-r--r--youtube_dl/extractor/bandcamp.py12
-rw-r--r--youtube_dl/extractor/bbccouk.py138
-rw-r--r--youtube_dl/extractor/behindkink.py29
-rw-r--r--youtube_dl/extractor/bet.py107
-rw-r--r--youtube_dl/extractor/bilibili.py72
-rw-r--r--youtube_dl/extractor/bliptv.py38
-rw-r--r--youtube_dl/extractor/brightcove.py21
-rw-r--r--youtube_dl/extractor/buzzfeed.py6
-rw-r--r--youtube_dl/extractor/canalplus.py12
-rw-r--r--youtube_dl/extractor/ceskatelevize.py137
-rw-r--r--youtube_dl/extractor/channel9.py13
-rw-r--r--youtube_dl/extractor/cinchcast.py52
-rw-r--r--youtube_dl/extractor/cnet.py35
-rw-r--r--youtube_dl/extractor/cnn.py41
-rw-r--r--youtube_dl/extractor/comcarcoff.py57
-rw-r--r--youtube_dl/extractor/comedycentral.py9
-rw-r--r--youtube_dl/extractor/common.py117
-rw-r--r--youtube_dl/extractor/commonmistakes.py29
-rw-r--r--youtube_dl/extractor/condenast.py6
-rw-r--r--youtube_dl/extractor/crunchyroll.py18
-rw-r--r--youtube_dl/extractor/cspan.py1
-rw-r--r--youtube_dl/extractor/dailymotion.py10
-rw-r--r--youtube_dl/extractor/daum.py4
-rw-r--r--youtube_dl/extractor/dbtv.py3
-rw-r--r--youtube_dl/extractor/discovery.py46
-rw-r--r--youtube_dl/extractor/dvtv.py125
-rw-r--r--youtube_dl/extractor/ebaumsworld.py5
-rw-r--r--youtube_dl/extractor/echomsk.py46
-rw-r--r--youtube_dl/extractor/ehow.py11
-rw-r--r--youtube_dl/extractor/eighttracks.py2
-rw-r--r--youtube_dl/extractor/ellentv.py35
-rw-r--r--youtube_dl/extractor/elpais.py6
-rw-r--r--youtube_dl/extractor/engadget.py8
-rw-r--r--youtube_dl/extractor/eroprofile.py45
-rw-r--r--youtube_dl/extractor/escapist.py5
-rw-r--r--youtube_dl/extractor/everyonesmixtape.py4
-rw-r--r--youtube_dl/extractor/extremetube.py8
-rw-r--r--youtube_dl/extractor/facebook.py12
-rw-r--r--youtube_dl/extractor/fc2.py13
-rw-r--r--youtube_dl/extractor/firedrive.py11
-rw-r--r--youtube_dl/extractor/fivemin.py15
-rw-r--r--youtube_dl/extractor/fktv.py21
-rw-r--r--youtube_dl/extractor/fourtube.py14
-rw-r--r--youtube_dl/extractor/foxgay.py48
-rw-r--r--youtube_dl/extractor/foxnews.py94
-rw-r--r--youtube_dl/extractor/franceculture.py2
-rw-r--r--youtube_dl/extractor/francetv.py10
-rw-r--r--youtube_dl/extractor/gameone.py63
-rw-r--r--youtube_dl/extractor/gamespot.py4
-rw-r--r--youtube_dl/extractor/gdcvault.py5
-rw-r--r--youtube_dl/extractor/generic.py47
-rw-r--r--youtube_dl/extractor/giantbomb.py81
-rw-r--r--youtube_dl/extractor/giga.py101
-rw-r--r--youtube_dl/extractor/goldenmoustache.py9
-rw-r--r--youtube_dl/extractor/golem.py4
-rw-r--r--youtube_dl/extractor/googlesearch.py2
-rw-r--r--youtube_dl/extractor/gorillavid.py8
-rw-r--r--youtube_dl/extractor/goshgay.py51
-rw-r--r--youtube_dl/extractor/groupon.py50
-rw-r--r--youtube_dl/extractor/hellporno.py71
-rw-r--r--youtube_dl/extractor/helsinki.py37
-rw-r--r--youtube_dl/extractor/hitbox.py166
-rw-r--r--youtube_dl/extractor/hostingbulk.py8
-rw-r--r--youtube_dl/extractor/howstuffworks.py127
-rw-r--r--youtube_dl/extractor/huffpost.py20
-rw-r--r--youtube_dl/extractor/hypem.py11
-rw-r--r--youtube_dl/extractor/imdb.py3
-rw-r--r--youtube_dl/extractor/infoq.py7
-rw-r--r--youtube_dl/extractor/internetvideoarchive.py6
-rw-r--r--youtube_dl/extractor/iprima.py4
-rw-r--r--youtube_dl/extractor/ivi.py4
-rw-r--r--youtube_dl/extractor/keek.py30
-rw-r--r--youtube_dl/extractor/keezmovies.py7
-rw-r--r--youtube_dl/extractor/khanacademy.py4
-rw-r--r--youtube_dl/extractor/kontrtube.py23
-rw-r--r--youtube_dl/extractor/livestream.py4
-rw-r--r--youtube_dl/extractor/lrt.py6
-rw-r--r--youtube_dl/extractor/lynda.py6
-rw-r--r--youtube_dl/extractor/malemotion.py38
-rw-r--r--youtube_dl/extractor/metacafe.py4
-rw-r--r--youtube_dl/extractor/minhateca.py72
-rw-r--r--youtube_dl/extractor/mit.py19
-rw-r--r--youtube_dl/extractor/mitele.py13
-rw-r--r--youtube_dl/extractor/mixcloud.py6
-rw-r--r--youtube_dl/extractor/moevideo.py9
-rw-r--r--youtube_dl/extractor/mofosex.py6
-rw-r--r--youtube_dl/extractor/moniker.py7
-rw-r--r--youtube_dl/extractor/mooshare.py14
-rw-r--r--youtube_dl/extractor/motorsport.py64
-rw-r--r--youtube_dl/extractor/movieclips.py4
-rw-r--r--youtube_dl/extractor/mtv.py4
-rw-r--r--youtube_dl/extractor/myspace.py1
-rw-r--r--youtube_dl/extractor/myspass.py5
-rw-r--r--youtube_dl/extractor/myvidster.py29
-rw-r--r--youtube_dl/extractor/naver.py8
-rw-r--r--youtube_dl/extractor/nba.py17
-rw-r--r--youtube_dl/extractor/nbc.py42
-rw-r--r--youtube_dl/extractor/nerdcubed.py35
-rw-r--r--youtube_dl/extractor/netzkino.py86
-rw-r--r--youtube_dl/extractor/nfb.py14
-rw-r--r--youtube_dl/extractor/nfl.py4
-rw-r--r--youtube_dl/extractor/nhl.py6
-rw-r--r--youtube_dl/extractor/niconico.py10
-rw-r--r--youtube_dl/extractor/ninegag.py6
-rw-r--r--youtube_dl/extractor/noco.py10
-rw-r--r--youtube_dl/extractor/normalboots.py6
-rw-r--r--youtube_dl/extractor/nosvideo.py7
-rw-r--r--youtube_dl/extractor/novamov.py4
-rw-r--r--youtube_dl/extractor/nowvideo.py2
-rw-r--r--youtube_dl/extractor/npo.py88
-rw-r--r--youtube_dl/extractor/nrk.py130
-rw-r--r--youtube_dl/extractor/ntv.py2
-rw-r--r--youtube_dl/extractor/nuvid.py9
-rw-r--r--youtube_dl/extractor/ooyala.py2
-rw-r--r--youtube_dl/extractor/openfilm.py70
-rw-r--r--youtube_dl/extractor/orf.py55
-rw-r--r--youtube_dl/extractor/pbs.py16
-rw-r--r--youtube_dl/extractor/photobucket.py5
-rw-r--r--youtube_dl/extractor/played.py8
-rw-r--r--youtube_dl/extractor/playfm.py4
-rw-r--r--youtube_dl/extractor/playvid.py22
-rw-r--r--youtube_dl/extractor/pornhd.py20
-rw-r--r--youtube_dl/extractor/pornhub.py8
-rw-r--r--youtube_dl/extractor/pornotube.py102
-rw-r--r--youtube_dl/extractor/promptfile.py8
-rw-r--r--youtube_dl/extractor/prosiebensat1.py62
-rw-r--r--youtube_dl/extractor/quickvid.py4
-rw-r--r--youtube_dl/extractor/radiobremen.py63
-rw-r--r--youtube_dl/extractor/radiode.py55
-rw-r--r--youtube_dl/extractor/rai.py4
-rw-r--r--youtube_dl/extractor/restudy.py40
-rw-r--r--youtube_dl/extractor/rtlnl.py2
-rw-r--r--youtube_dl/extractor/rtp.py60
-rw-r--r--youtube_dl/extractor/rts.py4
-rw-r--r--youtube_dl/extractor/rutube.py44
-rw-r--r--youtube_dl/extractor/screencast.py11
-rw-r--r--youtube_dl/extractor/screencastomatic.py49
-rw-r--r--youtube_dl/extractor/screenwavemedia.py (renamed from youtube_dl/extractor/cinemassacre.py)166
-rw-r--r--youtube_dl/extractor/sexykarma.py5
-rw-r--r--youtube_dl/extractor/shared.py38
-rw-r--r--youtube_dl/extractor/sharesix.py4
-rw-r--r--youtube_dl/extractor/sina.py2
-rw-r--r--youtube_dl/extractor/slideshare.py8
-rw-r--r--youtube_dl/extractor/smotri.py43
-rw-r--r--youtube_dl/extractor/sockshare.py13
-rw-r--r--youtube_dl/extractor/sohu.py96
-rw-r--r--youtube_dl/extractor/soulanime.py80
-rw-r--r--youtube_dl/extractor/soundcloud.py5
-rw-r--r--youtube_dl/extractor/spankwire.py8
-rw-r--r--youtube_dl/extractor/sportdeutschland.py7
-rw-r--r--youtube_dl/extractor/streamcloud.py6
-rw-r--r--youtube_dl/extractor/streamcz.py78
-rw-r--r--youtube_dl/extractor/sunporno.py18
-rw-r--r--youtube_dl/extractor/tagesschau.py111
-rw-r--r--youtube_dl/extractor/tapely.py6
-rw-r--r--youtube_dl/extractor/teachertube.py4
-rw-r--r--youtube_dl/extractor/ted.py6
-rw-r--r--youtube_dl/extractor/telecinco.py2
-rw-r--r--youtube_dl/extractor/teletask.py53
-rw-r--r--youtube_dl/extractor/tenplay.py1
-rw-r--r--youtube_dl/extractor/tf1.py24
-rw-r--r--youtube_dl/extractor/theplatform.py23
-rw-r--r--youtube_dl/extractor/tlc.py2
-rw-r--r--youtube_dl/extractor/tmz.py2
-rw-r--r--youtube_dl/extractor/tnaflix.py6
-rw-r--r--youtube_dl/extractor/tube8.py4
-rw-r--r--youtube_dl/extractor/tudou.py19
-rw-r--r--youtube_dl/extractor/tunein.py15
-rw-r--r--youtube_dl/extractor/tutv.py8
-rw-r--r--youtube_dl/extractor/tvigle.py31
-rw-r--r--youtube_dl/extractor/tvplay.py5
-rw-r--r--youtube_dl/extractor/twitch.py60
-rw-r--r--youtube_dl/extractor/udemy.py9
-rw-r--r--youtube_dl/extractor/urort.py47
-rw-r--r--youtube_dl/extractor/ustream.py2
-rw-r--r--youtube_dl/extractor/vbox7.py12
-rw-r--r--youtube_dl/extractor/veehd.py9
-rw-r--r--youtube_dl/extractor/veoh.py4
-rw-r--r--youtube_dl/extractor/vevo.py4
-rw-r--r--youtube_dl/extractor/videodetective.py9
-rw-r--r--youtube_dl/extractor/videomega.py10
-rw-r--r--youtube_dl/extractor/vier.py118
-rw-r--r--youtube_dl/extractor/viki.py4
-rw-r--r--youtube_dl/extractor/vimple.py23
-rw-r--r--youtube_dl/extractor/vine.py54
-rw-r--r--youtube_dl/extractor/vk.py19
-rw-r--r--youtube_dl/extractor/vodlocker.py6
-rw-r--r--youtube_dl/extractor/vube.py4
-rw-r--r--youtube_dl/extractor/vuclip.py9
-rw-r--r--youtube_dl/extractor/washingtonpost.py11
-rw-r--r--youtube_dl/extractor/wdr.py36
-rw-r--r--youtube_dl/extractor/webofstories.py102
-rw-r--r--youtube_dl/extractor/wistia.py8
-rw-r--r--youtube_dl/extractor/xbef.py9
-rw-r--r--youtube_dl/extractor/xboxclips.py26
-rw-r--r--youtube_dl/extractor/xhamster.py11
-rw-r--r--youtube_dl/extractor/xminus.py9
-rw-r--r--youtube_dl/extractor/xnxx.py9
-rw-r--r--youtube_dl/extractor/xtube.py70
-rw-r--r--youtube_dl/extractor/xvideos.py22
-rw-r--r--youtube_dl/extractor/xxxymovies.py81
-rw-r--r--youtube_dl/extractor/yahoo.py59
-rw-r--r--youtube_dl/extractor/yesjapan.py62
-rw-r--r--youtube_dl/extractor/ynet.py2
-rw-r--r--youtube_dl/extractor/youporn.py9
-rw-r--r--youtube_dl/extractor/youtube.py359
-rw-r--r--youtube_dl/extractor/zdf.py52
-rw-r--r--youtube_dl/options.py40
-rw-r--r--youtube_dl/postprocessor/__init__.py9
-rw-r--r--youtube_dl/postprocessor/execafterdownload.py2
-rw-r--r--youtube_dl/postprocessor/ffmpeg.py21
-rw-r--r--youtube_dl/swfinterp.py2
-rw-r--r--youtube_dl/update.py14
-rw-r--r--youtube_dl/utils.py130
-rw-r--r--youtube_dl/version.py2
239 files changed, 5795 insertions, 1983 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index f89ac4e1d..61675d8ec 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -7,6 +7,7 @@ import collections
import datetime
import errno
import io
+import itertools
import json
import locale
import os
@@ -26,6 +27,7 @@ from .compat import (
compat_cookiejar,
compat_expanduser,
compat_http_client,
+ compat_kwargs,
compat_str,
compat_urllib_error,
compat_urllib_request,
@@ -61,12 +63,17 @@ from .utils import (
YoutubeDLHandler,
prepend_extension,
args_to_str,
+ age_restricted,
)
from .cache import Cache
from .extractor import get_info_extractor, gen_extractors
from .downloader import get_suitable_downloader
from .downloader.rtmp import rtmpdump_version
-from .postprocessor import FFmpegMergerPP, FFmpegPostProcessor
+from .postprocessor import (
+ FFmpegMergerPP,
+ FFmpegPostProcessor,
+ get_postprocessor,
+)
from .version import __version__
@@ -115,7 +122,7 @@ class YoutubeDL(object):
dump_single_json: Force printing the info_dict of the whole playlist
(or video) as a single JSON line.
simulate: Do not download the video files.
- format: Video format code.
+ format: Video format code. See options.py for more information.
format_limit: Highest quality format to try.
outtmpl: Template for output names.
restrictfilenames: Do not allow "&" and spaces in file names
@@ -123,6 +130,7 @@ class YoutubeDL(object):
nooverwrites: Prevent overwriting files.
playliststart: Playlist item to start at.
playlistend: Playlist item to end at.
+ playlistreverse: Download playlist items in reverse order.
matchtitle: Download only matching titles.
rejecttitle: Reject downloads for matching titles.
logger: Log messages to a logging.Logger instance.
@@ -174,6 +182,29 @@ class YoutubeDL(object):
extract_flat: Do not resolve URLs, return the immediate result.
Pass in 'in_playlist' to only show this behavior for
playlist items.
+ postprocessors: A list of dictionaries, each with an entry
+ * key: The name of the postprocessor. See
+ youtube_dl/postprocessor/__init__.py for a list.
+ as well as any further keyword arguments for the
+ postprocessor.
+ progress_hooks: A list of functions that get called on download
+ progress, with a dictionary with the entries
+ * filename: The final filename
+ * status: One of "downloading" and "finished"
+
+ The dict may also have some of the following entries:
+
+ * downloaded_bytes: Bytes on disk
+ * total_bytes: Size of the whole file, None if unknown
+ * tmpfilename: The filename we're currently writing to
+ * eta: The estimated time in seconds, None if unknown
+ * speed: The download speed in bytes/second, None if
+ unknown
+
+ Progress hooks are guaranteed to be called at least once
+ (with status "finished") if the download is successful.
+ merge_output_format: Extension to use when merging formats.
+
The following parameters are not used by YoutubeDL itself, they are used by
the FileDownloader:
@@ -254,6 +285,16 @@ class YoutubeDL(object):
self.print_debug_header()
self.add_default_info_extractors()
+ for pp_def_raw in self.params.get('postprocessors', []):
+ pp_class = get_postprocessor(pp_def_raw['key'])
+ pp_def = dict(pp_def_raw)
+ del pp_def['key']
+ pp = pp_class(self, **compat_kwargs(pp_def))
+ self.add_post_processor(pp)
+
+ for ph in self.params.get('progress_hooks', []):
+ self.add_progress_hook(ph)
+
def warn_if_short_id(self, argv):
# short YouTube ID starting with dash?
idxs = [
@@ -511,13 +552,8 @@ class YoutubeDL(object):
max_views = self.params.get('max_views')
if max_views is not None and view_count > max_views:
return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
- age_limit = self.params.get('age_limit')
- if age_limit is not None:
- actual_age_limit = info_dict.get('age_limit')
- if actual_age_limit is None:
- actual_age_limit = 0
- if age_limit < actual_age_limit:
- return 'Skipping "' + title + '" because it is age restricted'
+ if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
+ return 'Skipping "%s" because it is age restricted' % title
if self.in_download_archive(info_dict):
return '%s has already been recorded in archive' % video_title
return None
@@ -621,23 +657,15 @@ class YoutubeDL(object):
ie_result['url'], ie_key=ie_result.get('ie_key'),
extra_info=extra_info, download=False, process=False)
- def make_result(embedded_info):
- new_result = ie_result.copy()
- for f in ('_type', 'url', 'ext', 'player_url', 'formats',
- 'entries', 'ie_key', 'duration',
- 'subtitles', 'annotations', 'format',
- 'thumbnail', 'thumbnails'):
- if f in new_result:
- del new_result[f]
- if f in embedded_info:
- new_result[f] = embedded_info[f]
- return new_result
- new_result = make_result(info)
+ force_properties = dict(
+ (k, v) for k, v in ie_result.items() if v is not None)
+ for f in ('_type', 'url'):
+ if f in force_properties:
+ del force_properties[f]
+ new_result = info.copy()
+ new_result.update(force_properties)
assert new_result.get('_type') != 'url_transparent'
- if new_result.get('_type') == 'compat_list':
- new_result['entries'] = [
- make_result(e) for e in new_result['entries']]
return self.process_ie_result(
new_result, download=download, extra_info=extra_info)
@@ -654,24 +682,34 @@ class YoutubeDL(object):
if playlistend == -1:
playlistend = None
- if isinstance(ie_result['entries'], list):
- n_all_entries = len(ie_result['entries'])
- entries = ie_result['entries'][playliststart:playlistend]
+ ie_entries = ie_result['entries']
+ if isinstance(ie_entries, list):
+ n_all_entries = len(ie_entries)
+ entries = ie_entries[playliststart:playlistend]
n_entries = len(entries)
self.to_screen(
"[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
(ie_result['extractor'], playlist, n_all_entries, n_entries))
- else:
- assert isinstance(ie_result['entries'], PagedList)
- entries = ie_result['entries'].getslice(
+ elif isinstance(ie_entries, PagedList):
+ entries = ie_entries.getslice(
playliststart, playlistend)
n_entries = len(entries)
self.to_screen(
"[%s] playlist %s: Downloading %d videos" %
(ie_result['extractor'], playlist, n_entries))
+ else: # iterable
+ entries = list(itertools.islice(
+ ie_entries, playliststart, playlistend))
+ n_entries = len(entries)
+ self.to_screen(
+ "[%s] playlist %s: Downloading %d videos" %
+ (ie_result['extractor'], playlist, n_entries))
+
+ if self.params.get('playlistreverse', False):
+ entries = entries[::-1]
for i, entry in enumerate(entries, 1):
- self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))
+ self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
extra = {
'n_entries': n_entries,
'playlist': playlist,
@@ -749,7 +787,7 @@ class YoutubeDL(object):
if video_formats:
return video_formats[0]
else:
- extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a']
+ extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
if format_spec in extensions:
filter_f = lambda f: f['ext'] == format_spec
else:
@@ -872,10 +910,23 @@ class YoutubeDL(object):
'contain the video, try using '
'"-f %s+%s"' % (format_2, format_1))
return
+ output_ext = (
+ formats_info[0]['ext']
+ if self.params.get('merge_output_format') is None
+ else self.params['merge_output_format'])
selected_format = {
'requested_formats': formats_info,
'format': rf,
'ext': formats_info[0]['ext'],
+ 'width': formats_info[0].get('width'),
+ 'height': formats_info[0].get('height'),
+ 'resolution': formats_info[0].get('resolution'),
+ 'fps': formats_info[0].get('fps'),
+ 'vcodec': formats_info[0].get('vcodec'),
+ 'vbr': formats_info[0].get('vbr'),
+ 'acodec': formats_info[1].get('acodec'),
+ 'abr': formats_info[1].get('abr'),
+ 'ext': output_ext,
}
else:
selected_format = None
@@ -934,8 +985,12 @@ class YoutubeDL(object):
if self.params.get('forceid', False):
self.to_stdout(info_dict['id'])
if self.params.get('forceurl', False):
- # For RTMP URLs, also include the playpath
- self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
+ if info_dict.get('requested_formats') is not None:
+ for f in info_dict['requested_formats']:
+ self.to_stdout(f['url'] + f.get('play_path', ''))
+ else:
+ # For RTMP URLs, also include the playpath
+ self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
self.to_stdout(info_dict['thumbnail'])
if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
@@ -971,13 +1026,13 @@ class YoutubeDL(object):
descfn = filename + '.description'
if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
self.to_screen('[info] Video description is already present')
+ elif info_dict.get('description') is None:
+ self.report_warning('There\'s no description to write.')
else:
try:
self.to_screen('[info] Writing video description to: ' + descfn)
with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
descfile.write(info_dict['description'])
- except (KeyError, TypeError):
- self.report_warning('There\'s no description to write.')
except (OSError, IOError):
self.report_error('Cannot write description file ' + descfn)
return
@@ -1104,8 +1159,7 @@ class YoutubeDL(object):
except (PostProcessingError) as err:
self.report_error('postprocessing: %s' % str(err))
return
-
- self.record_download_archive(info_dict)
+ self.record_download_archive(info_dict)
def download(self, url_list):
"""Download a given list of URLs."""
@@ -1289,7 +1343,9 @@ class YoutubeDL(object):
formats = info_dict.get('formats', [info_dict])
idlen = max(len('format code'),
max(len(f['format_id']) for f in formats))
- formats_s = [line(f, idlen) for f in formats]
+ formats_s = [
+ line(f, idlen) for f in formats
+ if f.get('preference') is None or f['preference'] >= -1000]
if len(formats) > 1:
formats_s[0] += (' ' if self._format_note(formats[0]) else '') + '(worst)'
formats_s[-1] += (' ' if self._format_note(formats[-1]) else '') + '(best)'
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index 77b3384a0..8e7b74466 100644
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -38,18 +38,8 @@ from .update import update_self
from .downloader import (
FileDownloader,
)
-from .extractor import gen_extractors
+from .extractor import gen_extractors, list_extractors
from .YoutubeDL import YoutubeDL
-from .postprocessor import (
- AtomicParsleyPP,
- FFmpegAudioFixPP,
- FFmpegMetadataPP,
- FFmpegVideoConvertor,
- FFmpegExtractAudioPP,
- FFmpegEmbedSubtitlePP,
- XAttrMetadataPP,
- ExecAfterDownloadPP,
-)
def _real_main(argv=None):
@@ -105,24 +95,22 @@ def _real_main(argv=None):
_enc = preferredencoding()
all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls]
- extractors = gen_extractors()
-
if opts.list_extractors:
- for ie in sorted(extractors, key=lambda ie: ie.IE_NAME.lower()):
+ for ie in list_extractors(opts.age_limit):
compat_print(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie._WORKING else ''))
matchedUrls = [url for url in all_urls if ie.suitable(url)]
for mu in matchedUrls:
compat_print(' ' + mu)
sys.exit(0)
if opts.list_extractor_descriptions:
- for ie in sorted(extractors, key=lambda ie: ie.IE_NAME.lower()):
+ for ie in list_extractors(opts.age_limit):
if not ie._WORKING:
continue
desc = getattr(ie, 'IE_DESC', ie.IE_NAME)
if desc is False:
continue
if hasattr(ie, 'SEARCH_KEY'):
- _SEARCHES = ('cute kittens', 'slithering pythons', 'falling cat', 'angry poodle', 'purple fish', 'running tortoise', 'sleeping bunny')
+ _SEARCHES = ('cute kittens', 'slithering pythons', 'falling cat', 'angry poodle', 'purple fish', 'running tortoise', 'sleeping bunny', 'burping cow')
_COUNTS = ('', '5', '10', 'all')
desc += ' (Example: "%s%s:%s" )' % (ie.SEARCH_KEY, random.choice(_COUNTS), random.choice(_SEARCHES))
compat_print(desc)
@@ -178,6 +166,7 @@ def _real_main(argv=None):
if opts.recodevideo is not None:
if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv']:
parser.error('invalid video recode format specified')
+
if opts.date is not None:
date = DateRange.day(opts.date)
else:
@@ -209,16 +198,54 @@ def _real_main(argv=None):
' file! Use "{0}.%(ext)s" instead of "{0}" as the output'
' template'.format(outtmpl))
- any_printing = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json
+ any_getting = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json
+ any_printing = opts.print_json
download_archive_fn = compat_expanduser(opts.download_archive) if opts.download_archive is not None else opts.download_archive
+ # PostProcessors
+ postprocessors = []
+ # Add the metadata pp first, the other pps will copy it
+ if opts.addmetadata:
+ postprocessors.append({'key': 'FFmpegMetadata'})
+ if opts.extractaudio:
+ postprocessors.append({
+ 'key': 'FFmpegExtractAudio',
+ 'preferredcodec': opts.audioformat,
+ 'preferredquality': opts.audioquality,
+ 'nopostoverwrites': opts.nopostoverwrites,
+ })
+ if opts.recodevideo:
+ postprocessors.append({
+ 'key': 'FFmpegVideoConvertor',
+ 'preferedformat': opts.recodevideo,
+ })
+ if opts.embedsubtitles:
+ postprocessors.append({
+ 'key': 'FFmpegEmbedSubtitle',
+ 'subtitlesformat': opts.subtitlesformat,
+ })
+ if opts.xattrs:
+ postprocessors.append({'key': 'XAttrMetadata'})
+ if opts.embedthumbnail:
+ if not opts.addmetadata:
+ postprocessors.append({'key': 'FFmpegAudioFix'})
+ postprocessors.append({'key': 'AtomicParsley'})
+ # Please keep ExecAfterDownload towards the bottom as it allows the user to modify the final file in any way.
+ # So if the user is able to remove the file before your postprocessor runs it might cause a few problems.
+ if opts.exec_cmd:
+ postprocessors.append({
+ 'key': 'ExecAfterDownload',
+ 'verboseOutput': opts.verbose,
+ 'exec_cmd': opts.exec_cmd,
+ })
+
ydl_opts = {
'usenetrc': opts.usenetrc,
'username': opts.username,
'password': opts.password,
'twofactor': opts.twofactor,
'videopassword': opts.videopassword,
- 'quiet': (opts.quiet or any_printing),
+ 'quiet': (opts.quiet or any_getting or any_printing),
'no_warnings': opts.no_warnings,
'forceurl': opts.geturl,
'forcetitle': opts.gettitle,
@@ -228,9 +255,9 @@ def _real_main(argv=None):
'forceduration': opts.getduration,
'forcefilename': opts.getfilename,
'forceformat': opts.getformat,
- 'forcejson': opts.dumpjson,
+ 'forcejson': opts.dumpjson or opts.print_json,
'dump_single_json': opts.dump_single_json,
- 'simulate': opts.simulate or any_printing,
+ 'simulate': opts.simulate or any_getting,
'skip_download': opts.skip_download,
'format': opts.format,
'format_limit': opts.format_limit,
@@ -249,6 +276,7 @@ def _real_main(argv=None):
'progress_with_newline': opts.progress_with_newline,
'playliststart': opts.playliststart,
'playlistend': opts.playlistend,
+ 'playlistreverse': opts.playlist_reverse,
'noplaylist': opts.noplaylist,
'logtostderr': opts.outtmpl == '-',
'consoletitle': opts.consoletitle,
@@ -296,32 +324,11 @@ def _real_main(argv=None):
'encoding': opts.encoding,
'exec_cmd': opts.exec_cmd,
'extract_flat': opts.extract_flat,
+ 'merge_output_format': opts.merge_output_format,
+ 'postprocessors': postprocessors,
}
with YoutubeDL(ydl_opts) as ydl:
- # PostProcessors
- # Add the metadata pp first, the other pps will copy it
- if opts.addmetadata:
- ydl.add_post_processor(FFmpegMetadataPP())
- if opts.extractaudio:
- ydl.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, nopostoverwrites=opts.nopostoverwrites))
- if opts.recodevideo:
- ydl.add_post_processor(FFmpegVideoConvertor(preferedformat=opts.recodevideo))
- if opts.embedsubtitles:
- ydl.add_post_processor(FFmpegEmbedSubtitlePP(subtitlesformat=opts.subtitlesformat))
- if opts.xattrs:
- ydl.add_post_processor(XAttrMetadataPP())
- if opts.embedthumbnail:
- if not opts.addmetadata:
- ydl.add_post_processor(FFmpegAudioFixPP())
- ydl.add_post_processor(AtomicParsleyPP())
-
- # Please keep ExecAfterDownload towards the bottom as it allows the user to modify the final file in any way.
- # So if the user is able to remove the file before your postprocessor runs it might cause a few problems.
- if opts.exec_cmd:
- ydl.add_post_processor(ExecAfterDownloadPP(
- verboseOutput=opts.verbose, exec_cmd=opts.exec_cmd))
-
# Update version
if opts.update_self:
update_self(ydl.to_screen, opts.verbose)
@@ -359,3 +366,5 @@ def main(argv=None):
sys.exit('ERROR: fixed output name but more than one file to download')
except KeyboardInterrupt:
sys.exit('\nERROR: Interrupted by user')
+
+__all__ = ['main', 'YoutubeDL', 'gen_extractors', 'list_extractors']
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
index 27596687d..46d438846 100644
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -247,7 +247,7 @@ else:
userhome = compat_getenv('HOME')
elif 'USERPROFILE' in os.environ:
userhome = compat_getenv('USERPROFILE')
- elif not 'HOMEPATH' in os.environ:
+ elif 'HOMEPATH' not in os.environ:
return path
else:
try:
@@ -297,7 +297,9 @@ else:
# Old 2.6 and 2.7 releases require kwargs to be bytes
try:
- (lambda x: x)(**{'x': 0})
+ def _testfunc(x):
+ pass
+ _testfunc(**{'x': 0})
except TypeError:
def compat_kwargs(kwargs):
return dict((bytes(k), v) for k, v in kwargs.items())
diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py
index c0af50c59..584bde732 100644
--- a/youtube_dl/downloader/common.py
+++ b/youtube_dl/downloader/common.py
@@ -5,8 +5,8 @@ import re
import sys
import time
+from ..compat import compat_str
from ..utils import (
- compat_str,
encodeFilename,
format_bytes,
timeconvert,
@@ -80,6 +80,8 @@ class FileDownloader(object):
def calc_eta(start, now, total, current):
if total is None:
return None
+ if now is None:
+ now = time.time()
dif = now - start
if current == 0 or dif < 0.001: # One millisecond
return None
@@ -146,18 +148,19 @@ class FileDownloader(object):
def report_error(self, *args, **kargs):
self.ydl.report_error(*args, **kargs)
- def slow_down(self, start_time, byte_counter):
+ def slow_down(self, start_time, now, byte_counter):
"""Sleep if the download speed is over the rate limit."""
rate_limit = self.params.get('ratelimit', None)
if rate_limit is None or byte_counter == 0:
return
- now = time.time()
+ if now is None:
+ now = time.time()
elapsed = now - start_time
if elapsed <= 0.0:
return
speed = float(byte_counter) / elapsed
if speed > rate_limit:
- time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
+ time.sleep(max((byte_counter // rate_limit) - elapsed, 0))
def temp_name(self, filename):
"""Returns a temporary filename for the given filename."""
@@ -282,7 +285,7 @@ class FileDownloader(object):
Return True on success and False otherwise
"""
# Check file already present
- if self.params.get('continuedl', False) and os.path.isfile(encodeFilename(filename)) and not self.params.get('nopart', False):
+ if filename != '-' and self.params.get('continuedl', False) and os.path.isfile(encodeFilename(filename)) and not self.params.get('nopart', False):
self.report_file_already_downloaded(filename)
self._hook_progress({
'filename': filename,
@@ -302,19 +305,6 @@ class FileDownloader(object):
ph(status)
def add_progress_hook(self, ph):
- """ ph gets called on download progress, with a dictionary with the entries
- * filename: The final filename
- * status: One of "downloading" and "finished"
-
- It can also have some of the following entries:
-
- * downloaded_bytes: Bytes on disks
- * total_bytes: Total bytes, None if unknown
- * tmpfilename: The filename we're currently writing to
- * eta: The estimated time in seconds, None if unknown
- * speed: The download speed in bytes/second, None if unknown
-
- Hooks are guaranteed to be called at least once (with status "finished")
- if the download is successful.
- """
+ # See YoutubeDL.py (search for progress_hooks) for a description of
+ # this interface
self._progress_hooks.append(ph)
diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py
index 7cd22c504..c460c167a 100644
--- a/youtube_dl/downloader/f4m.py
+++ b/youtube_dl/downloader/f4m.py
@@ -9,10 +9,12 @@ import xml.etree.ElementTree as etree
from .common import FileDownloader
from .http import HttpFD
+from ..compat import (
+ compat_urlparse,
+)
from ..utils import (
struct_pack,
struct_unpack,
- compat_urlparse,
format_bytes,
encodeFilename,
sanitize_open,
@@ -185,24 +187,34 @@ def build_fragments_list(boot_info):
return res
-def write_flv_header(stream, metadata):
- """Writes the FLV header and the metadata to stream"""
+def write_unsigned_int(stream, val):
+ stream.write(struct_pack('!I', val))
+
+
+def write_unsigned_int_24(stream, val):
+ stream.write(struct_pack('!I', val)[1:])
+
+
+def write_flv_header(stream):
+ """Writes the FLV header to stream"""
# FLV header
stream.write(b'FLV\x01')
stream.write(b'\x05')
stream.write(b'\x00\x00\x00\x09')
- # FLV File body
stream.write(b'\x00\x00\x00\x00')
- # FLVTAG
- # Script data
- stream.write(b'\x12')
- # Size of the metadata with 3 bytes
- stream.write(struct_pack('!L', len(metadata))[1:])
- stream.write(b'\x00\x00\x00\x00\x00\x00\x00')
- stream.write(metadata)
- # Magic numbers extracted from the output files produced by AdobeHDS.php
- #(https://github.com/K-S-V/Scripts)
- stream.write(b'\x00\x00\x01\x73')
+
+
+def write_metadata_tag(stream, metadata):
+ """Writes optional metadata tag to stream"""
+ SCRIPT_TAG = b'\x12'
+ FLV_TAG_HEADER_LEN = 11
+
+ if metadata:
+ stream.write(SCRIPT_TAG)
+ write_unsigned_int_24(stream, len(metadata))
+ stream.write(b'\x00\x00\x00\x00\x00\x00\x00')
+ stream.write(metadata)
+ write_unsigned_int(stream, FLV_TAG_HEADER_LEN + len(metadata))
def _add_ns(prop):
@@ -231,6 +243,7 @@ class F4mFD(FileDownloader):
'continuedl': True,
'quiet': True,
'noprogress': True,
+ 'ratelimit': self.params.get('ratelimit', None),
'test': self.params.get('test', False),
}
)
@@ -253,7 +266,11 @@ class F4mFD(FileDownloader):
bootstrap = self.ydl.urlopen(bootstrap_url).read()
else:
bootstrap = base64.b64decode(bootstrap_node.text)
- metadata = base64.b64decode(media.find(_add_ns('metadata')).text)
+ metadata_node = media.find(_add_ns('metadata'))
+ if metadata_node is not None:
+ metadata = base64.b64decode(metadata_node.text)
+ else:
+ metadata = None
boot_info = read_bootstrap_info(bootstrap)
fragments_list = build_fragments_list(boot_info)
@@ -266,7 +283,8 @@ class F4mFD(FileDownloader):
tmpfilename = self.temp_name(filename)
(dest_stream, tmpfilename) = sanitize_open(tmpfilename, 'wb')
- write_flv_header(dest_stream, metadata)
+ write_flv_header(dest_stream)
+ write_metadata_tag(dest_stream, metadata)
# This dict stores the download progress, it's updated by the progress
# hook
diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py
index 954beffd5..aa58b52ab 100644
--- a/youtube_dl/downloader/hls.py
+++ b/youtube_dl/downloader/hls.py
@@ -4,11 +4,13 @@ import os
import re
import subprocess
+from ..postprocessor.ffmpeg import FFmpegPostProcessor
from .common import FileDownloader
-from ..utils import (
+from ..compat import (
compat_urlparse,
compat_urllib_request,
- check_executable,
+)
+from ..utils import (
encodeFilename,
)
@@ -24,12 +26,12 @@ class HlsFD(FileDownloader):
'-bsf:a', 'aac_adtstoasc',
encodeFilename(tmpfilename, for_subprocess=True)]
- for program in ['avconv', 'ffmpeg']:
- if check_executable(program, ['-version']):
- break
- else:
+ ffpp = FFmpegPostProcessor(downloader=self)
+ program = ffpp._executable
+ if program is None:
self.report_error('m3u8 download detected but ffmpeg or avconv could not be found. Please install one.')
return False
+ ffpp.check_version()
cmd = [program] + args
retval = subprocess.call(cmd)
diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py
index 8491cee8a..e68f20c9f 100644
--- a/youtube_dl/downloader/http.py
+++ b/youtube_dl/downloader/http.py
@@ -4,11 +4,12 @@ import os
import time
from .common import FileDownloader
-from ..utils import (
+from ..compat import (
compat_urllib_request,
compat_urllib_error,
+)
+from ..utils import (
ContentTooShortError,
-
encodeFilename,
sanitize_open,
format_bytes,
@@ -136,16 +137,21 @@ class HttpFD(FileDownloader):
byte_counter = 0 + resume_len
block_size = self.params.get('buffersize', 1024)
start = time.time()
+
+ # measure time over whole while-loop, so slow_down() and best_block_size() work together properly
+ now = None # needed for slow_down() in the first loop run
+ before = start # start measuring
while True:
+
# Download and write
- before = time.time()
data_block = data.read(block_size if not is_test else min(block_size, data_len - byte_counter))
- after = time.time()
+ byte_counter += len(data_block)
+
+ # exit loop when download is finished
if len(data_block) == 0:
break
- byte_counter += len(data_block)
- # Open file just in time
+ # Open destination file just in time
if stream is None:
try:
(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
@@ -161,11 +167,22 @@ class HttpFD(FileDownloader):
self.to_stderr('\n')
self.report_error('unable to write data: %s' % str(err))
return False
+
+ # Apply rate limit
+ self.slow_down(start, now, byte_counter - resume_len)
+
+ # end measuring of one loop run
+ now = time.time()
+ after = now
+
+ # Adjust block size
if not self.params.get('noresizebuffer', False):
block_size = self.best_block_size(after - before, len(data_block))
+ before = after
+
# Progress message
- speed = self.calc_speed(start, time.time(), byte_counter - resume_len)
+ speed = self.calc_speed(start, now, byte_counter - resume_len)
if data_len is None:
eta = percent = None
else:
@@ -186,9 +203,6 @@ class HttpFD(FileDownloader):
if is_test and byte_counter == data_len:
break
- # Apply rate limit
- self.slow_down(start, byte_counter - resume_len)
-
if stream is None:
self.to_stderr('\n')
self.report_error('Did not get any data blocks')
diff --git a/youtube_dl/downloader/mplayer.py b/youtube_dl/downloader/mplayer.py
index c53195da0..72cef30ea 100644
--- a/youtube_dl/downloader/mplayer.py
+++ b/youtube_dl/downloader/mplayer.py
@@ -4,8 +4,8 @@ import os
import subprocess
from .common import FileDownloader
-from ..compat import compat_subprocess_get_DEVNULL
from ..utils import (
+ check_executable,
encodeFilename,
)
@@ -20,11 +20,7 @@ class MplayerFD(FileDownloader):
'mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy',
'-dumpstream', '-dumpfile', tmpfilename, url]
# Check for mplayer first
- try:
- subprocess.call(
- ['mplayer', '-h'],
- stdout=compat_subprocess_get_DEVNULL(), stderr=subprocess.STDOUT)
- except (OSError, IOError):
+ if not check_executable('mplayer', ['-h']):
self.report_error('MMS or RTSP download detected but "%s" could not be run' % args[0])
return False
diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py
index 58ae2005c..5346cb9a0 100644
--- a/youtube_dl/downloader/rtmp.py
+++ b/youtube_dl/downloader/rtmp.py
@@ -7,9 +7,9 @@ import sys
import time
from .common import FileDownloader
+from ..compat import compat_str
from ..utils import (
check_executable,
- compat_str,
encodeFilename,
format_bytes,
get_exe_version,
@@ -185,7 +185,7 @@ class RtmpFD(FileDownloader):
cursize = os.path.getsize(encodeFilename(tmpfilename))
if prevsize == cursize and retval == RD_FAILED:
break
- # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
+ # Some rtmp streams seem to abort after ~ 99.8%. Don't complain about those
if prevsize == cursize and retval == RD_INCOMPLETE and cursize > 1024:
self.to_screen('[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
retval = RD_SUCCESS
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index b09ee303d..a8579d083 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -3,8 +3,11 @@ from __future__ import unicode_literals
from .abc import ABCIE
from .academicearth import AcademicEarthCourseIE
from .addanime import AddAnimeIE
+from .adobetv import AdobeTVIE
from .adultswim import AdultSwimIE
from .aftonbladet import AftonbladetIE
+from .aljazeera import AlJazeeraIE
+from .alphaporno import AlphaPornoIE
from .anitube import AnitubeIE
from .anysex import AnySexIE
from .aol import AolIE
@@ -22,13 +25,16 @@ from .arte import (
ArteTVDDCIE,
ArteTVEmbedIE,
)
-from .audiomack import AudiomackIE
+from .atresplayer import AtresPlayerIE
+from .audiomack import AudiomackIE, AudiomackAlbumIE
from .auengine import AUEngineIE
+from .azubu import AzubuIE
from .bambuser import BambuserIE, BambuserChannelIE
from .bandcamp import BandcampIE, BandcampAlbumIE
from .bbccouk import BBCCoUkIE
from .beeg import BeegIE
from .behindkink import BehindKinkIE
+from .bet import BetIE
from .bild import BildIE
from .bilibili import BiliBiliIE
from .blinkx import BlinkxIE
@@ -49,7 +55,7 @@ from .cbsnews import CBSNewsIE
from .ceskatelevize import CeskaTelevizeIE
from .channel9 import Channel9IE
from .chilloutzone import ChilloutzoneIE
-from .cinemassacre import CinemassacreIE
+from .cinchcast import CinchcastIE
from .clipfish import ClipfishIE
from .cliphunter import CliphunterIE
from .clipsyndicate import ClipsyndicateIE
@@ -60,9 +66,12 @@ from .cnet import CNETIE
from .cnn import (
CNNIE,
CNNBlogsIE,
+ CNNArticleIE,
)
from .collegehumor import CollegeHumorIE
from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
+from .comcarcoff import ComCarCoffIE
+from .commonmistakes import CommonMistakesIE
from .condenast import CondeNastIE
from .cracked import CrackedIE
from .criterion import CriterionIE
@@ -84,12 +93,14 @@ from .dotsub import DotsubIE
from .dreisat import DreiSatIE
from .drtuber import DrTuberIE
from .drtv import DRTVIE
+from .dvtv import DVTVIE
from .dump import DumpIE
from .defense import DefenseGouvFrIE
from .discovery import DiscoveryIE
from .divxstage import DivxStageIE
from .dropbox import DropboxIE
from .ebaumsworld import EbaumsWorldIE
+from .echomsk import EchoMskIE
from .ehow import EHowIE
from .eighttracks import EightTracksIE
from .einthusan import EinthusanIE
@@ -102,6 +113,7 @@ from .elpais import ElPaisIE
from .empflix import EMPFlixIE
from .engadget import EngadgetIE
from .eporner import EpornerIE
+from .eroprofile import EroProfileIE
from .escapist import EscapistIE
from .everyonesmixtape import EveryonesMixtapeIE
from .exfm import ExfmIE
@@ -121,6 +133,8 @@ from .fktv import (
from .flickr import FlickrIE
from .folketinget import FolketingetIE
from .fourtube import FourTubeIE
+from .foxgay import FoxgayIE
+from .foxnews import FoxNewsIE
from .franceculture import FranceCultureIE
from .franceinter import FranceInterIE
from .francetv import (
@@ -144,6 +158,8 @@ from .gamestar import GameStarIE
from .gametrailers import GametrailersIE
from .gdcvault import GDCVaultIE
from .generic import GenericIE
+from .giantbomb import GiantBombIE
+from .giga import GigaIE
from .glide import GlideIE
from .globo import GloboIE
from .godtube import GodTubeIE
@@ -154,10 +170,13 @@ from .googlesearch import GoogleSearchIE
from .gorillavid import GorillaVidIE
from .goshgay import GoshgayIE
from .grooveshark import GroovesharkIE
+from .groupon import GrouponIE
from .hark import HarkIE
from .heise import HeiseIE
+from .hellporno import HellPornoIE
from .helsinki import HelsinkiIE
from .hentaistigma import HentaiStigmaIE
+from .hitbox import HitboxIE, HitboxLiveIE
from .hornbunny import HornBunnyIE
from .hostingbulk import HostingBulkIE
from .hotnewhiphop import HotNewHipHopIE
@@ -216,6 +235,7 @@ from .mdr import MDRIE
from .metacafe import MetacafeIE
from .metacritic import MetacriticIE
from .mgoon import MgoonIE
+from .minhateca import MinhatecaIE
from .ministrygrid import MinistryGridIE
from .mit import TechTVMITIE, MITIE, OCWMITIE
from .mitele import MiTeleIE
@@ -245,6 +265,7 @@ from .muzu import MuzuTVIE
from .myspace import MySpaceIE, MySpaceAlbumIE
from .myspass import MySpassIE
from .myvideo import MyVideoIE
+from .myvidster import MyVidsterIE
from .naver import NaverIE
from .nba import NBAIE
from .nbc import (
@@ -253,6 +274,8 @@ from .nbc import (
)
from .ndr import NDRIE
from .ndtv import NDTVIE
+from .netzkino import NetzkinoIE
+from .nerdcubed import NerdCubedFeedIE
from .newgrounds import NewgroundsIE
from .newstube import NewstubeIE
from .nfb import NFBIE
@@ -279,6 +302,7 @@ from .nytimes import NYTimesIE
from .nuvid import NuvidIE
from .oktoberfesttv import OktoberfestTVIE
from .ooyala import OoyalaIE
+from .openfilm import OpenFilmIE
from .orf import (
ORFTVthekIE,
ORFOE1IE,
@@ -302,10 +326,13 @@ from .promptfile import PromptFileIE
from .prosiebensat1 import ProSiebenSat1IE
from .pyvideo import PyvideoIE
from .quickvid import QuickVidIE
+from .radiode import RadioDeIE
+from .radiobremen import RadioBremenIE
from .radiofrance import RadioFranceIE
from .rai import RaiIE
from .rbmaradio import RBMARadioIE
from .redtube import RedTubeIE
+from .restudy import RestudyIE
from .reverbnation import ReverbNationIE
from .ringtv import RingTVIE
from .ro220 import Ro220IE
@@ -314,12 +341,14 @@ from .roxwel import RoxwelIE
from .rtbf import RTBFIE
from .rtlnl import RtlXlIE
from .rtlnow import RTLnowIE
+from .rtp import RTPIE
from .rts import RTSIE
from .rtve import RTVEALaCartaIE, RTVELiveIE
from .ruhd import RUHDIE
from .rutube import (
RutubeIE,
RutubeChannelIE,
+ RutubeEmbedIE,
RutubeMovieIE,
RutubePersonIE,
)
@@ -329,6 +358,8 @@ from .savefrom import SaveFromIE
from .sbs import SBSIE
from .scivee import SciVeeIE
from .screencast import ScreencastIE
+from .screencastomatic import ScreencastOMaticIE
+from .screenwavemedia import CinemassacreIE, ScreenwaveMediaIE, TeamFourIE
from .servingsys import ServingSysIE
from .sexu import SexuIE
from .sexykarma import SexyKarmaIE
@@ -388,6 +419,7 @@ from .ted import TEDIE
from .telebruxelles import TeleBruxellesIE
from .telecinco import TelecincoIE
from .telemb import TeleMBIE
+from .teletask import TeleTaskIE
from .tenplay import TenPlayIE
from .testurl import TestURLIE
from .tf1 import TF1IE
@@ -446,6 +478,7 @@ from .videott import VideoTtIE
from .videoweed import VideoWeedIE
from .vidme import VidmeIE
from .vidzi import VidziIE
+from .vier import VierIE, VierVideosIE
from .vimeo import (
VimeoIE,
VimeoAlbumIE,
@@ -481,6 +514,7 @@ from .wdr import (
WDRMobileIE,
WDRMausIE,
)
+from .webofstories import WebOfStoriesIE
from .weibo import WeiboIE
from .wimp import WimpIE
from .wistia import WistiaIE
@@ -493,10 +527,12 @@ from .xminus import XMinusIE
from .xnxx import XNXXIE
from .xvideos import XVideosIE
from .xtube import XTubeUserIE, XTubeIE
+from .xxxymovies import XXXYMoviesIE
from .yahoo import (
YahooIE,
YahooSearchIE,
)
+from .yesjapan import YesJapanIE
from .ynet import YnetIE
from .youjizz import YouJizzIE
from .youku import YoukuIE
@@ -514,12 +550,12 @@ from .youtube import (
YoutubeSearchURLIE,
YoutubeShowIE,
YoutubeSubscriptionsIE,
- YoutubeTopListIE,
+ YoutubeTruncatedIDIE,
YoutubeTruncatedURLIE,
YoutubeUserIE,
YoutubeWatchLaterIE,
)
-from .zdf import ZDFIE
+from .zdf import ZDFIE, ZDFChannelIE
from .zingmp3 import (
ZingMp3SongIE,
ZingMp3AlbumIE,
@@ -540,6 +576,17 @@ def gen_extractors():
return [klass() for klass in _ALL_CLASSES]
+def list_extractors(age_limit):
+ """
+ Return a list of extractors that are suitable for the given age limit,
+ sorted case-insensitively by extractor name (IE_NAME).
+ """
+
+ return sorted(
+ filter(lambda ie: ie.is_suitable(age_limit), gen_extractors()),
+ key=lambda ie: ie.IE_NAME.lower())
+
+
def get_info_extractor(ie_name):
"""Returns the info extractor class with the given ie_name"""
return globals()[ie_name + 'IE']
diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py
new file mode 100644
index 000000000..28e07f8b0
--- /dev/null
+++ b/youtube_dl/extractor/adobetv.py
@@ -0,0 +1,70 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ unified_strdate,
+ str_to_int,
+)
+
+
+class AdobeTVIE(InfoExtractor):
+ _VALID_URL = r'https?://tv\.adobe\.com/watch/[^/]+/(?P<id>[^/]+)'
+
+ _TEST = {
+ 'url': 'http://tv.adobe.com/watch/the-complete-picture-with-julieanne-kost/quick-tip-how-to-draw-a-circle-around-an-object-in-photoshop/',
+ 'md5': '9bc5727bcdd55251f35ad311ca74fa1e',
+ 'info_dict': {
+ 'id': 'quick-tip-how-to-draw-a-circle-around-an-object-in-photoshop',
+ 'ext': 'mp4',
+ 'title': 'Quick Tip - How to Draw a Circle Around an Object in Photoshop',
+ 'description': 'md5:99ec318dc909d7ba2a1f2b038f7d2311',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'upload_date': '20110914',
+ 'duration': 60,
+ 'view_count': int,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ player = self._parse_json(
+ self._search_regex(r'html5player:\s*({.+?})\s*\n', webpage, 'player'),
+ video_id)
+
+ title = player.get('title') or self._search_regex(
+ r'data-title="([^"]+)"', webpage, 'title')
+ description = self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ upload_date = unified_strdate(
+ self._html_search_meta('datepublished', webpage, 'upload date'))
+
+ duration = parse_duration(
+ self._html_search_meta('duration', webpage, 'duration')
+ or self._search_regex(r'Runtime:\s*(\d{2}:\d{2}:\d{2})', webpage, 'duration'))
+
+ view_count = str_to_int(self._search_regex(
+ r'<div class="views">\s*Views?:\s*([\d,.]+)\s*</div>',
+ webpage, 'view count'))
+
+ formats = [{
+ 'url': source['src'],
+ 'format_id': source.get('quality') or source['src'].split('-')[-1].split('.')[0] or None,
+ 'tbr': source.get('bitrate'),
+ } for source in player['sources']]
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py
index 0d05cbb4b..502a9c25a 100644
--- a/youtube_dl/extractor/adultswim.py
+++ b/youtube_dl/extractor/adultswim.py
@@ -2,123 +2,150 @@
from __future__ import unicode_literals
import re
+import json
from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ xpath_text,
+ float_or_none,
+)
class AdultSwimIE(InfoExtractor):
- _VALID_URL = r'https?://video\.adultswim\.com/(?P<path>.+?)(?:\.html)?(?:\?.*)?(?:#.*)?$'
- _TEST = {
- 'url': 'http://video.adultswim.com/rick-and-morty/close-rick-counters-of-the-rick-kind.html?x=y#title',
+ _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<is_playlist>playlists/)?(?P<show_path>[^/]+)/(?P<episode_path>[^/?#]+)/?'
+
+ _TESTS = [{
+ 'url': 'http://adultswim.com/videos/rick-and-morty/pilot',
'playlist': [
{
- 'md5': '4da359ec73b58df4575cd01a610ba5dc',
- 'info_dict': {
- 'id': '8a250ba1450996e901453d7f02ca02f5',
- 'ext': 'flv',
- 'title': 'Rick and Morty Close Rick-Counters of the Rick Kind part 1',
- 'description': 'Rick has a run in with some old associates, resulting in a fallout with Morty. You got any chips, broh?',
- 'uploader': 'Rick and Morty',
- 'thumbnail': 'http://i.cdn.turner.com/asfix/repository/8a250ba13f865824013fc9db8b6b0400/thumbnail_267549017116827057.jpg'
- }
- },
- {
- 'md5': 'ffbdf55af9331c509d95350bd0cc1819',
+ 'md5': '247572debc75c7652f253c8daa51a14d',
'info_dict': {
- 'id': '8a250ba1450996e901453d7f4bd102f6',
+ 'id': 'rQxZvXQ4ROaSOqq-or2Mow-0',
'ext': 'flv',
- 'title': 'Rick and Morty Close Rick-Counters of the Rick Kind part 2',
- 'description': 'Rick has a run in with some old associates, resulting in a fallout with Morty. You got any chips, broh?',
- 'uploader': 'Rick and Morty',
- 'thumbnail': 'http://i.cdn.turner.com/asfix/repository/8a250ba13f865824013fc9db8b6b0400/thumbnail_267549017116827057.jpg'
- }
+ 'title': 'Rick and Morty - Pilot Part 1',
+ 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. "
+ },
},
{
- 'md5': 'b92409635540304280b4b6c36bd14a0a',
+ 'md5': '77b0e037a4b20ec6b98671c4c379f48d',
'info_dict': {
- 'id': '8a250ba1450996e901453d7fa73c02f7',
+ 'id': 'rQxZvXQ4ROaSOqq-or2Mow-3',
'ext': 'flv',
- 'title': 'Rick and Morty Close Rick-Counters of the Rick Kind part 3',
- 'description': 'Rick has a run in with some old associates, resulting in a fallout with Morty. You got any chips, broh?',
- 'uploader': 'Rick and Morty',
- 'thumbnail': 'http://i.cdn.turner.com/asfix/repository/8a250ba13f865824013fc9db8b6b0400/thumbnail_267549017116827057.jpg'
- }
+ 'title': 'Rick and Morty - Pilot Part 4',
+ 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. "
+ },
},
+ ],
+ 'info_dict': {
+ 'title': 'Rick and Morty - Pilot',
+ 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. "
+ }
+ }, {
+ 'url': 'http://www.adultswim.com/videos/playlists/american-parenting/putting-francine-out-of-business/',
+ 'playlist': [
{
- 'md5': 'e8818891d60e47b29cd89d7b0278156d',
+ 'md5': '2eb5c06d0f9a1539da3718d897f13ec5',
'info_dict': {
- 'id': '8a250ba1450996e901453d7fc8ba02f8',
+ 'id': '-t8CamQlQ2aYZ49ItZCFog-0',
'ext': 'flv',
- 'title': 'Rick and Morty Close Rick-Counters of the Rick Kind part 4',
- 'description': 'Rick has a run in with some old associates, resulting in a fallout with Morty. You got any chips, broh?',
- 'uploader': 'Rick and Morty',
- 'thumbnail': 'http://i.cdn.turner.com/asfix/repository/8a250ba13f865824013fc9db8b6b0400/thumbnail_267549017116827057.jpg'
- }
+ 'title': 'American Dad - Putting Francine Out of Business',
+ 'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].'
+ },
}
- ]
- }
-
- _video_extensions = {
- '3500': 'flv',
- '640': 'mp4',
- '150': 'mp4',
- 'ipad': 'm3u8',
- 'iphone': 'm3u8'
- }
- _video_dimensions = {
- '3500': (1280, 720),
- '640': (480, 270),
- '150': (320, 180)
- }
+ ],
+ 'info_dict': {
+ 'title': 'American Dad - Putting Francine Out of Business',
+ 'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].'
+ },
+ }]
+
+ @staticmethod
+ def find_video_info(collection, slug):
+ for video in collection.get('videos'):
+ if video.get('slug') == slug:
+ return video
+
+ @staticmethod
+ def find_collection_by_linkURL(collections, linkURL):
+ for collection in collections:
+ if collection.get('linkURL') == linkURL:
+ return collection
+
+ @staticmethod
+ def find_collection_containing_video(collections, slug):
+ for collection in collections:
+ for video in collection.get('videos'):
+ if video.get('slug') == slug:
+ return collection, video
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_path = mobj.group('path')
-
- webpage = self._download_webpage(url, video_path)
- episode_id = self._html_search_regex(
- r'<link rel="video_src" href="http://i\.adultswim\.com/adultswim/adultswimtv/tools/swf/viralplayer.swf\?id=([0-9a-f]+?)"\s*/?\s*>',
- webpage, 'episode_id')
- title = self._og_search_title(webpage)
-
- index_url = 'http://asfix.adultswim.com/asfix-svc/episodeSearch/getEpisodesByIDs?networkName=AS&ids=%s' % episode_id
- idoc = self._download_xml(index_url, title, 'Downloading episode index', 'Unable to download episode index')
-
- episode_el = idoc.find('.//episode')
- show_title = episode_el.attrib.get('collectionTitle')
- episode_title = episode_el.attrib.get('title')
- thumbnail = episode_el.attrib.get('thumbnailUrl')
- description = episode_el.find('./description').text.strip()
+ show_path = mobj.group('show_path')
+ episode_path = mobj.group('episode_path')
+ is_playlist = True if mobj.group('is_playlist') else False
+
+ webpage = self._download_webpage(url, episode_path)
+
+ # Extract the value of `bootstrappedData` from the Javascript in the page.
+ bootstrappedDataJS = self._search_regex(r'var bootstrappedData = ({.*});', webpage, episode_path)
+
+ try:
+ bootstrappedData = json.loads(bootstrappedDataJS)
+ except ValueError as ve:
+ errmsg = '%s: Failed to parse JSON ' % episode_path
+ raise ExtractorError(errmsg, cause=ve)
+
+ # Downloading videos from a /videos/playlists/ URL needs to be handled differently.
+ # NOTE: We are only downloading one video (the current one), not the whole playlist
+ if is_playlist:
+ collections = bootstrappedData['playlists']['collections']
+ collection = self.find_collection_by_linkURL(collections, show_path)
+ video_info = self.find_video_info(collection, episode_path)
+
+ show_title = video_info['showTitle']
+ segment_ids = [video_info['videoPlaybackID']]
+ else:
+ collections = bootstrappedData['show']['collections']
+ collection, video_info = self.find_collection_containing_video(collections, episode_path)
+
+ show = bootstrappedData['show']
+ show_title = show['title']
+ segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']]
+
+ episode_id = video_info['id']
+ episode_title = video_info['title']
+ episode_description = video_info['description']
+ episode_duration = video_info.get('duration')
entries = []
- segment_els = episode_el.findall('./segments/segment')
+ for part_num, segment_id in enumerate(segment_ids):
+ segment_url = 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=mobile' % segment_id
- for part_num, segment_el in enumerate(segment_els):
- segment_id = segment_el.attrib.get('id')
- segment_title = '%s %s part %d' % (show_title, episode_title, part_num + 1)
- thumbnail = segment_el.attrib.get('thumbnailUrl')
- duration = segment_el.attrib.get('duration')
+ segment_title = '%s - %s' % (show_title, episode_title)
+ if len(segment_ids) > 1:
+ segment_title += ' Part %d' % (part_num + 1)
- segment_url = 'http://asfix.adultswim.com/asfix-svc/episodeservices/getCvpPlaylist?networkName=AS&id=%s' % segment_id
idoc = self._download_xml(
segment_url, segment_title,
'Downloading segment information', 'Unable to download segment information')
+ segment_duration = float_or_none(
+ xpath_text(idoc, './/trt', 'segment duration').strip())
+
formats = []
file_els = idoc.findall('.//files/file')
for file_el in file_els:
bitrate = file_el.attrib.get('bitrate')
- type = file_el.attrib.get('type')
- width, height = self._video_dimensions.get(bitrate, (None, None))
+ ftype = file_el.attrib.get('type')
+
formats.append({
- 'format_id': '%s-%s' % (bitrate, type),
- 'url': file_el.text,
- 'ext': self._video_extensions.get(bitrate, 'mp4'),
+ 'format_id': '%s_%s' % (bitrate, ftype),
+ 'url': file_el.text.strip(),
# The bitrate may not be a number (for example: 'iphone')
'tbr': int(bitrate) if bitrate.isdigit() else None,
- 'height': height,
- 'width': width
+ 'quality': 1 if ftype == 'hd' else -1
})
self._sort_formats(formats)
@@ -127,18 +154,16 @@ class AdultSwimIE(InfoExtractor):
'id': segment_id,
'title': segment_title,
'formats': formats,
- 'uploader': show_title,
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'description': description
+ 'duration': segment_duration,
+ 'description': episode_description
})
return {
'_type': 'playlist',
'id': episode_id,
- 'display_id': video_path,
+ 'display_id': episode_path,
'entries': entries,
- 'title': '%s %s' % (show_title, episode_title),
- 'description': description,
- 'thumbnail': thumbnail
+ 'title': '%s - %s' % (show_title, episode_title),
+ 'description': episode_description,
+ 'duration': episode_duration
}
diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py
new file mode 100644
index 000000000..612708e25
--- /dev/null
+++ b/youtube_dl/extractor/aljazeera.py
@@ -0,0 +1,35 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class AlJazeeraIE(InfoExtractor):
+ _VALID_URL = r'http://www\.aljazeera\.com/programmes/.*?/(?P<id>[^/]+)\.html'
+
+ _TEST = {
+ 'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html',
+ 'info_dict': {
+ 'id': '3792260579001',
+ 'ext': 'mp4',
+ 'title': 'The Slum - Episode 1: Deliverance',
+ 'description': 'As a birth attendant advocating for family planning, Remy is on the frontline of Tondo\'s battle with overcrowding.',
+ 'uploader': 'Al Jazeera English',
+ },
+ 'add_ie': ['Brightcove'],
+ }
+
+ def _real_extract(self, url):
+ program_name = self._match_id(url)
+ webpage = self._download_webpage(url, program_name)
+ brightcove_id = self._search_regex(
+ r'RenderPagesVideo\(\'(.+?)\'', webpage, 'brightcove id')
+
+ return {
+ '_type': 'url',
+ 'url': (
+ 'brightcove:'
+ 'playerKey=AQ~~%2CAAAAmtVJIFk~%2CTVGOQ5ZTwJbeMWnq5d_H4MOM57xfzApc'
+ '&%40videoPlayer={0}'.format(brightcove_id)
+ ),
+ 'ie_key': 'Brightcove',
+ }
diff --git a/youtube_dl/extractor/allocine.py b/youtube_dl/extractor/allocine.py
index 398e93bfb..7d65b8193 100644
--- a/youtube_dl/extractor/allocine.py
+++ b/youtube_dl/extractor/allocine.py
@@ -5,15 +5,14 @@ import re
import json
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
- compat_str,
qualities,
- determine_ext,
)
class AllocineIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?allocine\.fr/(?P<typ>article|video|film)/(fichearticle_gen_carticle=|player_gen_cmedia=|fichefilm_gen_cfilm=)(?P<id>[0-9]+)(?:\.html)?'
+ _VALID_URL = r'https?://(?:www\.)?allocine\.fr/(?P<typ>article|video|film)/(fichearticle_gen_carticle=|player_gen_cmedia=|fichefilm_gen_cfilm=|video-)(?P<id>[0-9]+)(?:\.html)?'
_TESTS = [{
'url': 'http://www.allocine.fr/article/fichearticle_gen_carticle=18635087.html',
@@ -45,6 +44,9 @@ class AllocineIE(InfoExtractor):
'description': 'md5:71742e3a74b0d692c7fce0dd2017a4ac',
'thumbnail': 're:http://.*\.jpg',
},
+ }, {
+ 'url': 'http://www.allocine.fr/video/video-19550147/',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -75,9 +77,7 @@ class AllocineIE(InfoExtractor):
'format_id': format_id,
'quality': quality(format_id),
'url': v,
- 'ext': determine_ext(v),
})
-
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/alphaporno.py b/youtube_dl/extractor/alphaporno.py
new file mode 100644
index 000000000..c34719d1f
--- /dev/null
+++ b/youtube_dl/extractor/alphaporno.py
@@ -0,0 +1,77 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_iso8601,
+ parse_duration,
+ parse_filesize,
+ int_or_none,
+)
+
+
+class AlphaPornoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?alphaporno\.com/videos/(?P<id>[^/]+)'
+ _TEST = {
+ 'url': 'http://www.alphaporno.com/videos/sensual-striptease-porn-with-samantha-alexandra/',
+ 'md5': 'feb6d3bba8848cd54467a87ad34bd38e',
+ 'info_dict': {
+ 'id': '258807',
+ 'display_id': 'sensual-striptease-porn-with-samantha-alexandra',
+ 'ext': 'mp4',
+ 'title': 'Sensual striptease porn with Samantha Alexandra',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'timestamp': 1418694611,
+ 'upload_date': '20141216',
+ 'duration': 387,
+ 'filesize_approx': 54120000,
+ 'tbr': 1145,
+ 'categories': list,
+ 'age_limit': 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ video_id = self._search_regex(
+ r"video_id\s*:\s*'([^']+)'", webpage, 'video id', default=None)
+
+ video_url = self._search_regex(
+ r"video_url\s*:\s*'([^']+)'", webpage, 'video url')
+ ext = self._html_search_meta(
+ 'encodingFormat', webpage, 'ext', default='.mp4')[1:]
+
+ title = self._search_regex(
+ [r'<meta content="([^"]+)" itemprop="description">',
+ r'class="title" itemprop="name">([^<]+)<'],
+ webpage, 'title')
+ thumbnail = self._html_search_meta('thumbnail', webpage, 'thumbnail')
+ timestamp = parse_iso8601(self._html_search_meta(
+ 'uploadDate', webpage, 'upload date'))
+ duration = parse_duration(self._html_search_meta(
+ 'duration', webpage, 'duration'))
+ filesize_approx = parse_filesize(self._html_search_meta(
+ 'contentSize', webpage, 'file size'))
+ bitrate = int_or_none(self._html_search_meta(
+ 'bitrate', webpage, 'bitrate'))
+ categories = self._html_search_meta(
+ 'keywords', webpage, 'categories', default='').split(',')
+
+ age_limit = self._rta_search(webpage)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'url': video_url,
+ 'ext': ext,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'filesize_approx': filesize_approx,
+ 'tbr': bitrate,
+ 'categories': categories,
+ 'age_limit': age_limit,
+ }
diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py
index 47f8e4157..b51eafc45 100644
--- a/youtube_dl/extractor/aol.py
+++ b/youtube_dl/extractor/aol.py
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from .fivemin import FiveMinIE
class AolIE(InfoExtractor):
@@ -42,31 +41,30 @@ class AolIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
-
playlist_id = mobj.group('playlist_id')
- if playlist_id and not self._downloader.params.get('noplaylist'):
- self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
+ if not playlist_id or self._downloader.params.get('noplaylist'):
+ return self.url_result('5min:%s' % video_id)
- webpage = self._download_webpage(url, playlist_id)
- title = self._html_search_regex(
- r'<h1 class="video-title[^"]*">(.+?)</h1>', webpage, 'title')
- playlist_html = self._search_regex(
- r"(?s)<ul\s+class='video-related[^']*'>(.*?)</ul>", webpage,
- 'playlist HTML')
- entries = [{
- '_type': 'url',
- 'url': 'aol-video:%s' % m.group('id'),
- 'ie_key': 'Aol',
- } for m in re.finditer(
- r"<a\s+href='.*videoid=(?P<id>[0-9]+)'\s+class='video-thumb'>",
- playlist_html)]
+ self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
- return {
- '_type': 'playlist',
- 'id': playlist_id,
- 'display_id': mobj.group('playlist_display_id'),
- 'title': title,
- 'entries': entries,
- }
+ webpage = self._download_webpage(url, playlist_id)
+ title = self._html_search_regex(
+ r'<h1 class="video-title[^"]*">(.+?)</h1>', webpage, 'title')
+ playlist_html = self._search_regex(
+ r"(?s)<ul\s+class='video-related[^']*'>(.*?)</ul>", webpage,
+ 'playlist HTML')
+ entries = [{
+ '_type': 'url',
+ 'url': 'aol-video:%s' % m.group('id'),
+ 'ie_key': 'Aol',
+ } for m in re.finditer(
+ r"<a\s+href='.*videoid=(?P<id>[0-9]+)'\s+class='video-thumb'>",
+ playlist_html)]
- return FiveMinIE._build_result(video_id)
+ return {
+ '_type': 'playlist',
+ 'id': playlist_id,
+ 'display_id': mobj.group('playlist_display_id'),
+ 'title': title,
+ 'entries': entries,
+ }
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
index 0c01fa1a1..7cd0482c7 100644
--- a/youtube_dl/extractor/appletrailers.py
+++ b/youtube_dl/extractor/appletrailers.py
@@ -4,8 +4,8 @@ import re
import json
from .common import InfoExtractor
+from ..compat import compat_urlparse
from ..utils import (
- compat_urlparse,
int_or_none,
)
diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py
index 34ce8429b..9fc35a42b 100644
--- a/youtube_dl/extractor/archiveorg.py
+++ b/youtube_dl/extractor/archiveorg.py
@@ -1,42 +1,48 @@
from __future__ import unicode_literals
-import json
-import re
-
from .common import InfoExtractor
-from ..utils import (
- unified_strdate,
-)
+from ..utils import unified_strdate
class ArchiveOrgIE(InfoExtractor):
IE_NAME = 'archive.org'
IE_DESC = 'archive.org videos'
- _VALID_URL = r'(?:https?://)?(?:www\.)?archive\.org/details/(?P<id>[^?/]+)(?:[?].*)?$'
- _TEST = {
- "url": "http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect",
- 'file': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect.ogv',
+ _VALID_URL = r'https?://(?:www\.)?archive\.org/details/(?P<id>[^?/]+)(?:[?].*)?$'
+ _TESTS = [{
+ 'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
'md5': '8af1d4cf447933ed3c7f4871162602db',
'info_dict': {
- "title": "1968 Demo - FJCC Conference Presentation Reel #1",
- "description": "Reel 1 of 3: Also known as the \"Mother of All Demos\", Doug Engelbart's presentation at the Fall Joint Computer Conference in San Francisco, December 9, 1968 titled \"A Research Center for Augmenting Human Intellect.\" For this presentation, Doug and his team astonished the audience by not only relating their research, but demonstrating it live. This was the debut of the mouse, interactive computing, hypermedia, computer supported software engineering, video teleconferencing, etc. See also <a href=\"http://dougengelbart.org/firsts/dougs-1968-demo.html\" rel=\"nofollow\">Doug's 1968 Demo page</a> for more background, highlights, links, and the detailed paper published in this conference proceedings. Filmed on 3 reels: Reel 1 | <a href=\"http://www.archive.org/details/XD300-24_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 2</a> | <a href=\"http://www.archive.org/details/XD300-25_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 3</a>",
- "upload_date": "19681210",
- "uploader": "SRI International"
+ 'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect',
+ 'ext': 'ogv',
+ 'title': '1968 Demo - FJCC Conference Presentation Reel #1',
+ 'description': 'md5:1780b464abaca9991d8968c877bb53ed',
+ 'upload_date': '19681210',
+ 'uploader': 'SRI International'
+ }
+ }, {
+ 'url': 'https://archive.org/details/Cops1922',
+ 'md5': '18f2a19e6d89af8425671da1cf3d4e04',
+ 'info_dict': {
+ 'id': 'Cops1922',
+ 'ext': 'ogv',
+ 'title': 'Buster Keaton\'s "Cops" (1922)',
+ 'description': 'md5:70f72ee70882f713d4578725461ffcc3',
}
- }
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
json_url = url + ('?' if '?' in url else '&') + 'output=json'
- json_data = self._download_webpage(json_url, video_id)
- data = json.loads(json_data)
+ data = self._download_json(json_url, video_id)
+
+ def get_optional(data_dict, field):
+ return data_dict['metadata'].get(field, [None])[0]
- title = data['metadata']['title'][0]
- description = data['metadata']['description'][0]
- uploader = data['metadata']['creator'][0]
- upload_date = unified_strdate(data['metadata']['date'][0])
+ title = get_optional(data, 'title')
+ description = get_optional(data, 'description')
+ uploader = get_optional(data, 'creator')
+ upload_date = unified_strdate(get_optional(data, 'date'))
formats = [
{
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 219631b9b..929dd3cc5 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -37,7 +37,7 @@ class ArteTvIE(InfoExtractor):
config_xml_url, video_id, note='Downloading configuration')
formats = [{
- 'forma_id': q.attrib['quality'],
+ 'format_id': q.attrib['quality'],
# The playpath starts at 'mp4:', if we don't manually
# split the url, rtmpdump will incorrectly parse them
'url': q.text.split('mp4:', 1)[0],
@@ -133,7 +133,7 @@ class ArteTVPlus7IE(InfoExtractor):
'width': int_or_none(f.get('width')),
'height': int_or_none(f.get('height')),
'tbr': int_or_none(f.get('bitrate')),
- 'quality': qfunc(f['quality']),
+ 'quality': qfunc(f.get('quality')),
'source_preference': source_pref,
}
diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py
new file mode 100644
index 000000000..72e83bfc2
--- /dev/null
+++ b/youtube_dl/extractor/atresplayer.py
@@ -0,0 +1,114 @@
+from __future__ import unicode_literals
+
+import time
+import hmac
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_str,
+ compat_urllib_request,
+ int_or_none,
+ float_or_none,
+ xpath_text,
+ ExtractorError,
+)
+
+
+class AtresPlayerIE(InfoExtractor):
+    # Extractor for atresplayer.com episode pages.  Stream URLs are requested
+    # from the "servicios" API with an HMAC token built from the episode id
+    # and a shifted timestamp.
+    _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/television/[^/]+/[^/]+/[^/]+/(?P<id>.+?)_\d+\.html'
+    _TESTS = [
+        {
+            'url': 'http://www.atresplayer.com/television/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_2014122100174.html',
+            'md5': 'efd56753cda1bb64df52a3074f62e38a',
+            'info_dict': {
+                'id': 'capitulo-10-especial-solidario-nochebuena',
+                'ext': 'mp4',
+                'title': 'Especial Solidario de Nochebuena',
+                'description': 'md5:e2d52ff12214fa937107d21064075bf1',
+                'duration': 5527.6,
+                'thumbnail': 're:^https?://.*\.jpg$',
+            },
+        },
+        {
+            'url': 'http://www.atresplayer.com/television/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_2014122400174.html',
+            'only_matching': True,
+        },
+    ]
+
+    # Sent with every stream-URL request (see the add_header call below)
+    _USER_AGENT = 'Dalvik/1.6.0 (Linux; U; Android 4.3; GT-I9300 Build/JSS15J'
+    # HMAC key used to sign "<episode id><shifted timestamp>"
+    _MAGIC = 'QWtMLXs414Yo+c#_+Q#K@NN)'
+    # Added to the timestamp before it is signed and sent
+    _TIMESTAMP_SHIFT = 30000
+
+    _TIME_API_URL = 'http://servicios.atresplayer.com/api/admin/time.json'
+    _URL_VIDEO_TEMPLATE = 'https://servicios.atresplayer.com/api/urlVideo/{1}/{0}/{1}|{2}|{3}.json'
+    _PLAYER_URL_TEMPLATE = 'https://servicios.atresplayer.com/episode/getplayer.json?episodePk=%s'
+    _EPISODE_URL_TEMPLATE = 'http://www.atresplayer.com/episodexml/%s'
+
+    def _real_extract(self, url):
+        """Extract formats and metadata for one episode page.
+
+        Raises ExtractorError (expected) when the stream-URL API answers with
+        a missing or non-'ok' ``resultDes``.
+        """
+        # Local import: the digest is passed to hmac.new() explicitly.  MD5
+        # was its implicit default; Python 3.8+ requires the argument.
+        import hashlib
+
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        episode_id = self._search_regex(
+            r'episode="([^"]+)"', webpage, 'episode id')
+
+        # NOTE(review): with fatal=False the download presumably yields a
+        # falsy value on failure - map that to None so the local clock is
+        # used.  The fallback is truncated to an int so the signed string
+        # never carries a fractional part.
+        time_response = self._download_webpage(
+            self._TIME_API_URL,
+            video_id, 'Downloading timestamp', fatal=False)
+        timestamp = int_or_none(
+            time_response or None, 1000, int(time.time()))
+        timestamp_shifted = compat_str(timestamp + self._TIMESTAMP_SHIFT)
+        token = hmac.new(
+            self._MAGIC.encode('ascii'),
+            (episode_id + timestamp_shifted).encode('utf-8'),
+            hashlib.md5
+        ).hexdigest()
+
+        formats = []
+        for fmt in ['windows', 'android_tablet']:
+            request = compat_urllib_request.Request(
+                self._URL_VIDEO_TEMPLATE.format(fmt, episode_id, timestamp_shifted, token))
+            request.add_header('Youtubedl-user-agent', self._USER_AGENT)
+
+            fmt_json = self._download_json(
+                request, video_id, 'Downloading %s video JSON' % fmt)
+
+            # A missing resultDes is reported like any other API error
+            # instead of failing on None.lower()
+            result = fmt_json.get('resultDes')
+            if not result or result.lower() != 'ok':
+                raise ExtractorError(
+                    '%s returned error: %s' % (self.IE_NAME, result), expected=True)
+
+            for video_url in fmt_json['resultObject'].values():
+                if video_url.endswith('/Manifest'):
+                    # Swap the trailing '/Manifest' (9 chars) for the f4m manifest
+                    formats.extend(self._extract_f4m_formats(video_url[:-9] + '/manifest.f4m', video_id))
+                else:
+                    formats.append({
+                        'url': video_url,
+                        'format_id': 'android',
+                        'preference': 1,
+                    })
+        self._sort_formats(formats)
+
+        player = self._download_json(
+            self._PLAYER_URL_TEMPLATE % episode_id,
+            episode_id)
+
+        path_data = player.get('pathData')
+
+        episode = self._download_xml(
+            self._EPISODE_URL_TEMPLATE % path_data,
+            video_id, 'Downloading episode XML')
+
+        duration = float_or_none(xpath_text(
+            episode, './media/asset/info/technical/contentDuration', 'duration'))
+
+        art = episode.find('./media/asset/info/art')
+        title = xpath_text(art, './name', 'title')
+        description = xpath_text(art, './description', 'description')
+        thumbnail = xpath_text(episode, './media/asset/files/background', 'thumbnail')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py
index 04386f7f7..8bfe50214 100644
--- a/youtube_dl/extractor/audiomack.py
+++ b/youtube_dl/extractor/audiomack.py
@@ -1,11 +1,15 @@
# coding: utf-8
from __future__ import unicode_literals
+import itertools
+import time
+
from .common import InfoExtractor
from .soundcloud import SoundcloudIE
-from ..utils import ExtractorError
-
-import time
+from ..utils import (
+ ExtractorError,
+ url_basename,
+)
class AudiomackIE(InfoExtractor):
@@ -17,53 +21,119 @@ class AudiomackIE(InfoExtractor):
'url': 'http://www.audiomack.com/song/roosh-williams/extraordinary',
'info_dict':
{
- 'id': 'roosh-williams/extraordinary',
+ 'id': '310086',
'ext': 'mp3',
- 'title': 'Roosh Williams - Extraordinary'
+ 'uploader': 'Roosh Williams',
+ 'title': 'Extraordinary'
}
},
- # hosted on soundcloud via audiomack
+ # audiomack wrapper around soundcloud song
{
'add_ie': ['Soundcloud'],
'url': 'http://www.audiomack.com/song/xclusiveszone/take-kare',
- 'file': '172419696.mp3',
- 'info_dict':
- {
+ 'info_dict': {
+ 'id': '172419696',
+ 'ext': 'mp3',
'description': 'md5:1fc3272ed7a635cce5be1568c2822997',
'title': 'Young Thug ft Lil Wayne - Take Kare',
- 'uploader':'Young Thug World',
- 'upload_date':'20141016',
+ 'uploader': 'Young Thug World',
+ 'upload_date': '20141016',
}
},
]
def _real_extract(self, url):
- video_id = self._match_id(url)
+ # URLs end with [uploader name]/[uploader title]
+ # this title is whatever the user types in, and is rarely
+ # the proper song title. Real metadata is in the api response
+ album_url_tag = self._match_id(url)
+ # Request the extended version of the api for extra fields like artist and title
api_response = self._download_json(
- "http://www.audiomack.com/api/music/url/song/%s?_=%d" % (
- video_id, time.time()),
- video_id)
+ 'http://www.audiomack.com/api/music/url/song/%s?extended=1&_=%d' % (
+ album_url_tag, time.time()),
+ album_url_tag)
- if "url" not in api_response:
- raise ExtractorError("Unable to deduce api url of song")
- realurl = api_response["url"]
+ # API is inconsistent with errors: treat a missing url, an empty url
+ # or an explicit error field all as failure
+ if 'url' not in api_response or not api_response['url'] or 'error' in api_response:
+ # Interpolate the URL into the message; passing it as a second
+ # positional argument would leave the '%s' placeholder unformatted
+ raise ExtractorError('Invalid url %s' % url)
# Audiomack wraps a lot of soundcloud tracks in their branded wrapper
- # - if so, pass the work off to the soundcloud extractor
- if SoundcloudIE.suitable(realurl):
- return {'_type': 'url', 'url': realurl, 'ie_key': 'Soundcloud'}
-
- webpage = self._download_webpage(url, video_id)
- artist = self._html_search_regex(
- r'<span class="artist">(.*?)</span>', webpage, "artist")
- songtitle = self._html_search_regex(
- r'<h1 class="profile-title song-title"><span class="artist">.*?</span>(.*?)</h1>',
- webpage, "title")
- title = artist + " - " + songtitle
+ # if so, pass the work off to the soundcloud extractor
+ if SoundcloudIE.suitable(api_response['url']):
+ return {'_type': 'url', 'url': api_response['url'], 'ie_key': 'Soundcloud'}
return {
- 'id': video_id,
- 'title': title,
- 'url': realurl,
+ 'id': api_response.get('id', album_url_tag),
+ 'uploader': api_response.get('artist'),
+ 'title': api_response.get('title'),
+ 'url': api_response['url'],
}
+
+
+class AudiomackAlbumIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?audiomack\.com/album/(?P<id>[\w/-]+)'
+    IE_NAME = 'audiomack:album'
+    _TESTS = [
+        # Standard album playlist
+        {
+            'url': 'http://www.audiomack.com/album/flytunezcom/tha-tour-part-2-mixtape',
+            'playlist_count': 15,
+            'info_dict':
+            {
+                'id': '812251',
+                'title': 'Tha Tour: Part 2 (Official Mixtape)'
+            }
+        },
+        # Album playlist ripped from fakeshoredrive with no metadata
+        {
+            'url': 'http://www.audiomack.com/album/fakeshoredrive/ppp-pistol-p-project',
+            'playlist': [{
+                'info_dict': {
+                    'title': '9.-heaven-or-hell-chimaca-ft-zuse-prod-by-dj-fu',
+                    'id': '9.-heaven-or-hell-chimaca-ft-zuse-prod-by-dj-fu',
+                    'ext': 'mp3',
+                }
+            }],
+            'params': {
+                'playliststart': 8,
+                'playlistend': 8,
+            }
+        }
+    ]
+
+    def _real_extract(self, url):
+        """Build a playlist by querying the album API one track index at a time.
+
+        The URL tail ([uploader name]/[uploader title]) is user-typed and is
+        only used as the API key; the real metadata comes from each response.
+        """
+        album_slug = self._match_id(url)
+        playlist = {'_type': 'playlist', 'entries': []}
+        # (playlist key, API key) pairs; album metadata is repeated in every
+        # track response, so it is copied from the first one that carries it
+        album_fields = (('id', 'album_id'), ('title', 'album_title'))
+
+        # No endpoint reports the track count, so keep asking for the next
+        # index until the API stops returning a url
+        for index in itertools.count():
+            track = self._download_json(
+                'http://www.audiomack.com/api/music/url/album/%s/%d?extended=1&_=%d'
+                % (album_slug, index, time.time()), album_slug,
+                note='Querying song information (%d)' % (index + 1))
+
+            # Hard failure: only happens when the album url itself is wrong
+            if 'url' not in track or 'error' in track:
+                raise ExtractorError('Invalid url for track %d of album url %s' % (index, url))
+
+            track_url = track['url']
+            # Empty url for a valid album - usually means end of playlist
+            if not track_url:
+                break
+
+            for target, source in album_fields:
+                if source in track and target not in playlist:
+                    playlist[target] = track[source]
+
+            # File name without extension doubles as id/title fallback
+            fallback_id = url_basename(track_url).rpartition('.')[0]
+            playlist['entries'].append({
+                'id': track.get('id', fallback_id),
+                'uploader': track.get('artist'),
+                'title': track.get('title', fallback_id),
+                'url': track_url,
+            })
+        return playlist
diff --git a/youtube_dl/extractor/auengine.py b/youtube_dl/extractor/auengine.py
index 1c765532a..a1b666be0 100644
--- a/youtube_dl/extractor/auengine.py
+++ b/youtube_dl/extractor/auengine.py
@@ -3,10 +3,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_urllib_parse
from ..utils import (
- compat_urllib_parse,
determine_ext,
ExtractorError,
+ remove_end,
)
@@ -27,23 +28,18 @@ class AUEngineIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(r'<title>(?P<title>.+?)</title>', webpage, 'title')
- title = title.strip()
- links = re.findall(r'\s(?:file|url):\s*["\']([^\'"]+)["\']', webpage)
- links = map(compat_urllib_parse.unquote, links)
-
- thumbnail = None
- video_url = None
- for link in links:
- if link.endswith('.png'):
- thumbnail = link
- elif '/videos/' in link:
- video_url = link
+ title = self._html_search_regex(
+ r'<title>\s*(?P<title>.+?)\s*</title>', webpage, 'title')
+ video_urls = re.findall(r'http://\w+.auengine.com/vod/.*[^\W]', webpage)
+ video_url = compat_urllib_parse.unquote(video_urls[0])
+ thumbnails = re.findall(r'http://\w+.auengine.com/thumb/.*[^\W]', webpage)
+ thumbnail = compat_urllib_parse.unquote(thumbnails[0])
+
if not video_url:
raise ExtractorError('Could not find video URL')
+
ext = '.' + determine_ext(video_url)
- if ext == title[-len(ext):]:
- title = title[:-len(ext)]
+ title = remove_end(title, ext)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/azubu.py b/youtube_dl/extractor/azubu.py
new file mode 100644
index 000000000..0961d339f
--- /dev/null
+++ b/youtube_dl/extractor/azubu.py
@@ -0,0 +1,93 @@
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..utils import float_or_none
+
+
+class AzubuIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?azubu\.tv/[^/]+#!/play/(?P<id>\d+)'
+    _TESTS = [
+        {
+            'url': 'http://www.azubu.tv/GSL#!/play/15575/2014-hot6-cup-last-big-match-ro8-day-1',
+            'md5': 'a88b42fcf844f29ad6035054bd9ecaf4',
+            'info_dict': {
+                'id': '15575',
+                'ext': 'mp4',
+                'title': '2014 HOT6 CUP LAST BIG MATCH Ro8 Day 1',
+                'description': 'md5:d06bdea27b8cc4388a90ad35b5c66c01',
+                'thumbnail': 're:^https?://.*\.jpe?g',
+                'timestamp': 1417523507.334,
+                'upload_date': '20141202',
+                'duration': 9988.7,
+                'uploader': 'GSL',
+                'uploader_id': 414310,
+                'view_count': int,
+            },
+        },
+        {
+            'url': 'http://www.azubu.tv/FnaticTV#!/play/9344/-fnatic-at-worlds-2014:-toyz---%22i-love-rekkles,-he-has-amazing-mechanics%22-',
+            'md5': 'b72a871fe1d9f70bd7673769cdb3b925',
+            'info_dict': {
+                'id': '9344',
+                'ext': 'mp4',
+                'title': 'Fnatic at Worlds 2014: Toyz - "I love Rekkles, he has amazing mechanics"',
+                'description': 'md5:4a649737b5f6c8b5c5be543e88dc62af',
+                'thumbnail': 're:^https?://.*\.jpe?g',
+                'timestamp': 1410530893.320,
+                'upload_date': '20140912',
+                'duration': 172.385,
+                'uploader': 'FnaticTV',
+                'uploader_id': 272749,
+                'view_count': int,
+            },
+        },
+    ]
+
+    def _real_extract(self, url):
+        """Fetch the video API record for the id in ``url`` and map it to an info dict."""
+        video_id = self._match_id(url)
+
+        api_url = 'http://www.azubu.tv/api/video/%s' % video_id
+        data = self._download_json(api_url, video_id)['data']
+
+        info = {
+            'id': video_id,
+            'title': data['title'].strip(),
+            'description': data['description'],
+            'thumbnail': data['thumbnail'],
+            'view_count': data['view_count'],
+        }
+        info['uploader'] = data['user']['username']
+        info['uploader_id'] = data['user']['id']
+
+        # stream_params is itself a JSON document embedded as a string
+        stream_params = json.loads(data['stream_params'])
+
+        # Both values are divided by 1000 via float_or_none's second argument
+        info['timestamp'] = float_or_none(stream_params['creationDate'], 1000)
+        info['duration'] = float_or_none(stream_params['length'], 1000)
+
+        renditions = stream_params.get('renditions') or []
+        full_length = stream_params.get('FLVFullLength')
+        if not full_length:
+            full_length = stream_params.get('videoFullLength')
+        if full_length:
+            renditions.append(full_length)
+
+        formats = []
+        for rendition in renditions:
+            # Renditions without a url cannot be downloaded
+            if not rendition['url']:
+                continue
+            formats.append({
+                'url': rendition['url'],
+                'width': rendition['frameWidth'],
+                'height': rendition['frameHeight'],
+                'vbr': float_or_none(rendition['encodingRate'], 1000),
+                'filesize': rendition['size'],
+                'vcodec': rendition['videoCodec'],
+                'container': rendition['videoContainer'],
+            })
+        self._sort_formats(formats)
+
+        info['formats'] = formats
+        return info
diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py
index 1ca0b7cf2..98e1443ab 100644
--- a/youtube_dl/extractor/bambuser.py
+++ b/youtube_dl/extractor/bambuser.py
@@ -5,7 +5,7 @@ import json
import itertools
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_request,
)
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index acddbc8f1..b45d68a61 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -4,9 +4,11 @@ import json
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_str,
compat_urlparse,
+)
+from ..utils import (
ExtractorError,
)
@@ -104,7 +106,7 @@ class BandcampIE(InfoExtractor):
class BandcampAlbumIE(InfoExtractor):
IE_NAME = 'Bandcamp:album'
- _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<title>[^?#]+))'
+ _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<title>[^?#]+)|/?(?:$|[?#]))'
_TESTS = [{
'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
@@ -139,6 +141,12 @@ class BandcampAlbumIE(InfoExtractor):
'title': 'Hierophany of the Open Grave',
},
'playlist_mincount': 9,
+ }, {
+ 'url': 'http://dotscale.bandcamp.com',
+ 'info_dict': {
+ 'title': 'Loom',
+ },
+ 'playlist_mincount': 7,
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py
index beb6cfc8a..1cf48fe0d 100644
--- a/youtube_dl/extractor/bbccouk.py
+++ b/youtube_dl/extractor/bbccouk.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import re
import xml.etree.ElementTree
from .subtitles import SubtitlesInfoExtractor
@@ -11,7 +10,7 @@ from ..compat import compat_HTTPError
class BBCCoUkIE(SubtitlesInfoExtractor):
IE_NAME = 'bbc.co.uk'
IE_DESC = 'BBC iPlayer'
- _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:programmes|iplayer/episode)/(?P<id>[\da-z]{8})'
+ _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'
_TESTS = [
{
@@ -19,8 +18,8 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
'info_dict': {
'id': 'b039d07m',
'ext': 'flv',
- 'title': 'Kaleidoscope: Leonard Cohen',
- 'description': 'md5:db4755d7a665ae72343779f7dacb402c',
+ 'title': 'Kaleidoscope, Leonard Cohen',
+ 'description': 'The Canadian poet and songwriter reflects on his musical career.',
'duration': 1740,
},
'params': {
@@ -72,7 +71,54 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
'skip_download': True,
},
'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
- },
+ }, {
+ 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
+ 'info_dict': {
+ 'id': 'b04v209v',
+ 'ext': 'flv',
+ 'title': 'Pete Tong, The Essential New Tune Special',
+ 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
+ 'duration': 10800,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://www.bbc.co.uk/music/clips/p02frcc3',
+ 'note': 'Audio',
+ 'info_dict': {
+ 'id': 'p02frcch',
+ 'ext': 'flv',
+ 'title': 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix',
+ 'description': 'French house superstar Madeon takes us out of the club and onto the after party.',
+ 'duration': 3507,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
+ 'note': 'Video',
+ 'info_dict': {
+ 'id': 'p025c103',
+ 'ext': 'flv',
+ 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
+ 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
+ 'duration': 226,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
+ 'only_matching': True,
+ }
]
def _extract_asx_playlist(self, connection, programme_id):
@@ -204,13 +250,66 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
return formats, subtitles
+    def _download_playlist(self, playlist_id):
+        """Resolve a playlist id to the programme it contains.
+
+        The JSON playlist is tried first.  The legacy XML playlist is used
+        when the JSON request fails with HTTP 404 or the JSON has no
+        ``defaultAvailableVersion``.
+
+        Returns a tuple
+        ``(programme_id, title, description, duration, formats, subtitles)``.
+        Raises ExtractorError (expected) when the legacy playlist carries a
+        ``noItems`` element; any other download error is propagated.
+        """
+        try:
+            playlist = self._download_json(
+                'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
+                playlist_id, 'Downloading playlist JSON')
+
+            version = playlist.get('defaultAvailableVersion')
+            if version:
+                smp_config = version['smpConfig']
+                title = smp_config['title']
+                description = smp_config['summary']
+                for item in smp_config['items']:
+                    kind = item['kind']
+                    if kind != 'programme' and kind != 'radioProgramme':
+                        continue
+                    programme_id = item.get('vpid')
+                    duration = int(item.get('duration'))
+                    formats, subtitles = self._download_media_selector(programme_id)
+                return programme_id, title, description, duration, formats, subtitles
+        except ExtractorError as ee:
+            # Only an HTTP 404 means "no JSON playlist, try the legacy one".
+            # The whole test must be negated: "not isinstance(...) and ..."
+            # binds the "not" to isinstance alone, which swallows every HTTP
+            # error and dereferences .code on causes that are not HTTP errors.
+            if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
+                raise
+
+        # fallback to legacy playlist
+        playlist = self._download_xml(
+            'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id,
+            playlist_id, 'Downloading legacy playlist XML')
+
+        no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems')
+        if no_items is not None:
+            # Translate the known availability reasons into readable messages
+            reason = no_items.get('reason')
+            if reason == 'preAvailability':
+                msg = 'Episode %s is not yet available' % playlist_id
+            elif reason == 'postAvailability':
+                msg = 'Episode %s is no longer available' % playlist_id
+            elif reason == 'noMedia':
+                msg = 'Episode %s is not currently available' % playlist_id
+            else:
+                msg = 'Episode %s is not available: %s' % (playlist_id, reason)
+            raise ExtractorError(msg, expected=True)
+
+        for item in self._extract_items(playlist):
+            kind = item.get('kind')
+            if kind != 'programme' and kind != 'radioProgramme':
+                continue
+            title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text
+            description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text
+            programme_id = item.get('identifier')
+            duration = int(item.get('duration'))
+            formats, subtitles = self._download_media_selector(programme_id)
+
+        return programme_id, title, description, duration, formats, subtitles
+
def _real_extract(self, url):
group_id = self._match_id(url)
webpage = self._download_webpage(url, group_id, 'Downloading video page')
programme_id = self._search_regex(
- r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False)
+ r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None)
if programme_id:
player = self._download_json(
'http://www.bbc.co.uk/iplayer/episode/%s.json' % group_id,
@@ -220,32 +319,7 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
duration = player['duration']
formats, subtitles = self._download_media_selector(programme_id)
else:
- playlist = self._download_xml(
- 'http://www.bbc.co.uk/iplayer/playlist/%s' % group_id,
- group_id, 'Downloading playlist XML')
-
- no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems')
- if no_items is not None:
- reason = no_items.get('reason')
- if reason == 'preAvailability':
- msg = 'Episode %s is not yet available' % group_id
- elif reason == 'postAvailability':
- msg = 'Episode %s is no longer available' % group_id
- elif reason == 'noMedia':
- msg = 'Episode %s is not currently available' % group_id
- else:
- msg = 'Episode %s is not available: %s' % (group_id, reason)
- raise ExtractorError(msg, expected=True)
-
- for item in self._extract_items(playlist):
- kind = item.get('kind')
- if kind != 'programme' and kind != 'radioProgramme':
- continue
- title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text
- description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text
- programme_id = item.get('identifier')
- duration = int(item.get('duration'))
- formats, subtitles = self._download_media_selector(programme_id)
+ programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(programme_id, subtitles)
diff --git a/youtube_dl/extractor/behindkink.py b/youtube_dl/extractor/behindkink.py
index 31fdc0dcc..1bdc25812 100644
--- a/youtube_dl/extractor/behindkink.py
+++ b/youtube_dl/extractor/behindkink.py
@@ -10,15 +10,15 @@ from ..utils import url_basename
class BehindKinkIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?behindkink\.com/(?P<year>[0-9]{4})/(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<id>[^/#?_]+)'
_TEST = {
- 'url': 'http://www.behindkink.com/2014/08/14/ab1576-performers-voice-finally-heard-the-bill-is-killed/',
- 'md5': '41ad01222b8442089a55528fec43ec01',
+ 'url': 'http://www.behindkink.com/2014/12/05/what-are-you-passionate-about-marley-blaze/',
+ 'md5': '507b57d8fdcd75a41a9a7bdb7989c762',
'info_dict': {
- 'id': '36370',
+ 'id': '37127',
'ext': 'mp4',
- 'title': 'AB1576 - PERFORMERS VOICE FINALLY HEARD - THE BILL IS KILLED!',
- 'description': 'The adult industry voice was finally heard as Assembly Bill 1576 remained\xa0 in suspense today at the Senate Appropriations Hearing. AB1576 was, among other industry damaging issues, a condom mandate...',
- 'upload_date': '20140814',
- 'thumbnail': 'http://www.behindkink.com/wp-content/uploads/2014/08/36370_AB1576_Win.jpg',
+ 'title': 'What are you passionate about – Marley Blaze',
+ 'description': 'md5:aee8e9611b4ff70186f752975d9b94b4',
+ 'upload_date': '20141205',
+ 'thumbnail': 'http://www.behindkink.com/wp-content/uploads/2014/12/blaze-1.jpg',
'age_limit': 18,
}
}
@@ -26,26 +26,19 @@ class BehindKinkIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('id')
- year = mobj.group('year')
- month = mobj.group('month')
- day = mobj.group('day')
- upload_date = year + month + day
webpage = self._download_webpage(url, display_id)
video_url = self._search_regex(
- r"'file':\s*'([^']+)'",
- webpage, 'URL base')
-
- video_id = url_basename(video_url)
- video_id = video_id.split('_')[0]
+ r'<source src="([^"]+)"', webpage, 'video URL')
+ video_id = url_basename(video_url).split('_')[0]
+ upload_date = mobj.group('year') + mobj.group('month') + mobj.group('day')
return {
'id': video_id,
+ 'display_id': display_id,
'url': video_url,
- 'ext': 'mp4',
'title': self._og_search_title(webpage),
- 'display_id': display_id,
'thumbnail': self._og_search_thumbnail(webpage),
'description': self._og_search_description(webpage),
'upload_date': upload_date,
diff --git a/youtube_dl/extractor/bet.py b/youtube_dl/extractor/bet.py
new file mode 100644
index 000000000..d2abd4d77
--- /dev/null
+++ b/youtube_dl/extractor/bet.py
@@ -0,0 +1,107 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse
+from ..utils import (
+ xpath_text,
+ xpath_with_ns,
+ int_or_none,
+ parse_iso8601,
+)
+
+
+class BetIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?bet\.com/(?:[^/]+/)+(?P<id>.+?)\.html'
+ _TESTS = [
+ {
+ 'url': 'http://www.bet.com/news/politics/2014/12/08/in-bet-exclusive-obama-talks-race-and-racism.html',
+ 'info_dict': {
+ 'id': '740ab250-bb94-4a8a-8787-fe0de7c74471',
+ 'display_id': 'in-bet-exclusive-obama-talks-race-and-racism',
+ 'ext': 'flv',
+ 'title': 'BET News Presents: A Conversation With President Obama',
+ 'description': 'md5:5a88d8ae912c1b33e090290af7ec33c6',
+ 'duration': 1534,
+ 'timestamp': 1418075340,
+ 'upload_date': '20141208',
+ 'uploader': 'admin',
+ 'thumbnail': 're:(?i)^https?://.*\.jpg$',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.bet.com/video/news/national/2014/justice-for-ferguson-a-community-reacts.html',
+ 'info_dict': {
+ 'id': 'bcd1b1df-673a-42cf-8d01-b282db608f2d',
+ 'display_id': 'justice-for-ferguson-a-community-reacts',
+ 'ext': 'flv',
+ 'title': 'Justice for Ferguson: A Community Reacts',
+ 'description': 'A BET News special.',
+ 'duration': 1696,
+ 'timestamp': 1416942360,
+ 'upload_date': '20141125',
+ 'uploader': 'admin',
+ 'thumbnail': 're:(?i)^https?://.*\.jpg$',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }
+ ]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ media_url = compat_urllib_parse.unquote(self._search_regex(
+ [r'mediaURL\s*:\s*"([^"]+)"', r"var\s+mrssMediaUrl\s*=\s*'([^']+)'"],
+ webpage, 'media URL'))
+
+ mrss = self._download_xml(media_url, display_id)
+
+ item = mrss.find('./channel/item')
+
+ NS_MAP = {
+ 'dc': 'http://purl.org/dc/elements/1.1/',
+ 'media': 'http://search.yahoo.com/mrss/',
+ 'ka': 'http://kickapps.com/karss',
+ }
+
+ title = xpath_text(item, './title', 'title')
+ description = xpath_text(
+ item, './description', 'description', fatal=False)
+
+ video_id = xpath_text(item, './guid', 'video id', fatal=False)
+
+ timestamp = parse_iso8601(xpath_text(
+ item, xpath_with_ns('./dc:date', NS_MAP),
+ 'upload date', fatal=False))
+ uploader = xpath_text(
+ item, xpath_with_ns('./dc:creator', NS_MAP),
+ 'uploader', fatal=False)
+
+ media_content = item.find(
+ xpath_with_ns('./media:content', NS_MAP))
+ duration = int_or_none(media_content.get('duration'))
+ smil_url = media_content.get('url')
+
+ thumbnail = media_content.find(
+ xpath_with_ns('./media:thumbnail', NS_MAP)).get('url')
+
+ formats = self._extract_smil_formats(smil_url, display_id)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'uploader': uploader,
+ 'duration': duration,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py
index 0d5889f5d..75d744852 100644
--- a/youtube_dl/extractor/bilibili.py
+++ b/youtube_dl/extractor/bilibili.py
@@ -5,8 +5,6 @@ import re
from .common import InfoExtractor
from ..utils import (
- compat_parse_qs,
- ExtractorError,
int_or_none,
unified_strdate,
)
@@ -29,10 +27,9 @@ class BiliBiliIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+
video_code = self._search_regex(
r'(?s)<div itemprop="video".*?>(.*?)</div>', webpage, 'video code')
@@ -55,45 +52,38 @@ class BiliBiliIE(InfoExtractor):
thumbnail = self._html_search_meta(
'thumbnailUrl', video_code, 'thumbnail', fatal=False)
- player_params = compat_parse_qs(self._html_search_regex(
- r'<iframe .*?class="player" src="https://secure\.bilibili\.(?:tv|com)/secure,([^"]+)"',
- webpage, 'player params'))
+ cid = self._search_regex(r'cid=(\d+)', webpage, 'cid')
- if 'cid' in player_params:
- cid = player_params['cid'][0]
+ lq_doc = self._download_xml(
+ 'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid,
+ video_id,
+ note='Downloading LQ video info'
+ )
+ lq_durl = lq_doc.find('./durl')
+ formats = [{
+ 'format_id': 'lq',
+ 'quality': 1,
+ 'url': lq_durl.find('./url').text,
+ 'filesize': int_or_none(
+ lq_durl.find('./size'), get_attr='text'),
+ }]
- lq_doc = self._download_xml(
- 'http://interface.bilibili.cn/v_cdn_play?cid=%s' % cid,
- video_id,
- note='Downloading LQ video info'
- )
- lq_durl = lq_doc.find('.//durl')
- formats = [{
- 'format_id': 'lq',
- 'quality': 1,
- 'url': lq_durl.find('./url').text,
+ hq_doc = self._download_xml(
+ 'http://interface.bilibili.com/playurl?appkey=1&cid=%s' % cid,
+ video_id,
+ note='Downloading HQ video info',
+ fatal=False,
+ )
+ if hq_doc is not False:
+ hq_durl = hq_doc.find('./durl')
+ formats.append({
+ 'format_id': 'hq',
+ 'quality': 2,
+ 'ext': 'flv',
+ 'url': hq_durl.find('./url').text,
'filesize': int_or_none(
- lq_durl.find('./size'), get_attr='text'),
- }]
-
- hq_doc = self._download_xml(
- 'http://interface.bilibili.cn/playurl?cid=%s' % cid,
- video_id,
- note='Downloading HQ video info',
- fatal=False,
- )
- if hq_doc is not False:
- hq_durl = hq_doc.find('.//durl')
- formats.append({
- 'format_id': 'hq',
- 'quality': 2,
- 'ext': 'flv',
- 'url': hq_durl.find('./url').text,
- 'filesize': int_or_none(
- hq_durl.find('./size'), get_attr='text'),
- })
- else:
- raise ExtractorError('Unsupported player parameters: %r' % (player_params,))
+ hq_durl.find('./size'), get_attr='text'),
+ })
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py
index da47f27bd..14b814120 100644
--- a/youtube_dl/extractor/bliptv.py
+++ b/youtube_dl/extractor/bliptv.py
@@ -4,13 +4,17 @@ import re
from .common import InfoExtractor
from .subtitles import SubtitlesInfoExtractor
-from ..utils import (
+
+from ..compat import (
+ compat_str,
compat_urllib_request,
- unescapeHTML,
- parse_iso8601,
compat_urlparse,
+)
+from ..utils import (
clean_html,
- compat_str,
+ int_or_none,
+ parse_iso8601,
+ unescapeHTML,
)
@@ -78,7 +82,25 @@ class BlipTVIE(SubtitlesInfoExtractor):
'uploader': 'NostalgiaCritic',
'uploader_id': '246467',
}
- }
+ },
+ {
+ # https://github.com/rg3/youtube-dl/pull/4404
+ 'note': 'Audio only',
+ 'url': 'http://blip.tv/hilarios-productions/weekly-manga-recap-kingdom-7119982',
+ 'md5': '76c0a56f24e769ceaab21fbb6416a351',
+ 'info_dict': {
+ 'id': '7103299',
+ 'ext': 'flv',
+ 'title': 'Weekly Manga Recap: Kingdom',
+ 'description': 'And then Shin breaks the enemy line, and he&apos;s all like HWAH! And then he slices a guy and it&apos;s all like FWASHING! And... it&apos;s really hard to describe the best parts of this series without breaking down into sound effects, okay?',
+ 'timestamp': 1417660321,
+ 'upload_date': '20141204',
+ 'uploader': 'The Rollo T',
+ 'uploader_id': '407429',
+ 'duration': 7251,
+ 'vcodec': 'none',
+ }
+ },
]
def _real_extract(self, url):
@@ -145,11 +167,11 @@ class BlipTVIE(SubtitlesInfoExtractor):
'url': real_url,
'format_id': role,
'format_note': media_type,
- 'vcodec': media_content.get(blip('vcodec')),
+ 'vcodec': media_content.get(blip('vcodec')) or 'none',
'acodec': media_content.get(blip('acodec')),
'filesize': media_content.get('filesize'),
- 'width': int(media_content.get('width')),
- 'height': int(media_content.get('height')),
+ 'width': int_or_none(media_content.get('width')),
+ 'height': int_or_none(media_content.get('height')),
})
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index bf18a97e0..003152c4e 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -6,25 +6,26 @@ import json
import xml.etree.ElementTree
from .common import InfoExtractor
-from ..utils import (
- compat_urllib_parse,
- find_xpath_attr,
- fix_xml_ampersands,
- compat_urlparse,
- compat_str,
- compat_urllib_request,
+from ..compat import (
compat_parse_qs,
+ compat_str,
+ compat_urllib_parse,
compat_urllib_parse_urlparse,
-
+ compat_urllib_request,
+ compat_urlparse,
+)
+from ..utils import (
determine_ext,
ExtractorError,
- unsmuggle_url,
+ find_xpath_attr,
+ fix_xml_ampersands,
unescapeHTML,
+ unsmuggle_url,
)
class BrightcoveIE(InfoExtractor):
- _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*?\?(?P<query>.*)'
+ _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P<query>.*)'
_FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
_TESTS = [
diff --git a/youtube_dl/extractor/buzzfeed.py b/youtube_dl/extractor/buzzfeed.py
index a40a1bbc4..a5d2af174 100644
--- a/youtube_dl/extractor/buzzfeed.py
+++ b/youtube_dl/extractor/buzzfeed.py
@@ -33,7 +33,7 @@ class BuzzFeedIE(InfoExtractor):
'skip_download': True, # Got enough YouTube download tests
},
'info_dict': {
- 'description': 'Munchkin the Teddy Bear is back !',
+ 'description': 're:Munchkin the Teddy Bear is back ?!',
'title': 'You Need To Stop What You\'re Doing And Watching This Dog Walk On A Treadmill',
},
'playlist': [{
@@ -42,9 +42,9 @@ class BuzzFeedIE(InfoExtractor):
'ext': 'mp4',
'upload_date': '20141124',
'uploader_id': 'CindysMunchkin',
- 'description': '© 2014 Munchkin the Shih Tzu\nAll rights reserved\nFacebook: http://facebook.com/MunchkintheShihTzu',
+ 'description': 're:© 2014 Munchkin the Shih Tzu',
'uploader': 'Munchkin the Shih Tzu',
- 'title': 'Munchkin the Teddy Bear gets her exercise',
+ 'title': 're:Munchkin the Teddy Bear gets her exercise',
},
}]
}]
diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py
index 9873728df..11d18d74a 100644
--- a/youtube_dl/extractor/canalplus.py
+++ b/youtube_dl/extractor/canalplus.py
@@ -5,6 +5,8 @@ import re
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
+ HEADRequest,
unified_strdate,
url_basename,
qualities,
@@ -76,6 +78,16 @@ class CanalplusIE(InfoExtractor):
preference = qualities(['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD', 'HLS', 'HDS'])
+ fmt_url = next(iter(media.find('VIDEOS'))).text
+ if '/geo' in fmt_url.lower():
+ response = self._request_webpage(
+ HEADRequest(fmt_url), video_id,
+ 'Checking if the video is georestricted')
+ if '/blocage' in response.geturl():
+ raise ExtractorError(
+ 'The video is not available in your country',
+ expected=True)
+
formats = []
for fmt in media.find('VIDEOS'):
format_url = fmt.text
diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py
index 97feb6704..f70e090bb 100644
--- a/youtube_dl/extractor/ceskatelevize.py
+++ b/youtube_dl/extractor/ceskatelevize.py
@@ -3,55 +3,50 @@ from __future__ import unicode_literals
import re
-from .common import InfoExtractor
-from ..utils import (
+from .subtitles import SubtitlesInfoExtractor
+from ..compat import (
compat_urllib_request,
compat_urllib_parse,
compat_urllib_parse_urlparse,
+)
+from ..utils import (
ExtractorError,
+ float_or_none,
)
-class CeskaTelevizeIE(InfoExtractor):
+class CeskaTelevizeIE(SubtitlesInfoExtractor):
_VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P<id>[^?#]+)'
_TESTS = [
{
- 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/213512120230004-spanelska-chripka',
- 'info_dict': {
- 'id': '213512120230004',
- 'ext': 'flv',
- 'title': 'První republika: Španělská chřipka',
- 'duration': 3107.4,
- },
- 'params': {
- 'skip_download': True, # requires rtmpdump
- },
- 'skip': 'Works only from Czech Republic.',
- },
- {
- 'url': 'http://www.ceskatelevize.cz/ivysilani/1030584952-tsatsiki-maminka-a-policajt',
+ 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220',
'info_dict': {
- 'id': '20138143440',
- 'ext': 'flv',
- 'title': 'Tsatsiki, maminka a policajt',
- 'duration': 6754.1,
+ 'id': '214411058091220',
+ 'ext': 'mp4',
+ 'title': 'Hyde Park Civilizace',
+ 'description': 'Věda a současná civilizace. Interaktivní pořad - prostor pro vaše otázky a komentáře',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 3350,
},
'params': {
- 'skip_download': True, # requires rtmpdump
+ # m3u8 download
+ 'skip_download': True,
},
- 'skip': 'Works only from Czech Republic.',
},
{
'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina',
'info_dict': {
'id': '14716',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'První republika: Zpěvačka z Dupárny Bobina',
- 'duration': 90,
+ 'description': 'Sága mapující atmosféru první republiky od r. 1918 do r. 1945.',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 88.4,
},
'params': {
- 'skip_download': True, # requires rtmpdump
+ # m3u8 download
+ 'skip_download': True,
},
},
]
@@ -78,8 +73,9 @@ class CeskaTelevizeIE(InfoExtractor):
'requestSource': 'iVysilani',
}
- req = compat_urllib_request.Request('http://www.ceskatelevize.cz/ivysilani/ajax/get-playlist-url',
- data=compat_urllib_parse.urlencode(data))
+ req = compat_urllib_request.Request(
+ 'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist',
+ data=compat_urllib_parse.urlencode(data))
req.add_header('Content-type', 'application/x-www-form-urlencoded')
req.add_header('x-addr', '127.0.0.1')
@@ -88,39 +84,72 @@ class CeskaTelevizeIE(InfoExtractor):
playlistpage = self._download_json(req, video_id)
- req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlistpage['url']))
+ playlist_url = playlistpage['url']
+ if playlist_url == 'error_region':
+ raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
+
+ req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlist_url))
req.add_header('Referer', url)
- playlist = self._download_xml(req, video_id)
+ playlist = self._download_json(req, video_id)
+ item = playlist['playlist'][0]
formats = []
- for i in playlist.find('smilRoot/body'):
- if 'AD' not in i.attrib['id']:
- base_url = i.attrib['base']
- parsedurl = compat_urllib_parse_urlparse(base_url)
- duration = i.attrib['duration']
-
- for video in i.findall('video'):
- if video.attrib['label'] != 'AD':
- format_id = video.attrib['label']
- play_path = video.attrib['src']
- vbr = int(video.attrib['system-bitrate'])
-
- formats.append({
- 'format_id': format_id,
- 'url': base_url,
- 'vbr': vbr,
- 'play_path': play_path,
- 'app': parsedurl.path[1:] + '?' + parsedurl.query,
- 'rtmp_live': True,
- 'ext': 'flv',
- })
-
+ for format_id, stream_url in item['streamUrls'].items():
+ formats.extend(self._extract_m3u8_formats(stream_url, video_id, 'mp4'))
self._sort_formats(formats)
+ title = self._og_search_title(webpage)
+ description = self._og_search_description(webpage)
+ duration = float_or_none(item.get('duration'))
+ thumbnail = item.get('previewImageUrl')
+
+ subtitles = {}
+ subs = item.get('subtitles')
+ if subs:
+ subtitles['cs'] = subs[0]['url']
+
+ if self._downloader.params.get('listsubtitles', False):
+ self._list_available_subtitles(video_id, subtitles)
+ return
+
+ subtitles = self._fix_subtitles(self.extract_subtitles(video_id, subtitles))
+
return {
'id': episode_id,
- 'title': self._html_search_regex(r'<title>(.+?) — iVysílání — Česká televize</title>', webpage, 'title'),
- 'duration': float(duration),
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
'formats': formats,
+ 'subtitles': subtitles,
}
+
+ @staticmethod
+ def _fix_subtitles(subtitles):
+ """ Convert millisecond-based subtitles to SRT """
+ if subtitles is None:
+ return subtitles # subtitles not requested
+
+ def _msectotimecode(msec):
+ """ Helper utility to convert milliseconds to timecode """
+ components = []
+ for divider in [1000, 60, 60, 100]:
+ components.append(msec % divider)
+ msec //= divider
+ return "{3:02}:{2:02}:{1:02},{0:03}".format(*components)
+
+ def _fix_subtitle(subtitle):
+ for line in subtitle.splitlines():
+ m = re.match(r"^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$", line)
+ if m:
+ yield m.group(1)
+ start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:])
+ yield "{0} --> {1}".format(start, stop)
+ else:
+ yield line
+
+ fixed_subtitles = {}
+ for k, v in subtitles.items():
+ fixed_subtitles[k] = "\r\n".join(_fix_subtitle(v))
+ return fixed_subtitles
diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py
index 2a05813f8..3dfc24f5b 100644
--- a/youtube_dl/extractor/channel9.py
+++ b/youtube_dl/extractor/channel9.py
@@ -236,16 +236,17 @@ class Channel9IE(InfoExtractor):
if contents is None:
return contents
- session_meta = {'session_code': self._extract_session_code(html),
- 'session_day': self._extract_session_day(html),
- 'session_room': self._extract_session_room(html),
- 'session_speakers': self._extract_session_speakers(html),
- }
+ session_meta = {
+ 'session_code': self._extract_session_code(html),
+ 'session_day': self._extract_session_day(html),
+ 'session_room': self._extract_session_room(html),
+ 'session_speakers': self._extract_session_speakers(html),
+ }
for content in contents:
content.update(session_meta)
- return contents
+ return self.playlist_result(contents)
def _extract_list(self, content_path):
rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
diff --git a/youtube_dl/extractor/cinchcast.py b/youtube_dl/extractor/cinchcast.py
new file mode 100644
index 000000000..0c9a24bef
--- /dev/null
+++ b/youtube_dl/extractor/cinchcast.py
@@ -0,0 +1,52 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ unified_strdate,
+ xpath_text,
+)
+
+
+class CinchcastIE(InfoExtractor):
+ _VALID_URL = r'https?://player\.cinchcast\.com/.*?assetId=(?P<id>[0-9]+)'
+ _TEST = {
+ # Actual test is run in generic, look for undergroundwellness
+ 'url': 'http://player.cinchcast.com/?platformId=1&#038;assetType=single&#038;assetId=7141703',
+ 'only_matching': True,
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ doc = self._download_xml(
+ 'http://www.blogtalkradio.com/playerasset/mrss?assetType=single&assetId=%s' % video_id,
+ video_id)
+
+ item = doc.find('.//item')
+ title = xpath_text(item, './title', fatal=True)
+ date_str = xpath_text(
+ item, './{http://developer.longtailvideo.com/trac/}date')
+ upload_date = unified_strdate(date_str, day_first=False)
+ # duration is present but wrong
+ formats = []
+ formats.append({
+ 'format_id': 'main',
+ 'url': item.find(
+ './{http://search.yahoo.com/mrss/}content').attrib['url'],
+ })
+ backup_url = xpath_text(
+ item, './{http://developer.longtailvideo.com/trac/}backupContent')
+ if backup_url:
+ formats.append({
+ 'preference': 2, # seems to be more reliable
+ 'format_id': 'backup',
+ 'url': backup_url,
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'upload_date': upload_date,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py
index 710d5009b..3145b3051 100644
--- a/youtube_dl/extractor/cnet.py
+++ b/youtube_dl/extractor/cnet.py
@@ -2,12 +2,10 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
- int_or_none,
)
@@ -15,23 +13,24 @@ class CNETIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/'
_TEST = {
'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/',
- 'md5': '041233212a0d06b179c87cbcca1577b8',
'info_dict': {
'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Hands-on with Microsoft Windows 8.1 Update',
'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.',
'thumbnail': 're:^http://.*/flmswindows8.jpg$',
- 'uploader_id': 'sarah.mitroff@cbsinteractive.com',
+ 'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861',
'uploader': 'Sarah Mitroff',
+ },
+ 'params': {
+ 'skip_download': 'requires rtmpdump',
}
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('id')
-
+ display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
+
data_json = self._html_search_regex(
r"<div class=\"cnetVideoPlayer\"\s+.*?data-cnet-video-options='([^']+)'",
webpage, 'data json')
@@ -42,37 +41,31 @@ class CNETIE(InfoExtractor):
if not vdata:
raise ExtractorError('Cannot find video data')
+ mpx_account = data['config']['players']['default']['mpx_account']
+ vid = vdata['files']['rtmp']
+ tp_link = 'http://link.theplatform.com/s/%s/%s' % (mpx_account, vid)
+
video_id = vdata['id']
title = vdata.get('headline')
if title is None:
title = vdata.get('title')
if title is None:
raise ExtractorError('Cannot find title!')
- description = vdata.get('dek')
thumbnail = vdata.get('image', {}).get('path')
author = vdata.get('author')
if author:
uploader = '%s %s' % (author['firstName'], author['lastName'])
- uploader_id = author.get('email')
+ uploader_id = author.get('id')
else:
uploader = None
uploader_id = None
- formats = [{
- 'format_id': '%s-%s-%s' % (
- f['type'], f['format'],
- int_or_none(f.get('bitrate'), 1000, default='')),
- 'url': f['uri'],
- 'tbr': int_or_none(f.get('bitrate'), 1000),
- } for f in vdata['files']['data']]
- self._sort_formats(formats)
-
return {
+ '_type': 'url_transparent',
+ 'url': tp_link,
'id': video_id,
'display_id': display_id,
'title': title,
- 'formats': formats,
- 'description': description,
'uploader': uploader,
'uploader_id': uploader_id,
'thumbnail': thumbnail,
diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py
index 81142ee41..93e8d0de3 100644
--- a/youtube_dl/extractor/cnn.py
+++ b/youtube_dl/extractor/cnn.py
@@ -11,14 +11,14 @@ from ..utils import (
class CNNIE(InfoExtractor):
- _VALID_URL = r'''(?x)https?://((edition|www)\.)?cnn\.com/video/(data/.+?|\?)/
- (?P<path>.+?/(?P<title>[^/]+?)(?:\.cnn(-ap)?|(?=&)))'''
+ _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/
+ (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:cnn|hln)(?:-ap)?|(?=&)))'''
_TESTS = [{
'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
'md5': '3e6121ea48df7e2259fe73a0628605c4',
'info_dict': {
- 'id': 'sports_2013_06_09_nadal-1-on-1.cnn',
+ 'id': 'sports/2013/06/09/nadal-1-on-1.cnn',
'ext': 'mp4',
'title': 'Nadal wins 8th French Open title',
'description': 'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.',
@@ -35,6 +35,16 @@ class CNNIE(InfoExtractor):
"description": "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"",
"upload_date": "20130821",
}
+ }, {
+ 'url': 'http://www.cnn.com/video/data/2.0/video/living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln.html',
+ 'md5': 'f14d02ebd264df951feb2400e2c25a1b',
+ 'info_dict': {
+ 'id': 'living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln',
+ 'ext': 'mp4',
+ 'title': 'Nashville Ep. 1: Hand crafted skateboards',
+ 'description': 'md5:e7223a503315c9f150acac52e76de086',
+ 'upload_date': '20141222',
+ }
}]
def _real_extract(self, url):
@@ -127,3 +137,28 @@ class CNNBlogsIE(InfoExtractor):
'url': cnn_url,
'ie_key': CNNIE.ie_key(),
}
+
+
+class CNNArticleIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!video/)'
+ _TEST = {
+ 'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/',
+ 'md5': '275b326f85d80dff7592a9820f5dc887',
+ 'info_dict': {
+ 'id': 'bestoftv/2014/12/21/sotu-crowley-president-obama-north-korea-not-going-to-be-intimidated.cnn',
+ 'ext': 'mp4',
+ 'title': 'Obama: We\'re not going to be intimidated',
+ 'description': 'md5:e735586f3dc936075fa654a4d91b21f9',
+ 'upload_date': '20141220',
+ },
+ 'add_ie': ['CNN'],
+ }
+
+ def _real_extract(self, url):
+ webpage = self._download_webpage(url, url_basename(url))
+ cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url')
+ return {
+ '_type': 'url',
+ 'url': 'http://cnn.com/video/?/video/' + cnn_url,
+ 'ie_key': CNNIE.ie_key(),
+ }
diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py
new file mode 100644
index 000000000..9c25b2223
--- /dev/null
+++ b/youtube_dl/extractor/comcarcoff.py
@@ -0,0 +1,57 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..utils import parse_iso8601
+
+
+class ComCarCoffIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?comediansincarsgettingcoffee\.com/(?P<id>[a-z0-9\-]*)'
+ _TESTS = [{
+ 'url': 'http://comediansincarsgettingcoffee.com/miranda-sings-happy-thanksgiving-miranda/',
+ 'info_dict': {
+ 'id': 'miranda-sings-happy-thanksgiving-miranda',
+ 'ext': 'mp4',
+ 'upload_date': '20141127',
+ 'timestamp': 1417107600,
+ 'title': 'Happy Thanksgiving Miranda',
+ 'description': 'Jerry Seinfeld and his special guest Miranda Sings cruise around town in search of coffee, complaining and apologizing along the way.',
+ 'thumbnail': 'http://ccc.crackle.com/images/s5e4_thumb.jpg',
+ },
+ 'params': {
+ 'skip_download': 'requires ffmpeg',
+ }
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ if not display_id:
+ display_id = 'comediansincarsgettingcoffee.com'
+ webpage = self._download_webpage(url, display_id)
+
+ full_data = json.loads(self._search_regex(
+ r'<script type="application/json" id="videoData">(?P<json>.+?)</script>',
+ webpage, 'full data json'))
+
+ video_id = full_data['activeVideo']['video']
+ video_data = full_data['videos'][video_id]
+ thumbnails = [{
+ 'url': video_data['images']['thumb'],
+ }, {
+ 'url': video_data['images']['poster'],
+ }]
+ formats = self._extract_m3u8_formats(
+ video_data['mediaUrl'], video_id, ext='mp4')
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': video_data['title'],
+ 'description': video_data.get('description'),
+ 'timestamp': parse_iso8601(video_data.get('pubDate')),
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ 'webpage_url': 'http://comediansincarsgettingcoffee.com/%s' % (video_data.get('urlSlug', video_data.get('slug'))),
+ }
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py
index 2e3ef3fda..8d27af5e5 100644
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -3,9 +3,11 @@ from __future__ import unicode_literals
import re
from .mtv import MTVServicesInfoExtractor
-from ..utils import (
+from ..compat import (
compat_str,
compat_urllib_parse,
+)
+from ..utils import (
ExtractorError,
float_or_none,
unified_strdate,
@@ -48,7 +50,7 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor):
)|
(?P<interview>
extended-interviews/(?P<interID>[0-9a-z]+)/(?:playlist_tds_extended_)?(?P<interview_title>.*?)(/.*?)?)))
- (?:[?#].*|$)'''
+ '''
_TESTS = [{
'url': 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart',
'md5': '4e2f5cb088a83cd8cdb7756132f9739d',
@@ -82,6 +84,9 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor):
'url': 'http://thedailyshow.cc.com/video-playlists/npde3s/the-daily-show-19088-highlights',
'only_matching': True,
}, {
+ 'url': 'http://thedailyshow.cc.com/video-playlists/t6d9sg/the-daily-show-20038-highlights/be3cwo',
+ 'only_matching': True,
+ }, {
'url': 'http://thedailyshow.cc.com/special-editions/2l8fdb/special-edition---a-look-back-at-food',
'only_matching': True,
}, {
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index b633ea9b9..cd155a090 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -21,6 +21,7 @@ from ..compat import (
compat_str,
)
from ..utils import (
+ age_restricted,
clean_html,
compiled_regex_type,
ExtractorError,
@@ -40,7 +41,7 @@ class InfoExtractor(object):
information about the video (or videos) the URL refers to. This
information includes the real video URL, the video title, author and
others. The information is stored in a dictionary which is then
- passed to the FileDownloader. The FileDownloader processes this
+ passed to the YoutubeDL. The YoutubeDL processes this
information possibly downloading the video to the file system, among
other possible outcomes.
@@ -92,6 +93,8 @@ class InfoExtractor(object):
by this field, regardless of all other values.
-1 for default (order by other properties),
-2 or smaller for less than default.
+ < -1000 to hide the format (if there is
+ another one which is strictly better)
* language_preference Is this in the correct requested
language?
10 if it's what the URL is about,
@@ -118,6 +121,7 @@ class InfoExtractor(object):
The following fields are optional:
+ alt_title: A secondary title of the video.
display_id An alternative identifier for the video, not necessarily
unique, but available before title. Typically, id is
something like "4234987", title "Dancing naked mole rats",
@@ -129,7 +133,7 @@ class InfoExtractor(object):
* "resolution" (optional, string "{width}x{height"},
deprecated)
thumbnail: Full URL to a video thumbnail image.
- description: One-line video description.
+ description: Full video description.
uploader: Full name of the video uploader.
timestamp: UNIX timestamp of the moment the video became available.
upload_date: Video upload date (YYYYMMDD).
@@ -143,6 +147,17 @@ class InfoExtractor(object):
like_count: Number of positive ratings of the video
dislike_count: Number of negative ratings of the video
comment_count: Number of comments on the video
+ comments: A list of comments, each with one or more of the following
+ properties (all but one of text or html optional):
+ * "author" - human-readable name of the comment author
+ * "author_id" - user ID of the comment author
+ * "id" - Comment ID
+ * "html" - Comment as HTML
+ * "text" - Plain text of the comment
+ * "timestamp" - UNIX timestamp of comment
+ * "parent" - ID of the comment this one is replying to.
+ Set to "root" to indicate that this is a
+ comment to the original video.
age_limit: Age restriction for the video, as an integer (years)
webpage_url: The url to the video webpage, if given to youtube-dl it
should allow to get the same result again. (It will be set
@@ -158,8 +173,8 @@ class InfoExtractor(object):
_type "playlist" indicates multiple videos.
- There must be a key "entries", which is a list or a PagedList object, each
- element of which is a valid dictionary under this specfication.
+ There must be a key "entries", which is a list, an iterable, or a PagedList
+ object, each element of which is a valid dictionary by this specification.
Additionally, playlists can have "title" and "id" attributes with the same
semantics as videos (see above).
@@ -174,9 +189,10 @@ class InfoExtractor(object):
_type "url" indicates that the video must be extracted from another
location, possibly by a different extractor. Its only required key is:
"url" - the next URL to extract.
-
- Additionally, it may have properties believed to be identical to the
- resolved entity, for example "title" if the title of the referred video is
+ The key "ie_key" can be set to the class name (minus the trailing "IE",
+ e.g. "Youtube") if the extractor class is known in advance.
+ Additionally, the dictionary may have any properties of the resolved entity
+ known in advance, for example "title" if the title of the referred video is
known ahead of time.
@@ -400,6 +416,10 @@ class InfoExtractor(object):
url_or_request, video_id, note, errnote, fatal=fatal)
if (not fatal) and json_string is False:
return None
+ return self._parse_json(
+ json_string, video_id, transform_source=transform_source, fatal=fatal)
+
+ def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
if transform_source:
json_string = transform_source(json_string)
try:
@@ -449,7 +469,7 @@ class InfoExtractor(object):
return video_info
@staticmethod
- def playlist_result(entries, playlist_id=None, playlist_title=None):
+ def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
"""Returns a playlist"""
video_info = {'_type': 'playlist',
'entries': entries}
@@ -457,6 +477,8 @@ class InfoExtractor(object):
video_info['id'] = playlist_id
if playlist_title:
video_info['title'] = playlist_title
+ if playlist_description:
+ video_info['description'] = playlist_description
return video_info
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
@@ -591,9 +613,9 @@ class InfoExtractor(object):
if display_name is None:
display_name = name
return self._html_search_regex(
- r'''(?ix)<meta
+ r'''(?isx)<meta
(?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
- [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name),
+ [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
html, display_name, fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
@@ -800,6 +822,49 @@ class InfoExtractor(object):
self._sort_formats(formats)
return formats
+    # TODO: improve extraction
+    def _extract_smil_formats(self, smil_url, video_id):
+        """Download a SMIL manifest and return a sorted list of format dicts.
+
+        Only the <video> children of ./body/switch are examined.  Entries
+        whose protocol is "m3u8" are expanded via _extract_m3u8_formats and
+        entries whose protocol is "rtmp" become RTMP formats.  Entries with
+        any other protocol (including "http") currently produce no format,
+        which is what the TODO above refers to.
+        """
+        smil = self._download_xml(
+            smil_url, video_id, 'Downloading SMIL file',
+            'Unable to download SMIL file')
+
+        # The base URL is optional; a manifest without <head><meta> must not
+        # abort the extraction with an AttributeError on None.
+        meta = smil.find('./head/meta')
+        base = meta.get('base') if meta is not None else None
+
+        formats = []
+        rtmp_count = 0
+        for video in smil.findall('./body/switch/video'):
+            src = video.get('src')
+            if not src:
+                continue
+            # Both attribute spellings are accepted; 1000 is passed as the
+            # second argument of int_or_none (presumably bps -> kbps, confirm).
+            bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+            width = int_or_none(video.get('width'))
+            height = int_or_none(video.get('height'))
+            proto = video.get('proto')
+            # Without an explicit protocol, fall back to the scheme of base.
+            if not proto and base:
+                if base.startswith('rtmp'):
+                    proto = 'rtmp'
+                elif base.startswith('http'):
+                    proto = 'http'
+            ext = video.get('ext')
+            if proto == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(src, video_id, ext))
+            elif proto == 'rtmp':
+                rtmp_count += 1
+                streamer = video.get('streamer') or base
+                formats.append({
+                    'url': streamer,
+                    'play_path': src,
+                    'ext': 'flv',
+                    # Use a running counter when no bitrate is available.
+                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
+                    'tbr': bitrate,
+                    'width': width,
+                    'height': height,
+                })
+        self._sort_formats(formats)
+
+        return formats
+
def _live_title(self, name):
""" Generate the title for a live video """
now = datetime.datetime.now()
@@ -829,10 +894,40 @@ class InfoExtractor(object):
return res
def _set_cookie(self, domain, name, value, expire_time=None):
- cookie = compat_cookiejar.Cookie(0, name, value, None, None, domain, None,
+ cookie = compat_cookiejar.Cookie(
+ 0, name, value, None, None, domain, None,
None, '/', True, False, expire_time, '', None, None, None)
self._downloader.cookiejar.set_cookie(cookie)
+    def get_testcases(self, include_onlymatching=False):
+        """Yield the test case dicts declared on this extractor.
+
+        Test cases come from _TEST (a single dict) or _TESTS (a list); an
+        extractor must not define both.  Cases flagged 'only_matching' are
+        skipped unless include_onlymatching is true.  Each yielded dict gets
+        a 'name' key set in place to the class name minus its last two
+        characters (the "IE" suffix).
+        """
+        class_name = type(self).__name__
+        single_case = getattr(self, '_TEST', None)
+        if single_case:
+            assert not hasattr(self, '_TESTS'), \
+                '%s has _TEST and _TESTS' % class_name
+            testcases = [single_case]
+        else:
+            testcases = getattr(self, '_TESTS', [])
+        for testcase in testcases:
+            skip = not include_onlymatching and testcase.get('only_matching', False)
+            if skip:
+                continue
+            testcase['name'] = class_name[:-len('IE')]
+            yield testcase
+
+    def is_suitable(self, age_limit):
+        """ Test whether the extractor is generally suitable for the given
+        age limit (i.e. pornographic sites are not, all others usually are).
+
+        The decision is derived from the extractor's own test cases: it is
+        suitable as soon as one test case is not age restricted, and also
+        when there are no test cases at all. """
+
+        saw_restricted = False
+        for testcase in self.get_testcases(include_onlymatching=False):
+            # For playlist tests only the first entry is inspected.
+            if 'playlist' in testcase:
+                testcase = testcase['playlist'][0]
+            info_dict = testcase.get('info_dict', {})
+            if not age_restricted(info_dict.get('age_limit'), age_limit):
+                return True
+            saw_restricted = True
+        return not saw_restricted
+
class SearchInfoExtractor(InfoExtractor):
"""
diff --git a/youtube_dl/extractor/commonmistakes.py b/youtube_dl/extractor/commonmistakes.py
new file mode 100644
index 000000000..75c06903f
--- /dev/null
+++ b/youtube_dl/extractor/commonmistakes.py
@@ -0,0 +1,29 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class CommonMistakesIE(InfoExtractor):
+    """Pseudo-extractor for the literal argument "url" / "URL".
+
+    It never extracts anything: _real_extract always raises an expected
+    ExtractorError telling the user to remove the bogus parameter.
+    """
+    IE_DESC = False  # Do not list
+    _VALID_URL = r'''(?x)
+        (?:url|URL)
+    '''
+
+    _TESTS = [{
+        'url': 'url',
+        'only_matching': True,
+    }, {
+        'url': 'URL',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Raise an expected ExtractorError explaining the mistake."""
+        msg = (
+            'You\'ve asked youtube-dl to download the URL "%s". '
+            'That doesn\'t make any sense. '
+            'Simply remove the parameter in your command or configuration.'
+        ) % url
+        # Suggesting -v only makes sense when verbose output is not already
+        # enabled; the original condition was inverted.
+        if not self._downloader.params.get('verbose'):
+            msg += ' Add -v to the command line to see what arguments and configuration youtube-dl got.'
+        raise ExtractorError(msg, expected=True)
diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py
index 7a7e79360..3db4db4e4 100644
--- a/youtube_dl/extractor/condenast.py
+++ b/youtube_dl/extractor/condenast.py
@@ -5,12 +5,14 @@ import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
- orderedSet,
compat_urllib_parse_urlparse,
compat_urlparse,
)
+from ..utils import (
+ orderedSet,
+)
class CondeNastIE(InfoExtractor):
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index d7e2b841e..1680f532f 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -10,10 +10,12 @@ import xml.etree.ElementTree
from hashlib import sha1
from math import pow, sqrt, floor
from .subtitles import SubtitlesInfoExtractor
-from ..utils import (
- ExtractorError,
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
+)
+from ..utils import (
+ ExtractorError,
bytes_to_intlist,
intlist_to_bytes,
unified_strdate,
@@ -27,10 +29,9 @@ from .common import InfoExtractor
class CrunchyrollIE(SubtitlesInfoExtractor):
- _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
- _TEST = {
+ _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
+ _TESTS = [{
'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
- #'md5': 'b1639fd6ddfaa43788c85f6d1dddd412',
'info_dict': {
'id': '645513',
'ext': 'flv',
@@ -45,7 +46,10 @@ class CrunchyrollIE(SubtitlesInfoExtractor):
# rtmp
'skip_download': True,
},
- }
+ }, {
+ 'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697',
+ 'only_matching': True,
+ }]
_FORMAT_IDS = {
'360': ('60', '106'),
@@ -224,7 +228,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, 'thumbnail', fatal=False)
formats = []
- for fmt in re.findall(r'\?p([0-9]{3,4})=1', webpage):
+ for fmt in re.findall(r'showmedia\.([0-9]{3,4})p', webpage):
stream_quality, stream_format = self._FORMAT_IDS[fmt]
video_format = fmt + 'p'
streamdata_req = compat_urllib_request.Request('http://www.crunchyroll.com/xml/')
diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py
index 541106684..955119d40 100644
--- a/youtube_dl/extractor/cspan.py
+++ b/youtube_dl/extractor/cspan.py
@@ -27,7 +27,6 @@ class CSpanIE(InfoExtractor):
'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models',
# For whatever reason, the served video alternates between
# two different ones
- #'md5': 'dbb0f047376d457f2ab8b3929cbb2d0c',
'info_dict': {
'id': '340723',
'ext': 'mp4',
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 936c13cd6..cf5841a7c 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -8,13 +8,15 @@ import itertools
from .common import InfoExtractor
from .subtitles import SubtitlesInfoExtractor
-from ..utils import (
- compat_urllib_request,
+from ..compat import (
compat_str,
+ compat_urllib_request,
+)
+from ..utils import (
+ ExtractorError,
+ int_or_none,
orderedSet,
str_to_int,
- int_or_none,
- ExtractorError,
unescapeHTML,
)
diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py
index 45d66e2e6..934da765e 100644
--- a/youtube_dl/extractor/daum.py
+++ b/youtube_dl/extractor/daum.py
@@ -5,7 +5,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
)
@@ -38,7 +38,7 @@ class DaumIE(InfoExtractor):
canonical_url = 'http://tvpot.daum.net/v/%s' % video_id
webpage = self._download_webpage(canonical_url, video_id)
full_id = self._search_regex(
- r'<iframe src="http://videofarm.daum.net/controller/video/viewer/Video.html\?.*?vid=(.+?)[&"]',
+ r'src=["\']http://videofarm\.daum\.net/controller/video/viewer/Video\.html\?.*?vid=(.+?)[&"\']',
webpage, 'full id')
query = compat_urllib_parse.urlencode({'vid': full_id})
info = self._download_xml(
diff --git a/youtube_dl/extractor/dbtv.py b/youtube_dl/extractor/dbtv.py
index 1d3e2ff08..212217625 100644
--- a/youtube_dl/extractor/dbtv.py
+++ b/youtube_dl/extractor/dbtv.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
float_or_none,
int_or_none,
@@ -61,7 +62,7 @@ class DBTVIE(InfoExtractor):
self._sort_formats(formats)
return {
- 'id': video['id'],
+ 'id': compat_str(video['id']),
'display_id': display_id,
'title': video['title'],
'description': clean_html(video['desc']),
diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py
index 52c2d7ddf..d3e667528 100644
--- a/youtube_dl/extractor/discovery.py
+++ b/youtube_dl/extractor/discovery.py
@@ -1,47 +1,45 @@
from __future__ import unicode_literals
-import re
-import json
-
from .common import InfoExtractor
+from ..utils import (
+ parse_iso8601,
+ int_or_none,
+)
class DiscoveryIE(InfoExtractor):
- _VALID_URL = r'http://www\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9\-]*)(.htm)?'
+ _VALID_URL = r'http://www\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9_\-]*)(?:\.htm)?'
_TEST = {
'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm',
- 'md5': 'e12614f9ee303a6ccef415cb0793eba2',
+ 'md5': '3c69d77d9b0d82bfd5e5932a60f26504',
'info_dict': {
- 'id': '614784',
- 'ext': 'mp4',
- 'title': 'MythBusters: Mission Impossible Outtakes',
+ 'id': 'mission-impossible-outtakes',
+ 'ext': 'flv',
+ 'title': 'Mission Impossible Outtakes',
'description': ('Watch Jamie Hyneman and Adam Savage practice being'
' each other -- to the point of confusing Jamie\'s dog -- and '
'don\'t miss Adam moon-walking as Jamie ... behind Jamie\'s'
' back.'),
'duration': 156,
+ 'timestamp': 1303099200,
+ 'upload_date': '20110418',
},
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- video_list_json = self._search_regex(r'var videoListJSON = ({.*?});',
- webpage, 'video list', flags=re.DOTALL)
- video_list = json.loads(video_list_json)
- info = video_list['clips'][0]
- formats = []
- for f in info['mp4']:
- formats.append(
- {'url': f['src'], 'ext': 'mp4', 'tbr': int(f['bitrate'][:-1])})
+ info = self._parse_json(self._search_regex(
+ r'(?s)<script type="application/ld\+json">(.*?)</script>',
+ webpage, 'video info'), video_id)
return {
- 'id': info['contentId'],
- 'title': video_list['name'],
- 'formats': formats,
- 'description': info['videoCaption'],
- 'thumbnail': info.get('videoStillURL') or info.get('thumbnailURL'),
- 'duration': info['duration'],
+ 'id': video_id,
+ 'title': info['name'],
+ 'url': info['contentURL'],
+ 'description': info.get('description'),
+ 'thumbnail': info.get('thumbnailUrl'),
+ 'timestamp': parse_iso8601(info.get('uploadDate')),
+ 'duration': int_or_none(info.get('duration')),
}
diff --git a/youtube_dl/extractor/dvtv.py b/youtube_dl/extractor/dvtv.py
new file mode 100644
index 000000000..c1a4bc757
--- /dev/null
+++ b/youtube_dl/extractor/dvtv.py
@@ -0,0 +1,125 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ js_to_json,
+ unescapeHTML,
+ ExtractorError,
+)
+
+
+class DVTVIE(InfoExtractor):
+    """Extractor for video.aktualne.cz pages (single videos and playlists)."""
+    IE_NAME = 'dvtv'
+    IE_DESC = 'http://video.aktualne.cz/'
+
+    _VALID_URL = r'http://video\.aktualne\.cz/(?:[^/]+/)+r~(?P<id>[0-9a-f]{32})'
+
+    _TESTS = [{
+        'url': 'http://video.aktualne.cz/dvtv/vondra-o-ceskem-stoleti-pri-pohledu-na-havla-mi-bylo-trapne/r~e5efe9ca855511e4833a0025900fea04/',
+        'md5': '67cb83e4a955d36e1b5d31993134a0c2',
+        'info_dict': {
+            'id': 'dc0768de855511e49e4b0025900fea04',
+            'ext': 'mp4',
+            'title': 'Vondra o Českém století: Při pohledu na Havla mi bylo trapně',
+        }
+    }, {
+        'url': 'http://video.aktualne.cz/dvtv/stropnicky-policie-vrbetice-preventivne-nekontrolovala/r~82ed4322849211e4a10c0025900fea04/',
+        'md5': '6388f1941b48537dbd28791f712af8bf',
+        'info_dict': {
+            'id': '72c02230849211e49f60002590604f2e',
+            'ext': 'mp4',
+            'title': 'Stropnický: Policie Vrbětice preventivně nekontrolovala',
+        }
+    }, {
+        'url': 'http://video.aktualne.cz/dvtv/dvtv-16-12-2014-utok-talibanu-boj-o-kliniku-uprchlici/r~973eb3bc854e11e498be002590604f2e/',
+        'info_dict': {
+            'title': 'DVTV 16. 12. 2014: útok Talibanu, boj o kliniku, uprchlíci',
+            'id': '973eb3bc854e11e498be002590604f2e',
+        },
+        'playlist': [{
+            'md5': 'da7ca6be4935532241fa9520b3ad91e4',
+            'info_dict': {
+                'id': 'b0b40906854d11e4bdad0025900fea04',
+                'ext': 'mp4',
+                'title': 'Drtinová Veselovský TV 16. 12. 2014: Témata dne'
+            }
+        }, {
+            'md5': '5f7652a08b05009c1292317b449ffea2',
+            'info_dict': {
+                'id': '420ad9ec854a11e4bdad0025900fea04',
+                'ext': 'mp4',
+                'title': 'Školní masakr možná změní boj s Talibanem, říká novinářka'
+            }
+        }, {
+            'md5': '498eb9dfa97169f409126c617e2a3d64',
+            'info_dict': {
+                'id': '95d35580846a11e4b6d20025900fea04',
+                'ext': 'mp4',
+                'title': 'Boj o kliniku: Veřejný zájem, nebo právo na majetek?'
+            }
+        }, {
+            'md5': 'b8dc6b744844032dab6ba3781a7274b9',
+            'info_dict': {
+                'id': '6fe14d66853511e4833a0025900fea04',
+                'ext': 'mp4',
+                'title': 'Pánek: Odmítání syrských uprchlíků je ostudou české vlády'
+            }
+        }],
+    }, {
+        'url': 'http://video.aktualne.cz/v-cechach-poprve-zazni-zelenkova-zrestaurovana-mse/r~45b4b00483ec11e4883b002590604f2e/',
+        'only_matching': True,
+    }]
+
+    def _parse_video_metadata(self, js, video_id):
+        """Turn one JavaScript asset object literal into an info dict.
+
+        js is converted with js_to_json and parsed; every item of its
+        'sources' list becomes one format.  The returned 'id' is the
+        'mediaid' from the metadata, not the video_id argument (which is
+        only handed to _parse_json).
+        """
+        metadata = self._parse_json(js, video_id, transform_source=js_to_json)
+
+        formats = []
+        for video in metadata['sources']:
+            # Drops the first six characters of the type string
+            # (presumably a "video/" MIME prefix -- confirm against the site).
+            ext = video['type'][6:]
+            formats.append({
+                'url': video['file'],
+                'ext': ext,
+                'format_id': '%s-%s' % (ext, video['label']),
+                # Labels are expected to look like "720p".
+                'height': int(video['label'].rstrip('p')),
+                'fps': 25,
+            })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': metadata['mediaid'],
+            'title': unescapeHTML(metadata['title']),
+            'thumbnail': self._proto_relative_url(metadata['image'], 'http:'),
+            'formats': formats
+        }
+
+    def _real_extract(self, url):
+        """Return a single video, or a playlist when the page embeds several.
+
+        Raises ExtractorError when neither kind of embed is found.
+        """
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        # single video (default=None already makes a miss non-fatal)
+        item = self._search_regex(
+            r"(?s)embedData[0-9a-f]{32}\['asset'\]\s*=\s*(\{.+?\});",
+            webpage, 'video', default=None)
+
+        if item:
+            return self._parse_video_metadata(item, video_id)
+
+        # playlist
+        items = re.findall(
+            r"(?s)BBX\.context\.assets\['[0-9a-f]{32}'\]\.push\(({.+?})\);",
+            webpage)
+
+        if items:
+            return {
+                '_type': 'playlist',
+                'id': video_id,
+                'title': self._og_search_title(webpage),
+                'entries': [self._parse_video_metadata(i, video_id) for i in items]
+            }
+
+        raise ExtractorError('Could not find either video or playlist')
diff --git a/youtube_dl/extractor/ebaumsworld.py b/youtube_dl/extractor/ebaumsworld.py
index 63c2549d3..b6bfd2b2d 100644
--- a/youtube_dl/extractor/ebaumsworld.py
+++ b/youtube_dl/extractor/ebaumsworld.py
@@ -1,7 +1,5 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
@@ -20,8 +18,7 @@ class EbaumsWorldIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
config = self._download_xml(
'http://www.ebaumsworld.com/video/player/%s' % video_id, video_id)
video_url = config.find('file').text
diff --git a/youtube_dl/extractor/echomsk.py b/youtube_dl/extractor/echomsk.py
new file mode 100644
index 000000000..d2d94049d
--- /dev/null
+++ b/youtube_dl/extractor/echomsk.py
@@ -0,0 +1,46 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class EchoMskIE(InfoExtractor):
+    """Extractor for audio recordings at echo.msk.ru/sounds/<id>."""
+    _VALID_URL = r'http://(?:www\.)?echo\.msk\.ru/sounds/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.echo.msk.ru/sounds/1464134.html',
+        'md5': '2e44b3b78daff5b458e4dbc37f191f7c',
+        'info_dict': {
+            'id': '1464134',
+            'ext': 'mp3',
+            'title': 'Особое мнение - 29 декабря 2014, 19:08',
+        },
+    }
+
+    def _real_extract(self, url):
+        """Return id, audio URL and title; the air date, if found, is
+        appended to the programme title."""
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        audio_url = self._search_regex(
+            r'<a rel="mp3" href="([^"]+)">', webpage, 'audio URL')
+        programme = self._html_search_regex(
+            r'<a href="/programs/[^"]+" target="_blank">([^<]+)</a>',
+            webpage, 'title')
+        raw_date = self._html_search_regex(
+            r'(?s)<div class="date">(.+?)</div>',
+            webpage, 'date', fatal=False, default=None)
+
+        title = programme
+        if raw_date:
+            # Collapse runs of one repeated whitespace character; a
+            # non-empty date stays non-empty after this substitution.
+            air_date = re.sub(r'(\s)\1+', r'\1', raw_date)
+            title = '%s - %s' % (programme, air_date)
+
+        result = {'id': video_id}
+        result['url'] = audio_url
+        result['title'] = title
+        return result
diff --git a/youtube_dl/extractor/ehow.py b/youtube_dl/extractor/ehow.py
index b766e17f2..9cb1bf301 100644
--- a/youtube_dl/extractor/ehow.py
+++ b/youtube_dl/extractor/ehow.py
@@ -1,8 +1,6 @@
from __future__ import unicode_literals
-import re
-
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
)
from .common import InfoExtractor
@@ -24,11 +22,10 @@ class EHowIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- video_url = self._search_regex(r'(?:file|source)=(http[^\'"&]*)',
- webpage, 'video URL')
+ video_url = self._search_regex(
+ r'(?:file|source)=(http[^\'"&]*)', webpage, 'video URL')
final_url = compat_urllib_parse.unquote(video_url)
uploader = self._html_search_meta('uploader', webpage)
title = self._og_search_title(webpage).replace(' | eHow', '')
diff --git a/youtube_dl/extractor/eighttracks.py b/youtube_dl/extractor/eighttracks.py
index f4c1e2a72..a30a1f330 100644
--- a/youtube_dl/extractor/eighttracks.py
+++ b/youtube_dl/extractor/eighttracks.py
@@ -6,7 +6,7 @@ import random
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_str,
)
diff --git a/youtube_dl/extractor/ellentv.py b/youtube_dl/extractor/ellentv.py
index 3e7923648..fc92ff825 100644
--- a/youtube_dl/extractor/ellentv.py
+++ b/youtube_dl/extractor/ellentv.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
import json
from .common import InfoExtractor
@@ -12,32 +11,49 @@ from ..utils import (
class EllenTVIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?ellentv\.com/videos/(?P<id>[a-z0-9_-]+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?(?:ellentv|ellentube)\.com/videos/(?P<id>[a-z0-9_-]+)'
+ _TESTS = [{
'url': 'http://www.ellentv.com/videos/0-7jqrsr18/',
'md5': 'e4af06f3bf0d5f471921a18db5764642',
'info_dict': {
'id': '0-7jqrsr18',
'ext': 'mp4',
'title': 'What\'s Wrong with These Photos? A Whole Lot',
+ 'description': 'md5:35f152dc66b587cf13e6d2cf4fa467f6',
'timestamp': 1406876400,
'upload_date': '20140801',
}
- }
+ }, {
+ 'url': 'http://ellentube.com/videos/0-dvzmabd5/',
+ 'md5': '98238118eaa2bbdf6ad7f708e3e4f4eb',
+ 'info_dict': {
+ 'id': '0-dvzmabd5',
+ 'ext': 'mp4',
+ 'title': '1 year old twin sister makes her brother laugh',
+ 'description': '1 year old twin sister makes her brother laugh',
+ 'timestamp': 1419542075,
+ 'upload_date': '20141225',
+ }
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+ video_url = self._html_search_meta('VideoURL', webpage, 'url')
+ title = self._og_search_title(webpage, default=None) or self._search_regex(
+ r'pageName\s*=\s*"([^"]+)"', webpage, 'title')
+ description = self._html_search_meta(
+ 'description', webpage, 'description') or self._og_search_description(webpage)
timestamp = parse_iso8601(self._search_regex(
r'<span class="publish-date"><time datetime="([^"]+)">',
webpage, 'timestamp'))
return {
'id': video_id,
- 'title': self._og_search_title(webpage),
- 'url': self._html_search_meta('VideoURL', webpage, 'url'),
+ 'url': video_url,
+ 'title': title,
+ 'description': description,
'timestamp': timestamp,
}
@@ -55,8 +71,7 @@ class EllenTVClipsIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- playlist_id = mobj.group('id')
+ playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
playlist = self._extract_playlist(webpage)
diff --git a/youtube_dl/extractor/elpais.py b/youtube_dl/extractor/elpais.py
index 4277202a2..00a69e631 100644
--- a/youtube_dl/extractor/elpais.py
+++ b/youtube_dl/extractor/elpais.py
@@ -1,8 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..utils import unified_strdate
@@ -24,9 +22,7 @@ class ElPaisIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
prefix = self._html_search_regex(
diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py
index 92ada81d2..4ea37ebd9 100644
--- a/youtube_dl/extractor/engadget.py
+++ b/youtube_dl/extractor/engadget.py
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from .fivemin import FiveMinIE
from ..utils import (
url_basename,
)
@@ -27,11 +26,10 @@ class EngadgetIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
if video_id is not None:
- return FiveMinIE._build_result(video_id)
+ return self.url_result('5min:%s' % video_id)
else:
title = url_basename(url)
webpage = self._download_webpage(url, title)
@@ -39,5 +37,5 @@ class EngadgetIE(InfoExtractor):
return {
'_type': 'playlist',
'title': title,
- 'entries': [FiveMinIE._build_result(id) for id in ids]
+ 'entries': [self.url_result('5min:%s' % vid) for vid in ids]
}
diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py
new file mode 100644
index 000000000..79e2fbd39
--- /dev/null
+++ b/youtube_dl/extractor/eroprofile.py
@@ -0,0 +1,45 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class EroProfileIE(InfoExtractor):
+    """Extractor for eroprofile.com video pages (always age_limit 18)."""
+    _VALID_URL = r'https?://(?:www\.)?eroprofile\.com/m/videos/view/(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'http://www.eroprofile.com/m/videos/view/sexy-babe-softcore',
+        'md5': 'c26f351332edf23e1ea28ce9ec9de32f',
+        'info_dict': {
+            'id': '3733775',
+            'display_id': 'sexy-babe-softcore',
+            'ext': 'm4v',
+            'title': 'sexy babe softcore',
+            # Raw string: "\." is not a valid escape in a normal literal.
+            'thumbnail': r're:https?://.*\.jpg',
+            'age_limit': 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        """Scrape the video page for the numeric id, media URL, title and
+        thumbnail.  The slug matched from the URL is kept as display_id."""
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        # Two alternative patterns for the numeric id; with default=None a
+        # miss yields None instead of raising.
+        video_id = self._search_regex(
+            [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'],
+            webpage, 'video id', default=None)
+
+        video_url = self._search_regex(
+            r'<source src="([^"]+)', webpage, 'video url')
+        title = self._html_search_regex(
+            r'Title:</th><td>([^<]+)</td>', webpage, 'title')
+        thumbnail = self._search_regex(
+            r'onclick="showVideoPlayer\(\)"><img src="([^"]+)',
+            webpage, 'thumbnail', fatal=False)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'url': video_url,
+            'title': title,
+            'thumbnail': thumbnail,
+            'age_limit': 18,
+        }
diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py
index 476fc22b9..e240cb859 100644
--- a/youtube_dl/extractor/escapist.py
+++ b/youtube_dl/extractor/escapist.py
@@ -3,9 +3,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
-
+)
+from ..utils import (
ExtractorError,
)
diff --git a/youtube_dl/extractor/everyonesmixtape.py b/youtube_dl/extractor/everyonesmixtape.py
index d237a8281..d872d828f 100644
--- a/youtube_dl/extractor/everyonesmixtape.py
+++ b/youtube_dl/extractor/everyonesmixtape.py
@@ -3,8 +3,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_request,
+)
+from ..utils import (
ExtractorError,
)
diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py
index aacbf1414..36ba33128 100644
--- a/youtube_dl/extractor/extremetube.py
+++ b/youtube_dl/extractor/extremetube.py
@@ -3,16 +3,18 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse_urlparse,
compat_urllib_request,
compat_urllib_parse,
+)
+from ..utils import (
str_to_int,
)
class ExtremeTubeIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>extremetube\.com/.*?video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)'
+ _VALID_URL = r'https?://(?:www\.)?(?P<url>extremetube\.com/.*?video/.+?(?P<id>[0-9]+))(?:[/?&]|$)'
_TESTS = [{
'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431',
'md5': '1fb9228f5e3332ec8c057d6ac36f33e0',
@@ -31,7 +33,7 @@ class ExtremeTubeIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('videoid')
+ video_id = mobj.group('id')
url = 'http://www.' + mobj.group('url')
req = compat_urllib_request.Request(url)
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index 2139f68aa..1ad4e77a8 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -13,9 +13,10 @@ from ..compat import (
compat_urllib_request,
)
from ..utils import (
- urlencode_postdata,
ExtractorError,
+ int_or_none,
limit_length,
+ urlencode_postdata,
)
@@ -36,7 +37,6 @@ class FacebookIE(InfoExtractor):
'info_dict': {
'id': '637842556329505',
'ext': 'mp4',
- 'duration': 38,
'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam',
}
}, {
@@ -107,9 +107,7 @@ class FacebookIE(InfoExtractor):
self._login()
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
webpage = self._download_webpage(url, video_id)
@@ -149,6 +147,6 @@ class FacebookIE(InfoExtractor):
'id': video_id,
'title': video_title,
'url': video_url,
- 'duration': int(video_data['video_duration']),
- 'thumbnail': video_data['thumbnail_src'],
+ 'duration': int_or_none(video_data.get('video_duration')),
+ 'thumbnail': video_data.get('thumbnail_src'),
}
diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py
index 6f5d23559..81ceace53 100644
--- a/youtube_dl/extractor/fc2.py
+++ b/youtube_dl/extractor/fc2.py
@@ -1,19 +1,20 @@
#! -*- coding: utf-8 -*-
from __future__ import unicode_literals
-import re
import hashlib
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
+from ..compat import (
compat_urllib_request,
compat_urlparse,
)
+from ..utils import (
+ ExtractorError,
+)
class FC2IE(InfoExtractor):
- _VALID_URL = r'^http://video\.fc2\.com/((?P<lang>[^/]+)/)?content/(?P<id>[^/]+)'
+ _VALID_URL = r'^http://video\.fc2\.com/(?:[^/]+/)?content/(?P<id>[^/]+)'
IE_NAME = 'fc2'
_TEST = {
'url': 'http://video.fc2.com/en/content/20121103kUan1KHs',
@@ -26,9 +27,7 @@ class FC2IE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
self._downloader.cookiejar.clear_session_cookies() # must clear
diff --git a/youtube_dl/extractor/firedrive.py b/youtube_dl/extractor/firedrive.py
index af439ccfe..3191116d9 100644
--- a/youtube_dl/extractor/firedrive.py
+++ b/youtube_dl/extractor/firedrive.py
@@ -4,11 +4,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
)
+from ..utils import (
+ ExtractorError,
+)
class FiredriveIE(InfoExtractor):
@@ -28,11 +30,8 @@ class FiredriveIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
url = 'http://firedrive.com/file/%s' % video_id
-
webpage = self._download_webpage(url, video_id)
if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py
index f9c127ce6..5b24b921c 100644
--- a/youtube_dl/extractor/fivemin.py
+++ b/youtube_dl/extractor/fivemin.py
@@ -1,11 +1,11 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_str,
compat_urllib_parse,
+)
+from ..utils import (
ExtractorError,
)
@@ -13,7 +13,7 @@ from ..utils import (
class FiveMinIE(InfoExtractor):
IE_NAME = '5min'
_VALID_URL = r'''(?x)
- (?:https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(.*?&)?playList=|
+ (?:https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?:.*?&)?playList=|
5min:)
(?P<id>\d+)
'''
@@ -41,13 +41,8 @@ class FiveMinIE(InfoExtractor):
},
]
- @classmethod
- def _build_result(cls, video_id):
- return cls.url_result('5min:%s' % video_id, cls.ie_key())
-
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
embed_url = 'https://embed.5min.com/playerseed/?playList=%s' % video_id
embed_page = self._download_webpage(embed_url, video_id,
'Downloading embed page')
diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py
index d09d1c13a..190d9f9ad 100644
--- a/youtube_dl/extractor/fktv.py
+++ b/youtube_dl/extractor/fktv.py
@@ -13,7 +13,7 @@ from ..utils import (
class FKTVIE(InfoExtractor):
IE_NAME = 'fernsehkritik.tv'
- _VALID_URL = r'http://(?:www\.)?fernsehkritik\.tv/folge-(?P<ep>[0-9]+)(?:/.*)?'
+ _VALID_URL = r'http://(?:www\.)?fernsehkritik\.tv/folge-(?P<id>[0-9]+)(?:/.*)?'
_TEST = {
'url': 'http://fernsehkritik.tv/folge-1',
@@ -26,29 +26,32 @@ class FKTVIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- episode = int(mobj.group('ep'))
+ episode = int(self._match_id(url))
- server = random.randint(2, 4)
- video_thumbnail = 'http://fernsehkritik.tv/images/magazin/folge%d.jpg' % episode
- start_webpage = self._download_webpage('http://fernsehkritik.tv/folge-%d/Start' % episode,
+ video_thumbnail = 'http://fernsehkritik.tv/images/magazin/folge%s.jpg' % episode
+ start_webpage = self._download_webpage('http://fernsehkritik.tv/folge-%s/Start' % episode,
episode)
playlist = self._search_regex(r'playlist = (\[.*?\]);', start_webpage,
'playlist', flags=re.DOTALL)
files = json.loads(re.sub('{[^{}]*?}', '{}', playlist))
- # TODO: return a single multipart video
+
videos = []
for i, _ in enumerate(files, 1):
video_id = '%04d%d' % (episode, i)
- video_url = 'http://dl%d.fernsehkritik.tv/fernsehkritik%d%s.flv' % (server, episode, '' if i == 1 else '-%d' % i)
+ video_url = 'http://fernsehkritik.tv/js/directme.php?file=%s%s.flv' % (episode, '' if i == 1 else '-%d' % i)
videos.append({
+ 'ext': 'flv',
'id': video_id,
'url': video_url,
'title': clean_html(get_element_by_id('eptitle', start_webpage)),
'description': clean_html(get_element_by_id('contentlist', start_webpage)),
'thumbnail': video_thumbnail
})
- return videos
+ return {
+ '_type': 'multi_video',
+ 'entries': videos,
+ 'id': 'folge-%s' % episode,
+ }
class FKTVPosteckeIE(InfoExtractor):
diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py
index b22ce2acb..7187e0752 100644
--- a/youtube_dl/extractor/fourtube.py
+++ b/youtube_dl/extractor/fourtube.py
@@ -3,12 +3,14 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_request,
- unified_strdate,
- str_to_int,
- parse_duration,
+)
+from ..utils import (
clean_html,
+ parse_duration,
+ str_to_int,
+ unified_strdate,
)
@@ -31,9 +33,7 @@ class FourTubeIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
-
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage_url = 'http://www.4tube.com/videos/' + video_id
webpage = self._download_webpage(webpage_url, video_id)
diff --git a/youtube_dl/extractor/foxgay.py b/youtube_dl/extractor/foxgay.py
new file mode 100644
index 000000000..08b8ea362
--- /dev/null
+++ b/youtube_dl/extractor/foxgay.py
@@ -0,0 +1,48 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class FoxgayIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?foxgay\.com/videos/(?:\S+-)?(?P<id>\d+)\.shtml'
+ _TEST = {
+ 'url': 'http://foxgay.com/videos/fuck-turkish-style-2582.shtml',
+ 'md5': '80d72beab5d04e1655a56ad37afe6841',
+ 'info_dict': {
+ 'id': '2582',
+ 'ext': 'mp4',
+ 'title': 'md5:6122f7ae0fc6b21ebdf59c5e083ce25a',
+ 'description': 'md5:5e51dc4405f1fd315f7927daed2ce5cf',
+ 'age_limit': 18,
+ 'thumbnail': 're:https?://.*\.jpg$',
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_search_regex(
+ r'<title>(?P<title>.*?)</title>',
+ webpage, 'title', fatal=False)
+ description = self._html_search_regex(
+ r'<div class="ico_desc"><h2>(?P<description>.*?)</h2>',
+ webpage, 'description', fatal=False)
+
+ # Find the URL for the iFrame which contains the actual video.
+ iframe = self._download_webpage(
+ self._html_search_regex(r'iframe src="(?P<frame>.*?)"', webpage, 'video frame'),
+ video_id)
+ video_url = self._html_search_regex(
+ r"v_path = '(?P<vid>http://.*?)'", iframe, 'url')
+ thumb_url = self._html_search_regex(
+ r"t_path = '(?P<thumb>http://.*?)'", iframe, 'thumbnail', fatal=False)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': video_url,
+ 'description': description,
+ 'thumbnail': thumb_url,
+ 'age_limit': 18,
+ }
diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py
new file mode 100644
index 000000000..917f76b1e
--- /dev/null
+++ b/youtube_dl/extractor/foxnews.py
@@ -0,0 +1,94 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_iso8601,
+ int_or_none,
+)
+
+
+class FoxNewsIE(InfoExtractor):
+ _VALID_URL = r'https?://video\.foxnews\.com/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)'
+ _TESTS = [
+ {
+ 'url': 'http://video.foxnews.com/v/3937480/frozen-in-time/#sp=show-clips',
+ 'md5': '32aaded6ba3ef0d1c04e238d01031e5e',
+ 'info_dict': {
+ 'id': '3937480',
+ 'ext': 'flv',
+ 'title': 'Frozen in Time',
+ 'description': 'Doctors baffled by 16-year-old girl that is the size of a toddler',
+ 'duration': 265,
+ 'timestamp': 1304411491,
+ 'upload_date': '20110503',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ },
+ {
+ 'url': 'http://video.foxnews.com/v/3922535568001/rep-luis-gutierrez-on-if-obamas-immigration-plan-is-legal/#sp=show-clips',
+ 'md5': '5846c64a1ea05ec78175421b8323e2df',
+ 'info_dict': {
+ 'id': '3922535568001',
+ 'ext': 'mp4',
+ 'title': "Rep. Luis Gutierrez on if Obama's immigration plan is legal",
+ 'description': "Congressman discusses the president's executive action",
+ 'duration': 292,
+ 'timestamp': 1417662047,
+ 'upload_date': '20141204',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ },
+ {
+ 'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com',
+ 'only_matching': True,
+ },
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ video = self._download_json(
+ 'http://video.foxnews.com/v/feed/video/%s.js?template=fox' % video_id, video_id)
+
+ item = video['channel']['item']
+ title = item['title']
+ description = item['description']
+ timestamp = parse_iso8601(item['dc-date'])
+
+ media_group = item['media-group']
+ duration = None
+ formats = []
+ for media in media_group['media-content']:
+ attributes = media['@attributes']
+ video_url = attributes['url']
+ if video_url.endswith('.f4m'):
+ formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id))
+ elif video_url.endswith('.m3u8'):
+ formats.extend(self._extract_m3u8_formats(video_url, video_id, 'flv'))
+ elif not video_url.endswith('.smil'):
+ duration = int_or_none(attributes.get('duration'))
+ formats.append({
+ 'url': video_url,
+ 'format_id': media['media-category']['@attributes']['label'],
+ 'preference': 1,
+ 'vbr': int_or_none(attributes.get('bitrate')),
+ 'filesize': int_or_none(attributes.get('fileSize'))
+ })
+ self._sort_formats(formats)
+
+ media_thumbnail = media_group['media-thumbnail']['@attributes']
+ thumbnails = [{
+ 'url': media_thumbnail['url'],
+ 'width': int_or_none(media_thumbnail.get('width')),
+ 'height': int_or_none(media_thumbnail.get('height')),
+ }] if media_thumbnail else []
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ }
diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py
index 898e0dda7..0c2972162 100644
--- a/youtube_dl/extractor/franceculture.py
+++ b/youtube_dl/extractor/franceculture.py
@@ -5,7 +5,7 @@ import json
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_parse_qs,
compat_urlparse,
)
diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py
index e0420a48f..bbc760a49 100644
--- a/youtube_dl/extractor/francetv.py
+++ b/youtube_dl/extractor/francetv.py
@@ -6,13 +6,15 @@ import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
+ compat_urllib_parse_urlparse,
compat_urlparse,
- ExtractorError,
+)
+from ..utils import (
clean_html,
- parse_duration,
- compat_urllib_parse_urlparse,
+ ExtractorError,
int_or_none,
+ parse_duration,
)
diff --git a/youtube_dl/extractor/gameone.py b/youtube_dl/extractor/gameone.py
index 3022f539d..a07d69841 100644
--- a/youtube_dl/extractor/gameone.py
+++ b/youtube_dl/extractor/gameone.py
@@ -6,7 +6,9 @@ import re
from .common import InfoExtractor
from ..utils import (
xpath_with_ns,
- parse_iso8601
+ parse_iso8601,
+ float_or_none,
+ int_or_none,
)
NAMESPACE_MAP = {
@@ -21,25 +23,41 @@ RAW_MP4_URL = 'http://cdn.riptide-mtvn.com/'
class GameOneIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?gameone\.de/tv/(?P<id>\d+)'
- _TEST = {
- 'url': 'http://www.gameone.de/tv/288',
- 'md5': '136656b7fb4c9cb4a8e2d500651c499b',
- 'info_dict': {
- 'id': '288',
- 'ext': 'mp4',
- 'title': 'Game One - Folge 288',
- 'duration': 1238,
- 'thumbnail': 'http://s3.gameone.de/gameone/assets/video_metas/teaser_images/000/643/636/big/640x360.jpg',
- 'description': 'FIFA-Pressepokal 2014, Star Citizen, Kingdom Come: Deliverance, Project Cars, Schöner Trants Nerdquiz Folge 2 Runde 1',
- 'age_limit': 16,
- 'upload_date': '20140513',
- 'timestamp': 1399980122,
+ _TESTS = [
+ {
+ 'url': 'http://www.gameone.de/tv/288',
+ 'md5': '136656b7fb4c9cb4a8e2d500651c499b',
+ 'info_dict': {
+ 'id': '288',
+ 'ext': 'mp4',
+ 'title': 'Game One - Folge 288',
+ 'duration': 1238,
+ 'thumbnail': 'http://s3.gameone.de/gameone/assets/video_metas/teaser_images/000/643/636/big/640x360.jpg',
+ 'description': 'FIFA-Pressepokal 2014, Star Citizen, Kingdom Come: Deliverance, Project Cars, Schöner Trants Nerdquiz Folge 2 Runde 1',
+ 'age_limit': 16,
+ 'upload_date': '20140513',
+ 'timestamp': 1399980122,
+ }
+ },
+ {
+ 'url': 'http://gameone.de/tv/220',
+ 'md5': '5227ca74c4ae6b5f74c0510a7c48839e',
+ 'info_dict': {
+ 'id': '220',
+ 'ext': 'mp4',
+ 'upload_date': '20120918',
+ 'description': 'Jet Set Radio HD, Tekken Tag Tournament 2, Source Filmmaker',
+ 'timestamp': 1347971451,
+ 'title': 'Game One - Folge 220',
+ 'duration': 896.62,
+ 'age_limit': 16,
+ }
}
- }
+
+ ]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
og_video = self._og_search_video_url(webpage, secure=False)
@@ -66,13 +84,13 @@ class GameOneIE(InfoExtractor):
video_id,
'Downloading media:content')
rendition_items = content.findall('.//rendition')
- duration = int(rendition_items[0].get('duration'))
+ duration = float_or_none(rendition_items[0].get('duration'))
formats = [
{
'url': re.sub(r'.*/(r2)', RAW_MP4_URL + r'\1', r.find('./src').text),
- 'width': int(r.get('width')),
- 'height': int(r.get('height')),
- 'tbr': int(r.get('bitrate')),
+ 'width': int_or_none(r.get('width')),
+ 'height': int_or_none(r.get('height')),
+ 'tbr': int_or_none(r.get('bitrate')),
}
for r in rendition_items
]
@@ -105,7 +123,8 @@ class GameOnePlaylistIE(InfoExtractor):
webpage = self._download_webpage('http://www.gameone.de/tv', 'TV')
max_id = max(map(int, re.findall(r'<a href="/tv/(\d+)"', webpage)))
entries = [
- self.url_result('http://www.gameone.de/tv/%d' % video_id, 'GameOne')
+ self.url_result('http://www.gameone.de/tv/%d' %
+ video_id, 'GameOne')
for video_id in range(max_id, 0, -1)]
return {
diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py
index d570e3f6a..47373e215 100644
--- a/youtube_dl/extractor/gamespot.py
+++ b/youtube_dl/extractor/gamespot.py
@@ -4,9 +4,11 @@ import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urlparse,
+)
+from ..utils import (
unescapeHTML,
)
diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py
index de14ae1fb..fed968f51 100644
--- a/youtube_dl/extractor/gdcvault.py
+++ b/youtube_dl/extractor/gdcvault.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
)
@@ -39,7 +39,8 @@ class GDCVaultIE(InfoExtractor):
'id': '1015301',
'ext': 'flv',
'title': 'Thexder Meets Windows 95, or Writing Great Games in the Windows 95 Environment',
- }
+ },
+ 'skip': 'Requires login',
}
]
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 328301de3..7a5bf9392 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -23,6 +23,7 @@ from ..utils import (
unescapeHTML,
unified_strdate,
unsmuggle_url,
+ UnsupportedError,
url_basename,
)
from .brightcove import BrightcoveIE
@@ -130,12 +131,13 @@ class GenericIE(InfoExtractor):
# ooyala video
{
'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
- 'md5': '5644c6ca5d5782c1d0d350dad9bd840c',
+ 'md5': '166dd577b433b4d4ebfee10b0824d8ff',
'info_dict': {
'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
'ext': 'mp4',
'title': '2cc213299525360.mov', # that's what we get
},
+ 'add_ie': ['Ooyala'],
},
# google redirect
{
@@ -145,7 +147,7 @@ class GenericIE(InfoExtractor):
'ext': 'mp4',
'upload_date': '20130224',
'uploader_id': 'TheVerge',
- 'description': 'Chris Ziegler takes a look at the Alcatel OneTouch Fire and the ZTE Open; two of the first Firefox OS handsets to be officially announced.',
+ 'description': 're:^Chris Ziegler takes a look at the\.*',
'uploader': 'The Verge',
'title': 'First Firefox OS phones side-by-side',
},
@@ -180,6 +182,14 @@ class GenericIE(InfoExtractor):
'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
},
},
+ # BBC iPlayer embeds
+ {
+ 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
+ 'info_dict': {
+ 'title': 'BBC - Blogs - Adam Curtis - BUGGER',
+ },
+ 'playlist_mincount': 18,
+ },
# RUTV embed
{
'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
@@ -467,8 +477,17 @@ class GenericIE(InfoExtractor):
'expected_warnings': [
'URL could be a direct video link, returning it as such.'
]
- }
-
+ },
+ # Cinchcast embed
+ {
+ 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
+ 'info_dict': {
+ 'id': '7141703',
+ 'ext': 'mp3',
+ 'upload_date': '20141126',
+ 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
+ }
+ },
]
def report_following_redirect(self, new_url):
@@ -689,9 +708,9 @@ class GenericIE(InfoExtractor):
r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
# Helper method
- def _playlist_from_matches(matches, getter, ie=None):
+ def _playlist_from_matches(matches, getter=None, ie=None):
urlrs = orderedSet(
- self.url_result(self._proto_relative_url(getter(m)), ie)
+ self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
for m in matches)
return self.playlist_result(
urlrs, playlist_id=video_id, playlist_title=video_title)
@@ -895,6 +914,11 @@ class GenericIE(InfoExtractor):
return _playlist_from_matches(
matches, getter=unescapeHTML, ie='FunnyOrDie')
+ # Look for BBC iPlayer embed
+ matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
+ if matches:
+ return _playlist_from_matches(matches, ie='BBCCoUk')
+
# Look for embedded RUTV player
rutv_url = RUTVIE._extract_url(webpage)
if rutv_url:
@@ -902,7 +926,7 @@ class GenericIE(InfoExtractor):
# Look for embedded TED player
mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>http://embed\.ted\.com/.+?)\1', webpage)
+ r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'TED')
@@ -962,6 +986,13 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'), 'SBS')
+ # Look for embedded Cinchcast player
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
+ webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'Cinchcast')
+
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
webpage)
@@ -1041,7 +1072,7 @@ class GenericIE(InfoExtractor):
'url': new_url,
}
if not found:
- raise ExtractorError('Unsupported URL: %s' % url)
+ raise UnsupportedError(url)
entries = []
for video_url in found:
diff --git a/youtube_dl/extractor/giantbomb.py b/youtube_dl/extractor/giantbomb.py
new file mode 100644
index 000000000..87cd19147
--- /dev/null
+++ b/youtube_dl/extractor/giantbomb.py
@@ -0,0 +1,81 @@
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ unescapeHTML,
+ qualities,
+ int_or_none,
+)
+
+
+class GiantBombIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?giantbomb\.com/videos/(?P<display_id>[^/]+)/(?P<id>\d+-\d+)'
+ _TEST = {
+ 'url': 'http://www.giantbomb.com/videos/quick-look-destiny-the-dark-below/2300-9782/',
+ 'md5': '57badeface303ecf6b98b812de1b9018',
+ 'info_dict': {
+ 'id': '2300-9782',
+ 'display_id': 'quick-look-destiny-the-dark-below',
+ 'ext': 'mp4',
+ 'title': 'Quick Look: Destiny: The Dark Below',
+ 'description': 'md5:0aa3aaf2772a41b91d44c63f30dfad24',
+ 'duration': 2399,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
+
+ webpage = self._download_webpage(url, display_id)
+
+ title = self._og_search_title(webpage)
+ description = self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ video = json.loads(unescapeHTML(self._search_regex(
+ r'data-video="([^"]+)"', webpage, 'data-video')))
+
+ duration = int_or_none(video.get('lengthSeconds'))
+
+ quality = qualities([
+ 'f4m_low', 'progressive_low', 'f4m_high',
+ 'progressive_high', 'f4m_hd', 'progressive_hd'])
+
+ formats = []
+ for format_id, video_url in video['videoStreams'].items():
+ if format_id == 'f4m_stream':
+ continue
+ if video_url.endswith('.f4m'):
+ f4m_formats = self._extract_f4m_formats(video_url + '?hdcore=3.3.1', display_id)
+ if f4m_formats:
+ f4m_formats[0]['quality'] = quality(format_id)
+ formats.extend(f4m_formats)
+ else:
+ formats.append({
+ 'url': video_url,
+ 'format_id': format_id,
+ 'quality': quality(format_id),
+ })
+
+ if not formats:
+ youtube_id = video.get('youtubeID')
+ if youtube_id:
+ return self.url_result(youtube_id, 'Youtube')
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/giga.py b/youtube_dl/extractor/giga.py
new file mode 100644
index 000000000..775890112
--- /dev/null
+++ b/youtube_dl/extractor/giga.py
@@ -0,0 +1,101 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+
+from .common import InfoExtractor
+from ..utils import (
+ qualities,
+ compat_str,
+ parse_duration,
+ parse_iso8601,
+ str_to_int,
+)
+
+
+class GigaIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?giga\.de/(?:[^/]+/)*(?P<id>[^/]+)'
+ _TESTS = [{
+ 'url': 'http://www.giga.de/filme/anime-awesome/trailer/anime-awesome-chihiros-reise-ins-zauberland-das-beste-kommt-zum-schluss/',
+ 'md5': '6bc5535e945e724640664632055a584f',
+ 'info_dict': {
+ 'id': '2622086',
+ 'display_id': 'anime-awesome-chihiros-reise-ins-zauberland-das-beste-kommt-zum-schluss',
+ 'ext': 'mp4',
+ 'title': 'Anime Awesome: Chihiros Reise ins Zauberland – Das Beste kommt zum Schluss',
+ 'description': 'md5:afdf5862241aded4718a30dff6a57baf',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 578,
+ 'timestamp': 1414749706,
+ 'upload_date': '20141031',
+ 'uploader': 'Robin Schweiger',
+ 'view_count': int,
+ },
+ }, {
+ 'url': 'http://www.giga.de/games/channel/giga-top-montag/giga-topmontag-die-besten-serien-2014/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.giga.de/extra/netzkultur/videos/giga-games-tom-mats-robin-werden-eigene-wege-gehen-eine-ankuendigung/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.giga.de/tv/jonas-liest-spieletitel-eingedeutscht-episode-2/',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ video_id = self._search_regex(
+ [r'data-video-id="(\d+)"', r'/api/video/jwplayer/#v=(\d+)'],
+ webpage, 'video id')
+
+ playlist = self._download_json(
+ 'http://www.giga.de/api/syndication/video/video_id/%s/playlist.json?content=syndication/key/368b5f151da4ae05ced7fa296bdff65a/'
+ % video_id, video_id)[0]
+
+ quality = qualities(['normal', 'hd720'])
+
+ formats = []
+ for format_id in itertools.count(0):
+ fmt = playlist.get(compat_str(format_id))
+ if not fmt:
+ break
+ formats.append({
+ 'url': fmt['src'],
+ 'format_id': '%s-%s' % (fmt['quality'], fmt['type'].split('/')[-1]),
+ 'quality': quality(fmt['quality']),
+ })
+ self._sort_formats(formats)
+
+ title = self._html_search_meta(
+ 'title', webpage, 'title', fatal=True)
+ description = self._html_search_meta(
+ 'description', webpage, 'description')
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ duration = parse_duration(self._search_regex(
+ r'(?s)(?:data-video-id="{0}"|data-video="[^"]*/api/video/jwplayer/#v={0}[^"]*")[^>]*>.+?<span class="duration">([^<]+)</span>'.format(video_id),
+ webpage, 'duration', fatal=False))
+
+ timestamp = parse_iso8601(self._search_regex(
+ r'datetime="([^"]+)"', webpage, 'upload date', fatal=False))
+ uploader = self._search_regex(
+ r'class="author">([^<]+)</a>', webpage, 'uploader', fatal=False)
+
+ view_count = str_to_int(self._search_regex(
+ r'<span class="views"><strong>([\d.]+)</strong>', webpage, 'view count', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'uploader': uploader,
+ 'view_count': view_count,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/goldenmoustache.py b/youtube_dl/extractor/goldenmoustache.py
index 10001d4d9..0fb509724 100644
--- a/youtube_dl/extractor/goldenmoustache.py
+++ b/youtube_dl/extractor/goldenmoustache.py
@@ -1,9 +1,6 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import (
- int_or_none,
-)
class GoldenMoustacheIE(InfoExtractor):
@@ -17,7 +14,6 @@ class GoldenMoustacheIE(InfoExtractor):
'title': 'Suricate - Le Poker',
'description': 'md5:3d1f242f44f8c8cb0a106f1fd08e5dc9',
'thumbnail': 're:^https?://.*\.jpg$',
- 'view_count': int,
}
}, {
'url': 'http://www.goldenmoustache.com/le-lab-tout-effacer-mc-fly-et-carlito-55249/',
@@ -28,7 +24,6 @@ class GoldenMoustacheIE(InfoExtractor):
'title': 'Le LAB - Tout Effacer (Mc Fly et Carlito)',
'description': 'md5:9b7fbf11023fb2250bd4b185e3de3b2a',
'thumbnail': 're:^https?://.*\.(?:png|jpg)$',
- 'view_count': int,
}
}]
@@ -42,9 +37,6 @@ class GoldenMoustacheIE(InfoExtractor):
r'<title>(.*?)(?: - Golden Moustache)?</title>', webpage, 'title')
thumbnail = self._og_search_thumbnail(webpage)
description = self._og_search_description(webpage)
- view_count = int_or_none(self._html_search_regex(
- r'<strong>([0-9]+)</strong>\s*VUES</span>',
- webpage, 'view count', fatal=False))
return {
'id': video_id,
@@ -53,5 +45,4 @@ class GoldenMoustacheIE(InfoExtractor):
'title': title,
'description': description,
'thumbnail': thumbnail,
- 'view_count': view_count,
}
diff --git a/youtube_dl/extractor/golem.py b/youtube_dl/extractor/golem.py
index 53714f47f..2bfb99040 100644
--- a/youtube_dl/extractor/golem.py
+++ b/youtube_dl/extractor/golem.py
@@ -2,8 +2,10 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urlparse,
+)
+from ..utils import (
determine_ext,
)
diff --git a/youtube_dl/extractor/googlesearch.py b/youtube_dl/extractor/googlesearch.py
index 469e1f935..498304cb2 100644
--- a/youtube_dl/extractor/googlesearch.py
+++ b/youtube_dl/extractor/googlesearch.py
@@ -4,7 +4,7 @@ import itertools
import re
from .common import SearchInfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
)
diff --git a/youtube_dl/extractor/gorillavid.py b/youtube_dl/extractor/gorillavid.py
index 1ac1da856..ae24aff84 100644
--- a/youtube_dl/extractor/gorillavid.py
+++ b/youtube_dl/extractor/gorillavid.py
@@ -4,11 +4,12 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- determine_ext,
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
+)
+from ..utils import (
+ ExtractorError,
int_or_none,
)
@@ -106,7 +107,6 @@ class GorillaVidIE(InfoExtractor):
formats = [{
'format_id': 'sd',
'url': video_url,
- 'ext': determine_ext(video_url),
'quality': 1,
}]
diff --git a/youtube_dl/extractor/goshgay.py b/youtube_dl/extractor/goshgay.py
index 18474cbb7..b116d251d 100644
--- a/youtube_dl/extractor/goshgay.py
+++ b/youtube_dl/extractor/goshgay.py
@@ -2,57 +2,52 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..compat import (
+ compat_parse_qs,
+)
from ..utils import (
- compat_urlparse,
- ExtractorError,
+ parse_duration,
)
class GoshgayIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)www.goshgay.com/video(?P<id>\d+?)($|/)'
+ _VALID_URL = r'https?://www\.goshgay\.com/video(?P<id>\d+?)($|/)'
_TEST = {
- 'url': 'http://www.goshgay.com/video4116282',
- 'md5': '268b9f3c3229105c57859e166dd72b03',
+ 'url': 'http://www.goshgay.com/video299069/diesel_sfw_xxx_video',
+ 'md5': '027fcc54459dff0feb0bc06a7aeda680',
'info_dict': {
- 'id': '4116282',
+ 'id': '299069',
'ext': 'flv',
- 'title': 'md5:089833a4790b5e103285a07337f245bf',
- 'thumbnail': 're:http://.*\.jpg',
+ 'title': 'DIESEL SFW XXX Video',
+ 'thumbnail': 're:^http://.*\.jpg$',
+ 'duration': 79,
'age_limit': 18,
}
}
def _real_extract(self, url):
video_id = self._match_id(url)
-
webpage = self._download_webpage(url, video_id)
- title = self._og_search_title(webpage)
- thumbnail = self._og_search_thumbnail(webpage)
+
+ title = self._html_search_regex(
+ r'<h2>(.*?)<', webpage, 'title')
+ duration = parse_duration(self._html_search_regex(
+ r'<span class="duration">\s*-?\s*(.*?)</span>',
+ webpage, 'duration', fatal=False))
family_friendly = self._html_search_meta(
'isFamilyFriendly', webpage, default='false')
- config_url = self._search_regex(
- r"'config'\s*:\s*'([^']+)'", webpage, 'config URL')
-
- config = self._download_xml(
- config_url, video_id, 'Downloading player config XML')
-
- if config is None:
- raise ExtractorError('Missing config XML')
- if config.tag != 'config':
- raise ExtractorError('Missing config attribute')
- fns = config.findall('file')
- if len(fns) < 1:
- raise ExtractorError('Missing media URI')
- video_url = fns[0].text
- url_comp = compat_urlparse.urlparse(url)
- ref = "%s://%s%s" % (url_comp[0], url_comp[1], url_comp[2])
+ flashvars = compat_parse_qs(self._html_search_regex(
+ r'<embed.+?id="flash-player-embed".+?flashvars="([^"]+)"',
+ webpage, 'flashvars'))
+ thumbnail = flashvars.get('url_bigthumb', [None])[0]
+ video_url = flashvars['flv_url'][0]
return {
'id': video_id,
'url': video_url,
'title': title,
'thumbnail': thumbnail,
- 'http_referer': ref,
+ 'duration': duration,
'age_limit': 0 if family_friendly == 'true' else 18,
}
diff --git a/youtube_dl/extractor/groupon.py b/youtube_dl/extractor/groupon.py
new file mode 100644
index 000000000..8b9e0e2f8
--- /dev/null
+++ b/youtube_dl/extractor/groupon.py
@@ -0,0 +1,50 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class GrouponIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.groupon\.com/deals/(?P<id>[^?#]+)'
+
+ _TEST = {
+ 'url': 'https://www.groupon.com/deals/bikram-yoga-huntington-beach-2#ooid=tubGNycTo_9Uxg82uESj4i61EYX8nyuf',
+ 'info_dict': {
+ 'id': 'bikram-yoga-huntington-beach-2',
+ 'title': '$49 for 10 Yoga Classes or One Month of Unlimited Classes at Bikram Yoga Huntington Beach ($180 Value)',
+ 'description': 'Studio kept at 105 degrees and 40% humidity with anti-microbial and anti-slip Flotex flooring; certified instructors',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': 'tubGNycTo_9Uxg82uESj4i61EYX8nyuf',
+ 'ext': 'mp4',
+ 'title': 'Bikram Yoga Huntington Beach | Orange County',
+ },
+ }],
+ 'params': {
+ 'skip_download': 'HLS',
+ }
+ }
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+ webpage = self._download_webpage(url, playlist_id)
+
+ payload = self._parse_json(self._search_regex(
+ r'var\s+payload\s*=\s*(.*?);\n', webpage, 'payload'), playlist_id)
+ videos = payload['carousel'].get('dealVideos', [])
+ entries = []
+ for v in videos:
+ if v.get('provider') != 'OOYALA':
+ self.report_warning(
+ '%s: Unsupported video provider %s, skipping video' %
+ (playlist_id, v.get('provider')))
+ continue
+ entries.append(self.url_result('ooyala:%s' % v['media']))
+
+ return {
+ '_type': 'playlist',
+ 'id': playlist_id,
+ 'entries': entries,
+ 'title': self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage),
+ }
diff --git a/youtube_dl/extractor/hellporno.py b/youtube_dl/extractor/hellporno.py
new file mode 100644
index 000000000..7a1c75b65
--- /dev/null
+++ b/youtube_dl/extractor/hellporno.py
@@ -0,0 +1,71 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ js_to_json,
+ remove_end,
+)
+
+
+class HellPornoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?hellporno\.com/videos/(?P<id>[^/]+)'
+ _TEST = {
+ 'url': 'http://hellporno.com/videos/dixie-is-posing-with-naked-ass-very-erotic/',
+ 'md5': '1fee339c610d2049699ef2aa699439f1',
+ 'info_dict': {
+ 'id': '149116',
+ 'display_id': 'dixie-is-posing-with-naked-ass-very-erotic',
+ 'ext': 'mp4',
+ 'title': 'Dixie is posing with naked ass very erotic',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'age_limit': 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ title = remove_end(self._html_search_regex(
+ r'<title>([^<]+)</title>', webpage, 'title'), ' - Hell Porno')
+
+ flashvars = self._parse_json(self._search_regex(
+ r'var\s+flashvars\s*=\s*({.+?});', webpage, 'flashvars'),
+ display_id, transform_source=js_to_json)
+
+ video_id = flashvars.get('video_id')
+ thumbnail = flashvars.get('preview_url')
+ ext = flashvars.get('postfix', '.mp4')[1:]
+
+ formats = []
+ for video_url_key in ['video_url', 'video_alt_url']:
+ video_url = flashvars.get(video_url_key)
+ if not video_url:
+ continue
+ video_text = flashvars.get('%s_text' % video_url_key)
+ fmt = {
+ 'url': video_url,
+ 'ext': ext,
+ 'format_id': video_text,
+ }
+ m = re.search(r'^(?P<height>\d+)[pP]', video_text)
+ if m:
+ fmt['height'] = int(m.group('height'))
+ formats.append(fmt)
+ self._sort_formats(formats)
+
+ categories = self._html_search_meta(
+ 'keywords', webpage, 'categories', default='').split(',')
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'categories': categories,
+ 'age_limit': 18,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/helsinki.py b/youtube_dl/extractor/helsinki.py
index 5268efa49..93107b306 100644
--- a/youtube_dl/extractor/helsinki.py
+++ b/youtube_dl/extractor/helsinki.py
@@ -2,9 +2,8 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
+from ..utils import js_to_json
class HelsinkiIE(InfoExtractor):
@@ -24,39 +23,21 @@ class HelsinkiIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- formats = []
-
- mobj = re.search(r'file=((\w+):[^&]+)', webpage)
- if mobj:
- formats.append({
- 'ext': mobj.group(2),
- 'play_path': mobj.group(1),
- 'url': 'rtmp://flashvideo.it.helsinki.fi/vod/',
- 'player_url': 'http://video.helsinki.fi/player.swf',
- 'format_note': 'sd',
- 'quality': 0,
- })
-
- mobj = re.search(r'hd\.file=((\w+):[^&]+)', webpage)
- if mobj:
- formats.append({
- 'ext': mobj.group(2),
- 'play_path': mobj.group(1),
- 'url': 'rtmp://flashvideo.it.helsinki.fi/vod/',
- 'player_url': 'http://video.helsinki.fi/player.swf',
- 'format_note': 'hd',
- 'quality': 1,
- })
+ params = self._parse_json(self._html_search_regex(
+ r'(?s)jwplayer\("player"\).setup\((\{.*?\})\);',
+ webpage, 'player code'), video_id, transform_source=js_to_json)
+ formats = [{
+ 'url': s['file'],
+ 'ext': 'mp4',
+ } for s in params['sources']]
self._sort_formats(formats)
return {
'id': video_id,
'title': self._og_search_title(webpage).replace('Video: ', ''),
'description': self._og_search_description(webpage),
- 'thumbnail': self._og_search_thumbnail(webpage),
'formats': formats,
}
diff --git a/youtube_dl/extractor/hitbox.py b/youtube_dl/extractor/hitbox.py
new file mode 100644
index 000000000..84bd7c080
--- /dev/null
+++ b/youtube_dl/extractor/hitbox.py
@@ -0,0 +1,166 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ parse_iso8601,
+ float_or_none,
+ int_or_none,
+ compat_str,
+)
+
+
+class HitboxIE(InfoExtractor):
+ IE_NAME = 'hitbox'
+ _VALID_URL = r'https?://(?:www\.)?hitbox\.tv/video/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.hitbox.tv/video/203213',
+ 'info_dict': {
+ 'id': '203213',
+ 'title': 'hitbox @ gamescom, Sub Button Hype extended, Giveaway - hitbox News Update with Oxy',
+ 'alt_title': 'hitboxlive - Aug 9th #6',
+ 'description': '',
+ 'ext': 'mp4',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 215.1666,
+ 'resolution': 'HD 720p',
+ 'uploader': 'hitboxlive',
+ 'view_count': int,
+ 'timestamp': 1407576133,
+ 'upload_date': '20140809',
+ 'categories': ['Live Show'],
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }
+
+ def _extract_metadata(self, url, video_id):
+ thumb_base = 'https://edge.sf.hitbox.tv'
+ metadata = self._download_json(
+ '%s/%s' % (url, video_id), video_id)
+
+ date = 'media_live_since'
+ media_type = 'livestream'
+ if metadata.get('media_type') == 'video':
+ media_type = 'video'
+ date = 'media_date_added'
+
+ video_meta = metadata.get(media_type, [])[0]
+ title = video_meta.get('media_status')
+ alt_title = video_meta.get('media_title')
+ description = clean_html(
+ video_meta.get('media_description') or
+ video_meta.get('media_description_md'))
+ duration = float_or_none(video_meta.get('media_duration'))
+ uploader = video_meta.get('media_user_name')
+ views = int_or_none(video_meta.get('media_views'))
+ timestamp = parse_iso8601(video_meta.get(date), ' ')
+ categories = [video_meta.get('category_name')]
+ thumbs = [
+ {'url': thumb_base + video_meta.get('media_thumbnail'),
+ 'width': 320,
+ 'height': 180},
+ {'url': thumb_base + video_meta.get('media_thumbnail_large'),
+ 'width': 768,
+ 'height': 432},
+ ]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'alt_title': alt_title,
+ 'description': description,
+ 'ext': 'mp4',
+ 'thumbnails': thumbs,
+ 'duration': duration,
+ 'uploader': uploader,
+ 'view_count': views,
+ 'timestamp': timestamp,
+ 'categories': categories,
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ metadata = self._extract_metadata(
+ 'https://www.hitbox.tv/api/media/video',
+ video_id)
+
+ player_config = self._download_json(
+ 'https://www.hitbox.tv/api/player/config/video/%s' % video_id,
+ video_id)
+
+ clip = player_config.get('clip')
+ video_url = clip.get('url')
+ res = clip.get('bitrates', [])[0].get('label')
+
+ metadata['resolution'] = res
+ metadata['url'] = video_url
+ metadata['protocol'] = 'm3u8'
+
+ return metadata
+
+
+class HitboxLiveIE(HitboxIE):
+ IE_NAME = 'hitbox:live'
+ _VALID_URL = r'https?://(?:www\.)?hitbox\.tv/(?!video)(?P<id>.+)'
+ _TEST = {
+ 'url': 'http://www.hitbox.tv/dimak',
+ 'info_dict': {
+ 'id': 'dimak',
+ 'ext': 'mp4',
+ 'description': 'md5:c9f80fa4410bc588d7faa40003fc7d0e',
+ 'timestamp': int,
+ 'upload_date': compat_str,
+ 'title': compat_str,
+ 'uploader': 'Dimak',
+ },
+ 'params': {
+ # live
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ metadata = self._extract_metadata(
+ 'https://www.hitbox.tv/api/media/live',
+ video_id)
+
+ player_config = self._download_json(
+ 'https://www.hitbox.tv/api/player/config/live/%s' % video_id,
+ video_id)
+
+ formats = []
+ cdns = player_config.get('cdns')
+ servers = []
+ for cdn in cdns:
+ base_url = cdn.get('netConnectionUrl')
+ host = re.search('.+\.([^\.]+\.[^\./]+)/.+', base_url).group(1)
+ if base_url not in servers:
+ servers.append(base_url)
+ for stream in cdn.get('bitrates'):
+ label = stream.get('label')
+ if label != 'Auto':
+ formats.append({
+ 'url': '%s/%s' % (base_url, stream.get('url')),
+ 'ext': 'mp4',
+ 'vbr': stream.get('bitrate'),
+ 'resolution': label,
+ 'rtmp_live': True,
+ 'format_note': host,
+ 'page_url': url,
+ 'player_url': 'http://www.hitbox.tv/static/player/flowplayer/flowplayer.commercial-3.2.16.swf',
+ })
+
+ self._sort_formats(formats)
+ metadata['formats'] = formats
+ metadata['is_live'] = True
+ metadata['title'] = self._live_title(metadata.get('title'))
+ return metadata
diff --git a/youtube_dl/extractor/hostingbulk.py b/youtube_dl/extractor/hostingbulk.py
index 8e812b669..704d0285d 100644
--- a/youtube_dl/extractor/hostingbulk.py
+++ b/youtube_dl/extractor/hostingbulk.py
@@ -4,9 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_request,
+)
from ..utils import (
ExtractorError,
- compat_urllib_request,
int_or_none,
urlencode_postdata,
)
@@ -30,9 +32,7 @@ class HostingBulkIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
url = 'http://hostingbulk.com/{0:}.html'.format(video_id)
# Custom request with cookie to set language to English, so our file
diff --git a/youtube_dl/extractor/howstuffworks.py b/youtube_dl/extractor/howstuffworks.py
index fccc23884..e97339121 100644
--- a/youtube_dl/extractor/howstuffworks.py
+++ b/youtube_dl/extractor/howstuffworks.py
@@ -1,12 +1,12 @@
from __future__ import unicode_literals
-import re
-import json
-import random
-import string
-
from .common import InfoExtractor
-from ..utils import find_xpath_attr
+from ..utils import (
+ find_xpath_attr,
+ int_or_none,
+ js_to_json,
+ unescapeHTML,
+)
class HowStuffWorksIE(InfoExtractor):
@@ -16,98 +16,74 @@ class HowStuffWorksIE(InfoExtractor):
'url': 'http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm',
'info_dict': {
'id': '450221',
- 'display_id': 'cool-jobs-iditarod-musher',
'ext': 'flv',
'title': 'Cool Jobs - Iditarod Musher',
- 'description': 'md5:82bb58438a88027b8186a1fccb365f90',
+ 'description': 'Cold sleds, freezing temps and warm dog breath... an Iditarod musher\'s dream. Kasey-Dee Gardner jumps on a sled to find out what the big deal is.',
+ 'display_id': 'cool-jobs-iditarod-musher',
'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 161,
},
- 'params': {
- # md5 is not consistent
- 'skip_download': True
- }
},
{
'url': 'http://adventure.howstuffworks.com/7199-survival-zone-food-and-water-in-the-savanna-video.htm',
'info_dict': {
'id': '453464',
- 'display_id': 'survival-zone-food-and-water-in-the-savanna',
'ext': 'mp4',
'title': 'Survival Zone: Food and Water In the Savanna',
- 'description': 'md5:7e1c89f6411434970c15fa094170c371',
+ 'description': 'Learn how to find both food and water while trekking in the African savannah. In this video from the Discovery Channel.',
+ 'display_id': 'survival-zone-food-and-water-in-the-savanna',
'thumbnail': 're:^https?://.*\.jpg$',
},
- 'params': {
- # md5 is not consistent
- 'skip_download': True
- }
},
{
'url': 'http://entertainment.howstuffworks.com/arts/2706-sword-swallowing-1-by-dan-meyer-video.htm',
'info_dict': {
'id': '440011',
- 'display_id': 'sword-swallowing-1-by-dan-meyer',
'ext': 'flv',
'title': 'Sword Swallowing #1 by Dan Meyer',
- 'description': 'md5:b2409e88172913e2e7d3d1159b0ef735',
+ 'description': 'Video footage (1 of 3) used by permission of the owner Dan Meyer through Sword Swallowers Association International <www.swordswallow.org>',
+ 'display_id': 'sword-swallowing-1-by-dan-meyer',
'thumbnail': 're:^https?://.*\.jpg$',
},
- 'params': {
- # md5 is not consistent
- 'skip_download': True
- }
},
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('id')
+ display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
+ clip_js = self._search_regex(
+ r'(?s)var clip = ({.*?});', webpage, 'clip info')
+ clip_info = self._parse_json(
+ clip_js, display_id, transform_source=js_to_json)
- content_id = self._search_regex(r'var siteSectionId="(\d+)";', webpage, 'content id')
-
- mp4 = self._search_regex(
- r'''(?xs)var\s+clip\s*=\s*{\s*
- .+?\s*
- content_id\s*:\s*%s\s*,\s*
- .+?\s*
- mp4\s*:\s*\[(.*?),?\]\s*
- };\s*
- videoData\.push\(clip\);''' % content_id,
- webpage, 'mp4', fatal=False, default=None)
-
- smil = self._download_xml(
- 'http://services.media.howstuffworks.com/videos/%s/smil-service.smil' % content_id,
- content_id, 'Downloading video SMIL')
-
- http_base = find_xpath_attr(
- smil,
- './{0}head/{0}meta'.format('{http://www.w3.org/2001/SMIL20/Language}'),
- 'name',
- 'httpBase').get('content')
-
- def random_string(str_len=0):
- return ''.join([random.choice(string.ascii_uppercase) for _ in range(str_len)])
-
- URL_SUFFIX = '?v=2.11.3&fp=LNX 11,2,202,356&r=%s&g=%s' % (random_string(5), random_string(12))
-
+ video_id = clip_info['content_id']
formats = []
+ m3u8_url = clip_info.get('m3u8')
+ if m3u8_url:
+ formats += self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
+ for video in clip_info.get('mp4', []):
+ formats.append({
+ 'url': video['src'],
+ 'format_id': video['bitrate'],
+ 'vbr': int(video['bitrate'].rstrip('k')),
+ })
+
+ if not formats:
+ smil = self._download_xml(
+ 'http://services.media.howstuffworks.com/videos/%s/smil-service.smil' % video_id,
+ video_id, 'Downloading video SMIL')
+
+ http_base = find_xpath_attr(
+ smil,
+ './{0}head/{0}meta'.format('{http://www.w3.org/2001/SMIL20/Language}'),
+ 'name',
+ 'httpBase').get('content')
+
+ URL_SUFFIX = '?v=2.11.3&fp=LNX 11,2,202,356&r=A&g=A'
- if mp4:
- for video in json.loads('[%s]' % mp4):
- bitrate = video['bitrate']
- fmt = {
- 'url': video['src'].replace('http://pmd.video.howstuffworks.com', http_base) + URL_SUFFIX,
- 'format_id': bitrate,
- }
- m = re.search(r'(?P<vbr>\d+)[Kk]', bitrate)
- if m:
- fmt['vbr'] = int(m.group('vbr'))
- formats.append(fmt)
- else:
for video in smil.findall(
- './/{0}body/{0}switch/{0}video'.format('{http://www.w3.org/2001/SMIL20/Language}')):
- vbr = int(video.attrib['system-bitrate']) / 1000
+ './{0}body/{0}switch/{0}video'.format('{http://www.w3.org/2001/SMIL20/Language}')):
+ vbr = int_or_none(video.attrib['system-bitrate'], scale=1000)
formats.append({
'url': '%s/%s%s' % (http_base, video.attrib['src'], URL_SUFFIX),
'format_id': '%dk' % vbr,
@@ -116,19 +92,12 @@ class HowStuffWorksIE(InfoExtractor):
self._sort_formats(formats)
- title = self._og_search_title(webpage)
- TITLE_SUFFIX = ' : HowStuffWorks'
- if title.endswith(TITLE_SUFFIX):
- title = title[:-len(TITLE_SUFFIX)]
-
- description = self._og_search_description(webpage)
- thumbnail = self._og_search_thumbnail(webpage)
-
return {
- 'id': content_id,
+ 'id': '%s' % video_id,
'display_id': display_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
+ 'title': unescapeHTML(clip_info['clip_title']),
+ 'description': unescapeHTML(clip_info.get('caption')),
+ 'thumbnail': clip_info.get('video_still_url'),
+ 'duration': clip_info.get('duration'),
'formats': formats,
}
diff --git a/youtube_dl/extractor/huffpost.py b/youtube_dl/extractor/huffpost.py
index 4ccf6b9b8..a38eae421 100644
--- a/youtube_dl/extractor/huffpost.py
+++ b/youtube_dl/extractor/huffpost.py
@@ -39,8 +39,9 @@ class HuffPostIE(InfoExtractor):
data = self._download_json(api_url, video_id)['data']
video_title = data['title']
- duration = parse_duration(data['running_time'])
- upload_date = unified_strdate(data['schedule']['starts_at'])
+ duration = parse_duration(data.get('running_time'))
+ upload_date = unified_strdate(
+ data.get('schedule', {}).get('starts_at') or data.get('segment_start_date_time'))
description = data.get('description')
thumbnails = []
@@ -59,16 +60,11 @@ class HuffPostIE(InfoExtractor):
'ext': 'mp4',
'url': url,
'vcodec': 'none' if key.startswith('audio/') else None,
- } for key, url in data['sources']['live'].items()]
- if data.get('fivemin_id'):
- fid = data['fivemin_id']
- fcat = str(int(fid) // 100 + 1)
- furl = 'http://avideos.5min.com/2/' + fcat[-3:] + '/' + fcat + '/' + fid + '.mp4'
- formats.append({
- 'format': 'fivemin',
- 'url': furl,
- 'preference': 1,
- })
+ } for key, url in data.get('sources', {}).get('live', {}).items()]
+
+ if not formats and data.get('fivemin_id'):
+ return self.url_result('5min:%s' % data['fivemin_id'])
+
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/hypem.py b/youtube_dl/extractor/hypem.py
index 6d0d847c6..aa0724a02 100644
--- a/youtube_dl/extractor/hypem.py
+++ b/youtube_dl/extractor/hypem.py
@@ -1,20 +1,20 @@
from __future__ import unicode_literals
import json
-import re
import time
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
-
+)
+from ..utils import (
ExtractorError,
)
class HypemIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
+ _VALID_URL = r'http://(?:www\.)?hypem\.com/track/(?P<id>[^/]+)/'
_TEST = {
'url': 'http://hypem.com/track/1v6ga/BODYWORK+-+TAME',
'md5': 'b9cc91b5af8995e9f0c1cee04c575828',
@@ -27,8 +27,7 @@ class HypemIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- track_id = mobj.group(1)
+ track_id = self._match_id(url)
data = {'ax': 1, 'ts': time.time()}
data_encoded = compat_urllib_parse.urlencode(data)
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
index f2c1c10f5..f29df36b5 100644
--- a/youtube_dl/extractor/imdb.py
+++ b/youtube_dl/extractor/imdb.py
@@ -4,7 +4,7 @@ import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urlparse,
)
@@ -16,7 +16,6 @@ class ImdbIE(InfoExtractor):
_TEST = {
'url': 'http://www.imdb.com/video/imdb/vi2524815897',
- 'md5': '9f34fa777ade3a6e57a054fdbcb3a068',
'info_dict': {
'id': '2524815897',
'ext': 'mp4',
diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py
index e76dd222d..f25f43664 100644
--- a/youtube_dl/extractor/infoq.py
+++ b/youtube_dl/extractor/infoq.py
@@ -1,10 +1,9 @@
from __future__ import unicode_literals
import base64
-import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
)
@@ -24,9 +23,7 @@ class InfoQIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title')
diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py
index 1e4799187..483cc6f9e 100644
--- a/youtube_dl/extractor/internetvideoarchive.py
+++ b/youtube_dl/extractor/internetvideoarchive.py
@@ -3,9 +3,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urlparse,
compat_urllib_parse,
+)
+from ..utils import (
xpath_with_ns,
)
@@ -20,7 +22,7 @@ class InternetVideoArchiveIE(InfoExtractor):
'ext': 'mp4',
'title': 'SKYFALL',
'description': 'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.',
- 'duration': 149,
+ 'duration': 152,
},
}
diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py
index 4247d6391..8529bedfc 100644
--- a/youtube_dl/extractor/iprima.py
+++ b/youtube_dl/extractor/iprima.py
@@ -6,8 +6,10 @@ from random import random
from math import floor
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_request,
+)
+from ..utils import (
ExtractorError,
)
diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py
index f0fba1adb..7a400323d 100644
--- a/youtube_dl/extractor/ivi.py
+++ b/youtube_dl/extractor/ivi.py
@@ -5,8 +5,10 @@ import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_request,
+)
+from ..utils import (
ExtractorError,
)
diff --git a/youtube_dl/extractor/keek.py b/youtube_dl/extractor/keek.py
index 5d679e88d..c0956ba09 100644
--- a/youtube_dl/extractor/keek.py
+++ b/youtube_dl/extractor/keek.py
@@ -1,34 +1,39 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
class KeekIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
+ _VALID_URL = r'https?://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<id>\w+)'
IE_NAME = 'keek'
_TEST = {
'url': 'https://www.keek.com/ytdl/keeks/NODfbab',
- 'file': 'NODfbab.mp4',
- 'md5': '9b0636f8c0f7614afa4ea5e4c6e57e83',
+ 'md5': '09c5c109067536c1cec8bac8c21fea05',
'info_dict': {
- 'uploader': 'ytdl',
+ 'id': 'NODfbab',
+ 'ext': 'mp4',
+ 'uploader': 'youtube-dl project',
+ 'uploader_id': 'ytdl',
'title': 'test chars: "\'/\\\u00e4<>This is a test video for youtube-dl.For more information, contact phihag@phihag.de .',
},
}
def _real_extract(self, url):
- m = re.match(self._VALID_URL, url)
- video_id = m.group('videoID')
+ video_id = self._match_id(url)
video_url = 'http://cdn.keek.com/keek/video/%s' % video_id
thumbnail = 'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
webpage = self._download_webpage(url, video_id)
- uploader = self._html_search_regex(
- r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
- webpage, 'uploader', fatal=False)
+ raw_desc = self._html_search_meta('description', webpage)
+ if raw_desc:
+ uploader = self._html_search_regex(
+ r'Watch (.*?)\s+\(', raw_desc, 'uploader', fatal=False)
+ uploader_id = self._html_search_regex(
+ r'Watch .*?\(@(.+?)\)', raw_desc, 'uploader_id', fatal=False)
+ else:
+ uploader = None
+ uploader_id = None
return {
'id': video_id,
@@ -36,5 +41,6 @@ class KeekIE(InfoExtractor):
'ext': 'mp4',
'title': self._og_search_title(webpage),
'thumbnail': thumbnail,
- 'uploader': uploader
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
}
diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py
index 75b63cffb..97dcb518a 100644
--- a/youtube_dl/extractor/keezmovies.py
+++ b/youtube_dl/extractor/keezmovies.py
@@ -4,7 +4,7 @@ import os
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse_urlparse,
compat_urllib_request,
compat_urllib_parse,
@@ -15,7 +15,7 @@ from ..aes import (
class KeezMoviesIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:www\.)?keezmovies\.com/video/.+?(?P<videoid>[0-9]+)(?:[/?&]|$)'
+ _VALID_URL = r'https?://(?:www\.)?keezmovies\.com/video/.+?(?P<id>[0-9]+)(?:[/?&]|$)'
_TEST = {
'url': 'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711',
'file': '1214711.mp4',
@@ -27,8 +27,7 @@ class KeezMoviesIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('videoid')
+ video_id = self._match_id(url)
req = compat_urllib_request.Request(url)
req.add_header('Cookie', 'age_verified=1')
diff --git a/youtube_dl/extractor/khanacademy.py b/youtube_dl/extractor/khanacademy.py
index 408d00944..08a671fa8 100644
--- a/youtube_dl/extractor/khanacademy.py
+++ b/youtube_dl/extractor/khanacademy.py
@@ -22,8 +22,10 @@ class KhanAcademyIE(InfoExtractor):
'description': 'The perfect cipher',
'duration': 176,
'uploader': 'Brit Cruise',
+ 'uploader_id': 'khanacademy',
'upload_date': '20120411',
- }
+ },
+ 'add_ie': ['Youtube'],
}, {
'url': 'https://www.khanacademy.org/math/applied-math/cryptography',
'info_dict': {
diff --git a/youtube_dl/extractor/kontrtube.py b/youtube_dl/extractor/kontrtube.py
index 41fd62009..720bc939b 100644
--- a/youtube_dl/extractor/kontrtube.py
+++ b/youtube_dl/extractor/kontrtube.py
@@ -10,13 +10,14 @@ from ..utils import int_or_none
class KontrTubeIE(InfoExtractor):
IE_NAME = 'kontrtube'
IE_DESC = 'KontrTube.ru - Труба зовёт'
- _VALID_URL = r'http://(?:www\.)?kontrtube\.ru/videos/(?P<id>\d+)/.+'
+ _VALID_URL = r'http://(?:www\.)?kontrtube\.ru/videos/(?P<id>\d+)/(?P<display_id>[^/]+)/'
_TEST = {
'url': 'http://www.kontrtube.ru/videos/2678/nad-olimpiyskoy-derevney-v-sochi-podnyat-rossiyskiy-flag/',
'md5': '975a991a4926c9a85f383a736a2e6b80',
'info_dict': {
'id': '2678',
+ 'display_id': 'nad-olimpiyskoy-derevney-v-sochi-podnyat-rossiyskiy-flag',
'ext': 'mp4',
'title': 'Над олимпийской деревней в Сочи поднят российский флаг',
'description': 'md5:80edc4c613d5887ae8ccf1d59432be41',
@@ -28,21 +29,28 @@ class KontrTubeIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
- webpage = self._download_webpage(url, video_id, 'Downloading page')
+ webpage = self._download_webpage(
+ url, display_id, 'Downloading page')
- video_url = self._html_search_regex(r"video_url: '(.+?)/?',", webpage, 'video URL')
- thumbnail = self._html_search_regex(r"preview_url: '(.+?)/?',", webpage, 'video thumbnail', fatal=False)
+ video_url = self._html_search_regex(
+ r"video_url\s*:\s*'(.+?)/?',", webpage, 'video URL')
+ thumbnail = self._html_search_regex(
+ r"preview_url\s*:\s*'(.+?)/?',", webpage, 'video thumbnail', fatal=False)
title = self._html_search_regex(
r'<title>(.+?)</title>', webpage, 'video title')
- description = self._html_search_meta('description', webpage, 'video description')
+ description = self._html_search_meta(
+ 'description', webpage, 'video description')
mobj = re.search(
- r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>', webpage)
+ r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>',
+ webpage)
duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None
view_count = self._html_search_regex(
- r'<div class="col_2">Просмотров: <span>(\d+)</span></div>', webpage, 'view count', fatal=False)
+ r'<div class="col_2">Просмотров: <span>(\d+)</span></div>',
+ webpage, 'view count', fatal=False)
comment_count = None
comment_str = self._html_search_regex(
@@ -56,6 +64,7 @@ class KontrTubeIE(InfoExtractor):
return {
'id': video_id,
+ 'display_id': display_id,
'url': video_url,
'thumbnail': thumbnail,
'title': title,
diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py
index 03c4691c6..5247c6f58 100644
--- a/youtube_dl/extractor/livestream.py
+++ b/youtube_dl/extractor/livestream.py
@@ -4,10 +4,12 @@ import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_str,
compat_urllib_parse_urlparse,
compat_urlparse,
+)
+from ..utils import (
ExtractorError,
find_xpath_attr,
int_or_none,
diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py
index d72d470aa..9c2fbdd96 100644
--- a/youtube_dl/extractor/lrt.py
+++ b/youtube_dl/extractor/lrt.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import re
-import json
from .common import InfoExtractor
from ..utils import (
@@ -28,7 +27,6 @@ class LRTIE(InfoExtractor):
'params': {
'skip_download': True, # HLS download
},
-
}
def _real_extract(self, url):
@@ -44,7 +42,9 @@ class LRTIE(InfoExtractor):
formats = []
for js in re.findall(r'(?s)config:\s*(\{.*?\})', webpage):
- data = json.loads(js_to_json(js))
+ data = self._parse_json(js, video_id, transform_source=js_to_json)
+ if 'provider' not in data:
+ continue
if data['provider'] == 'rtmp':
formats.append({
'format_id': 'rtmp',
diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py
index 2160d6cb0..26e84970d 100644
--- a/youtube_dl/extractor/lynda.py
+++ b/youtube_dl/extractor/lynda.py
@@ -5,12 +5,14 @@ import json
from .subtitles import SubtitlesInfoExtractor
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
+ compat_str,
compat_urllib_parse,
compat_urllib_request,
+)
+from ..utils import (
ExtractorError,
int_or_none,
- compat_str,
)
diff --git a/youtube_dl/extractor/malemotion.py b/youtube_dl/extractor/malemotion.py
index 1abf6e4f8..0b85a59d1 100644
--- a/youtube_dl/extractor/malemotion.py
+++ b/youtube_dl/extractor/malemotion.py
@@ -1,43 +1,33 @@
+# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
)
class MalemotionIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?malemotion\.com/video/(.+?)\.(?P<id>.+?)(#|$)'
+ _VALID_URL = r'https?://malemotion\.com/video/(.+?)\.(?P<id>.+?)(#|$)'
_TEST = {
- 'url': 'http://malemotion.com/video/bien-dur.10ew',
- 'file': '10ew.mp4',
- 'md5': 'b3cc49f953b107e4a363cdff07d100ce',
+ 'url': 'http://malemotion.com/video/bete-de-concours.ltc',
+ 'md5': '3013e53a0afbde2878bc39998c33e8a5',
'info_dict': {
- "title": "Bien dur",
- "age_limit": 18,
+ 'id': 'ltc',
+ 'ext': 'mp4',
+ 'title': 'Bête de Concours',
+ 'age_limit': 18,
},
- 'skip': 'This video has been deleted.'
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group("id")
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- self.report_extraction(video_id)
-
- # Extract video URL
- video_url = compat_urllib_parse.unquote(
- self._search_regex(r'<source type="video/mp4" src="(.+?)"', webpage, 'video URL'))
-
- # Extract title
+ video_url = compat_urllib_parse.unquote(self._search_regex(
+ r'<source type="video/mp4" src="(.+?)"', webpage, 'video URL'))
video_title = self._html_search_regex(
r'<title>(.*?)</title', webpage, 'title')
-
- # Extract video thumbnail
video_thumbnail = self._search_regex(
r'<video .+?poster="(.+?)"', webpage, 'thumbnail', fatal=False)
@@ -47,14 +37,12 @@ class MalemotionIE(InfoExtractor):
'format_id': 'mp4',
'preference': 1,
}]
+ self._sort_formats(formats)
return {
'id': video_id,
'formats': formats,
- 'uploader': None,
- 'upload_date': None,
'title': video_title,
'thumbnail': video_thumbnail,
- 'description': None,
'age_limit': 18,
}
diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py
index 858c1c0c3..8bc333b02 100644
--- a/youtube_dl/extractor/metacafe.py
+++ b/youtube_dl/extractor/metacafe.py
@@ -3,10 +3,12 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_parse_qs,
compat_urllib_parse,
compat_urllib_request,
+)
+from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
diff --git a/youtube_dl/extractor/minhateca.py b/youtube_dl/extractor/minhateca.py
new file mode 100644
index 000000000..14934b7ec
--- /dev/null
+++ b/youtube_dl/extractor/minhateca.py
@@ -0,0 +1,72 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+ compat_urllib_request,
+)
+from ..utils import (
+ int_or_none,
+ parse_duration,
+ parse_filesize,
+)
+
+
+class MinhatecaIE(InfoExtractor):
+ _VALID_URL = r'https?://minhateca\.com\.br/[^?#]+,(?P<id>[0-9]+)\.'
+ _TEST = {
+ 'url': 'http://minhateca.com.br/pereba/misc/youtube-dl+test+video,125848331.mp4(video)',
+ 'info_dict': {
+ 'id': '125848331',
+ 'ext': 'mp4',
+ 'title': 'youtube-dl test video',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'filesize_approx': 1530000,
+ 'duration': 9,
+ 'view_count': int,
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ token = self._html_search_regex(
+ r'<input name="__RequestVerificationToken".*?value="([^"]+)"',
+ webpage, 'request token')
+ token_data = [
+ ('fileId', video_id),
+ ('__RequestVerificationToken', token),
+ ]
+ req = compat_urllib_request.Request(
+ 'http://minhateca.com.br/action/License/Download',
+ data=compat_urllib_parse.urlencode(token_data))
+ req.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ data = self._download_json(
+ req, video_id, note='Downloading metadata')
+
+ video_url = data['redirectUrl']
+ title_str = self._html_search_regex(
+ r'<h1.*?>(.*?)</h1>', webpage, 'title')
+ title, _, ext = title_str.rpartition('.')
+ filesize_approx = parse_filesize(self._html_search_regex(
+ r'<p class="fileSize">(.*?)</p>',
+ webpage, 'file size approximation', fatal=False))
+ duration = parse_duration(self._html_search_regex(
+ r'(?s)<p class="fileLeng[ht][th]">.*?class="bold">(.*?)<',
+ webpage, 'duration', fatal=False))
+ view_count = int_or_none(self._html_search_regex(
+ r'<p class="downloadsCounter">([0-9]+)</p>',
+ webpage, 'view count', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'ext': ext,
+ 'filesize_approx': filesize_approx,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ }
diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py
index 807b1dc89..3c61a850f 100644
--- a/youtube_dl/extractor/mit.py
+++ b/youtube_dl/extractor/mit.py
@@ -5,8 +5,10 @@ import json
from .common import InfoExtractor
from .youtube import YoutubeIE
-from ..utils import (
+from ..compat import (
compat_urlparse,
+)
+from ..utils import (
clean_html,
ExtractorError,
get_element_by_id,
@@ -15,7 +17,7 @@ from ..utils import (
class TechTVMITIE(InfoExtractor):
IE_NAME = 'techtv.mit.edu'
- _VALID_URL = r'https?://techtv\.mit\.edu/(videos|embeds)/(?P<id>\d+)'
+ _VALID_URL = r'https?://techtv\.mit\.edu/(?:videos|embeds)/(?P<id>\d+)'
_TEST = {
'url': 'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
@@ -29,8 +31,7 @@ class TechTVMITIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
raw_page = self._download_webpage(
'http://techtv.mit.edu/videos/%s' % video_id, video_id)
clean_page = re.compile(r'<!--.*?-->', re.S).sub('', raw_page)
@@ -104,7 +105,10 @@ class OCWMITIE(InfoExtractor):
'ext': 'mp4',
'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence',
'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.',
- #'subtitles': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/MIT6_041F11_lec07_300k.mp4.srt'
+ 'upload_date': '20121109',
+ 'uploader_id': 'MIT',
+ 'uploader': 'MIT OpenCourseWare',
+ # 'subtitles': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/MIT6_041F11_lec07_300k.mp4.srt'
}
},
{
@@ -113,8 +117,11 @@ class OCWMITIE(InfoExtractor):
'id': '7K1sB05pE0A',
'ext': 'mp4',
'title': 'Session 1: Introduction to Derivatives',
+ 'upload_date': '20090818',
+ 'uploader_id': 'MIT',
+ 'uploader': 'MIT OpenCourseWare',
'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.',
- #'subtitles': 'http://ocw.mit.edu//courses/mathematics/18-01sc-single-variable-calculus-fall-2010/ocw-18.01-f07-lec01_300k.SRT'
+ # 'subtitles': 'http://ocw.mit.edu//courses/mathematics/18-01sc-single-variable-calculus-fall-2010/ocw-18.01-f07-lec01_300k.SRT'
}
}
]
diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py
index 6691521e5..256758323 100644
--- a/youtube_dl/extractor/mitele.py
+++ b/youtube_dl/extractor/mitele.py
@@ -1,12 +1,13 @@
from __future__ import unicode_literals
-import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urlparse,
+)
+from ..utils import (
get_element_by_attribute,
parse_duration,
strip_jsonp,
@@ -15,7 +16,7 @@ from ..utils import (
class MiTeleIE(InfoExtractor):
IE_NAME = 'mitele.es'
- _VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<episode>[^/]+)/'
+ _VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/'
_TEST = {
'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/',
@@ -31,12 +32,10 @@ class MiTeleIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- episode = mobj.group('episode')
+ episode = self._match_id(url)
webpage = self._download_webpage(url, episode)
embed_data_json = self._search_regex(
- r'MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data',
- flags=re.DOTALL
+ r'(?s)MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data',
).replace('\'', '"')
embed_data = json.loads(embed_data_json)
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index bb8937c4d..07d194562 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -3,8 +3,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
+)
+from ..utils import (
ExtractorError,
HEADRequest,
int_or_none,
@@ -70,7 +72,7 @@ class MixcloudIE(InfoExtractor):
raise ExtractorError('Unable to extract track url')
PREFIX = (
- r'<div class="cloudcast-play-button-container[^"]*?"'
+ r'<span class="play-button[^"]*?"'
r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+')
title = self._html_search_regex(
PREFIX + r'm-title="([^"]+)"', webpage, 'title')
diff --git a/youtube_dl/extractor/moevideo.py b/youtube_dl/extractor/moevideo.py
index 2ff79b9b8..5a66302f6 100644
--- a/youtube_dl/extractor/moevideo.py
+++ b/youtube_dl/extractor/moevideo.py
@@ -5,10 +5,12 @@ import json
import re
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
+)
+from ..utils import (
+ ExtractorError,
int_or_none,
)
@@ -50,7 +52,8 @@ class MoeVideoIE(InfoExtractor):
'height': 296,
'duration': 6027,
'filesize': 588257923,
- }
+ },
+ 'skip': 'Video has been removed',
},
]
diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py
index d658647e6..2cec12d35 100644
--- a/youtube_dl/extractor/mofosex.py
+++ b/youtube_dl/extractor/mofosex.py
@@ -4,7 +4,7 @@ import os
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse_urlparse,
compat_urllib_request,
compat_urllib_parse,
@@ -12,7 +12,7 @@ from ..utils import (
class MofosexIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:www\.)?(?P<url>mofosex\.com/videos/(?P<videoid>[0-9]+)/.*?\.html)'
+ _VALID_URL = r'https?://(?:www\.)?(?P<url>mofosex\.com/videos/(?P<id>[0-9]+)/.*?\.html)'
_TEST = {
'url': 'http://www.mofosex.com/videos/5018/japanese-teen-music-video.html',
'md5': '1b2eb47ac33cc75d4a80e3026b613c5a',
@@ -26,7 +26,7 @@ class MofosexIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('videoid')
+ video_id = mobj.group('id')
url = 'http://www.' + mobj.group('url')
req = compat_urllib_request.Request(url)
diff --git a/youtube_dl/extractor/moniker.py b/youtube_dl/extractor/moniker.py
index 1c4f589cc..5de719bdc 100644
--- a/youtube_dl/extractor/moniker.py
+++ b/youtube_dl/extractor/moniker.py
@@ -5,7 +5,7 @@ import os.path
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
)
@@ -37,10 +37,9 @@ class MonikerIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
orig_webpage = self._download_webpage(url, video_id)
+
fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', orig_webpage)
data = dict(fields)
diff --git a/youtube_dl/extractor/mooshare.py b/youtube_dl/extractor/mooshare.py
index 34a4bec3a..7603af5e2 100644
--- a/youtube_dl/extractor/mooshare.py
+++ b/youtube_dl/extractor/mooshare.py
@@ -1,14 +1,15 @@
from __future__ import unicode_literals
import re
-import time
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
+from ..compat import (
compat_urllib_request,
compat_urllib_parse,
)
+from ..utils import (
+ ExtractorError,
+)
class MooshareIE(InfoExtractor):
@@ -43,9 +44,7 @@ class MooshareIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
page = self._download_webpage(url, video_id, 'Downloading page')
if re.search(r'>Video Not Found or Deleted<', page) is not None:
@@ -64,8 +63,7 @@ class MooshareIE(InfoExtractor):
'http://mooshare.biz/%s' % video_id, compat_urllib_parse.urlencode(download_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
- self.to_screen('%s: Waiting for timeout' % video_id)
- time.sleep(5)
+ self._sleep(5, video_id)
video_page = self._download_webpage(request, video_id, 'Downloading video page')
diff --git a/youtube_dl/extractor/motorsport.py b/youtube_dl/extractor/motorsport.py
index 7c0ec6a12..c1a482dba 100644
--- a/youtube_dl/extractor/motorsport.py
+++ b/youtube_dl/extractor/motorsport.py
@@ -1,63 +1,49 @@
# coding: utf-8
from __future__ import unicode_literals
-import hashlib
-import json
-import re
-import time
-
from .common import InfoExtractor
-from ..utils import (
- compat_parse_qs,
- compat_str,
- int_or_none,
+from ..compat import (
+ compat_urlparse,
)
class MotorsportIE(InfoExtractor):
IE_DESC = 'motorsport.com'
- _VALID_URL = r'http://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/(?:$|[?#])'
+ _VALID_URL = r'http://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/?(?:$|[?#])'
_TEST = {
'url': 'http://www.motorsport.com/f1/video/main-gallery/red-bull-racing-2014-rules-explained/',
- 'md5': '5592cb7c5005d9b2c163df5ac3dc04e4',
'info_dict': {
- 'id': '7063',
+ 'id': '2-T3WuR-KMM',
'ext': 'mp4',
'title': 'Red Bull Racing: 2014 Rules Explained',
- 'duration': 207,
+ 'duration': 208,
'description': 'A new clip from Red Bull sees Daniel Ricciardo and Sebastian Vettel explain the 2014 Formula One regulations – which are arguably the most complex the sport has ever seen.',
- 'uploader': 'rainiere',
- 'thumbnail': r're:^http://.*motorsport\.com/.+\.jpg$'
- }
+ 'uploader': 'mcomstaff',
+ 'uploader_id': 'UC334JIYKkVnyFoNCclfZtHQ',
+ 'upload_date': '20140903',
+ 'thumbnail': r're:^https?://.+\.jpg$'
+ },
+ 'add_ie': ['Youtube'],
+ 'params': {
+ 'skip_download': True,
+ },
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('id')
-
+ display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- flashvars_code = self._html_search_regex(
- r'<embed id="player".*?flashvars="([^"]+)"', webpage, 'flashvars')
- flashvars = compat_parse_qs(flashvars_code)
- params = json.loads(flashvars['parameters'][0])
-
- e = compat_str(int(time.time()) + 24 * 60 * 60)
- base_video_url = params['location'] + '?e=' + e
- s = 'h3hg713fh32'
- h = hashlib.md5((s + base_video_url).encode('utf-8')).hexdigest()
- video_url = base_video_url + '&h=' + h
- uploader = self._html_search_regex(
- r'(?s)<span class="label">Video by: </span>(.*?)</a>', webpage,
- 'uploader', fatal=False)
+ iframe_path = self._html_search_regex(
+ r'<iframe id="player_iframe"[^>]+src="([^"]+)"', webpage,
+ 'iframe path')
+ iframe = self._download_webpage(
+ compat_urlparse.urljoin(url, iframe_path), display_id,
+ 'Downloading iframe')
+ youtube_id = self._search_regex(
+ r'www.youtube.com/embed/(.{11})', iframe, 'youtube id')
return {
- 'id': params['video_id'],
+ '_type': 'url_transparent',
'display_id': display_id,
- 'title': params['title'],
- 'url': video_url,
- 'description': params.get('description'),
- 'thumbnail': params.get('main_thumb'),
- 'duration': int_or_none(params.get('duration')),
- 'uploader': uploader,
+ 'url': 'https://youtube.com/watch?v=%s' % youtube_id,
}
diff --git a/youtube_dl/extractor/movieclips.py b/youtube_dl/extractor/movieclips.py
index 456807dd1..04e17d055 100644
--- a/youtube_dl/extractor/movieclips.py
+++ b/youtube_dl/extractor/movieclips.py
@@ -3,9 +3,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_str,
+)
from ..utils import (
ExtractorError,
- compat_str,
clean_html,
)
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index b482d6d4d..5ebc78033 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -3,9 +3,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
+)
+from ..utils import (
ExtractorError,
find_xpath_attr,
fix_xml_ampersands,
diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py
index e62614670..83414a232 100644
--- a/youtube_dl/extractor/myspace.py
+++ b/youtube_dl/extractor/myspace.py
@@ -88,6 +88,7 @@ class MySpaceIE(InfoExtractor):
self.report_warning(
'%s: No downloadable song on this page' % video_id)
return
+
def search_data(name):
return self._search_regex(
r'''data-%s=([\'"])(?P<data>.*?)\1''' % name,
diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py
index 51e540814..5b9b9fbcd 100644
--- a/youtube_dl/extractor/myspass.py
+++ b/youtube_dl/extractor/myspass.py
@@ -2,9 +2,10 @@ from __future__ import unicode_literals
import os.path
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse_urlparse,
-
+)
+from ..utils import (
ExtractorError,
)
diff --git a/youtube_dl/extractor/myvidster.py b/youtube_dl/extractor/myvidster.py
new file mode 100644
index 000000000..a94ab8358
--- /dev/null
+++ b/youtube_dl/extractor/myvidster.py
@@ -0,0 +1,29 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class MyVidsterIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?myvidster\.com/video/(?P<id>\d+)/'
+
+ _TEST = {
+ 'url': 'http://www.myvidster.com/video/32059805/Hot_chemistry_with_raw_love_making',
+ 'md5': '95296d0231c1363222c3441af62dc4ca',
+ 'info_dict': {
+ 'id': '3685814',
+ 'title': 'md5:7d8427d6d02c4fbcef50fe269980c749',
+ 'upload_date': '20141027',
+ 'uploader_id': 'utkualp',
+ 'ext': 'mp4',
+ 'age_limit': 18,
+ },
+ 'add_ie': ['XHamster'],
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ return self.url_result(self._html_search_regex(
+ r'rel="videolink" href="(?P<real_url>.*)">',
+ webpage, 'real video url'))
diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py
index fbe34defd..c10405f04 100644
--- a/youtube_dl/extractor/naver.py
+++ b/youtube_dl/extractor/naver.py
@@ -4,8 +4,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
+)
+from ..utils import (
ExtractorError,
clean_html,
)
@@ -26,9 +28,9 @@ class NaverIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1)
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+
m_id = re.search(r'var rmcPlayer = new nhn.rmcnmv.RMCVideoPlayer\("(.+?)", "(.+?)"',
webpage)
if m_id is None:
diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py
index f69fe0925..862b706bf 100644
--- a/youtube_dl/extractor/nba.py
+++ b/youtube_dl/extractor/nba.py
@@ -1,7 +1,5 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..utils import (
remove_end,
@@ -10,8 +8,8 @@ from ..utils import (
class NBAIE(InfoExtractor):
- _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?:nba/)?video(?P<id>/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
- _TEST = {
+ _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?:nba/)?video(?P<id>/[^?]*?)/?(?:/index\.html)?(?:\?.*)?$'
+ _TESTS = [{
'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html',
'md5': 'c0edcfc37607344e2ff8f13c378c88a4',
'info_dict': {
@@ -21,12 +19,13 @@ class NBAIE(InfoExtractor):
'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.',
'duration': 181,
},
- }
+ }, {
+ 'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_url = 'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
@@ -37,7 +36,7 @@ class NBAIE(InfoExtractor):
description = self._og_search_description(webpage)
duration = parse_duration(
- self._html_search_meta('duration', webpage, 'duration', fatal=False))
+ self._html_search_meta('duration', webpage, 'duration'))
return {
'id': shortened_video_id,
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index 7b5449031..690c46b6a 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -4,31 +4,47 @@ import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_str,
+)
+from ..utils import (
ExtractorError,
find_xpath_attr,
)
class NBCIE(InfoExtractor):
- _VALID_URL = r'http://www\.nbc\.com/[^/]+/video/[^/]+/(?P<id>n?\d+)'
-
- _TEST = {
- 'url': 'http://www.nbc.com/chicago-fire/video/i-am-a-firefighter/2734188',
- # md5 checksum is not stable
- 'info_dict': {
- 'id': 'bTmnLCvIbaaH',
- 'ext': 'flv',
- 'title': 'I Am a Firefighter',
- 'description': 'An emergency puts Dawson\'sf irefighter skills to the ultimate test in this four-part digital series.',
+ _VALID_URL = r'http://www\.nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)'
+
+ _TESTS = [
+ {
+ 'url': 'http://www.nbc.com/chicago-fire/video/i-am-a-firefighter/2734188',
+ # md5 checksum is not stable
+ 'info_dict': {
+ 'id': 'bTmnLCvIbaaH',
+ 'ext': 'flv',
+ 'title': 'I Am a Firefighter',
+ 'description': 'An emergency puts Dawson\'sf irefighter skills to the ultimate test in this four-part digital series.',
+ },
},
- }
+ {
+ 'url': 'http://www.nbc.com/the-tonight-show/episodes/176',
+ 'info_dict': {
+ 'id': 'XwU9KZkp98TH',
+ 'ext': 'flv',
+ 'title': 'Ricky Gervais, Steven Van Zandt, ILoveMakonnen',
+ 'description': 'A brand new episode of The Tonight Show welcomes Ricky Gervais, Steven Van Zandt and ILoveMakonnen.',
+ },
+ 'skip': 'Only works from US',
+ },
+ ]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- theplatform_url = self._search_regex('class="video-player video-player-full" data-mpx-url="(.*?)"', webpage, 'theplatform url')
+ theplatform_url = self._search_regex(
+ '(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"',
+ webpage, 'theplatform url').replace('_no_endcard', '')
if theplatform_url.startswith('//'):
theplatform_url = 'http:' + theplatform_url
return self.url_result(theplatform_url)
diff --git a/youtube_dl/extractor/nerdcubed.py b/youtube_dl/extractor/nerdcubed.py
new file mode 100644
index 000000000..efc903afa
--- /dev/null
+++ b/youtube_dl/extractor/nerdcubed.py
@@ -0,0 +1,35 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import datetime
+
+from .common import InfoExtractor
+
+
+class NerdCubedFeedIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?nerdcubed\.co\.uk/feed\.json'
+ _TEST = {
+ 'url': 'http://www.nerdcubed.co.uk/feed.json',
+ 'info_dict': {
+ 'title': 'nerdcubed.co.uk feed',
+ },
+ 'playlist_mincount': 1300,
+ }
+
+ def _real_extract(self, url):
+ feed = self._download_json(url, url, "Downloading NerdCubed JSON feed")
+
+ entries = [{
+ '_type': 'url',
+ 'title': feed_entry['title'],
+ 'uploader': feed_entry['source']['name'] if feed_entry['source'] else None,
+ 'upload_date': datetime.datetime.strptime(feed_entry['date'], '%Y-%m-%d').strftime('%Y%m%d'),
+ 'url': "http://www.youtube.com/watch?v=" + feed_entry['youtube_id'],
+ } for feed_entry in feed]
+
+ return {
+ '_type': 'playlist',
+ 'title': 'nerdcubed.co.uk feed',
+ 'id': 'nerdcubed-feed',
+ 'entries': entries,
+ }
diff --git a/youtube_dl/extractor/netzkino.py b/youtube_dl/extractor/netzkino.py
new file mode 100644
index 000000000..93567d1e3
--- /dev/null
+++ b/youtube_dl/extractor/netzkino.py
@@ -0,0 +1,86 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ int_or_none,
+ js_to_json,
+ parse_iso8601,
+)
+
+
+class NetzkinoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?netzkino\.de/\#!/(?P<category>[^/]+)/(?P<id>[^/]+)'
+
+ _TEST = {
+ 'url': 'http://www.netzkino.de/#!/scifikino/rakete-zum-mond',
+ 'md5': '92a3f8b76f8d7220acce5377ea5d4873',
+ 'info_dict': {
+ 'id': 'rakete-zum-mond',
+ 'ext': 'mp4',
+ 'title': 'Rakete zum Mond (Endstation Mond, Destination Moon)',
+ 'comments': 'mincount:3',
+ 'description': 'md5:1eddeacc7e62d5a25a2d1a7290c64a28',
+ 'upload_date': '20120813',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'timestamp': 1344858571,
+ 'age_limit': 12,
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ category_id = mobj.group('category')
+ video_id = mobj.group('id')
+
+ api_url = 'http://api.netzkino.de.simplecache.net/capi-2.0a/categories/%s.json?d=www' % category_id
+ api_info = self._download_json(api_url, video_id)
+ info = next(
+ p for p in api_info['posts'] if p['slug'] == video_id)
+ custom_fields = info['custom_fields']
+
+ production_js = self._download_webpage(
+ 'http://www.netzkino.de/beta/dist/production.min.js', video_id,
+ note='Downloading player code')
+ avo_js = self._search_regex(
+ r'window\.avoCore\s*=.*?urlTemplate:\s*(\{.*?"\})',
+ production_js, 'URL templates')
+ templates = self._parse_json(
+ avo_js, video_id, transform_source=js_to_json)
+
+ suffix = {
+ 'hds': '.mp4/manifest.f4m',
+ 'hls': '.mp4/master.m3u8',
+ 'pmd': '.mp4',
+ }
+ film_fn = custom_fields['Streaming'][0]
+ formats = [{
+ 'format_id': key,
+ 'ext': 'mp4',
+ 'url': tpl.replace('{}', film_fn) + suffix[key],
+ } for key, tpl in templates.items()]
+ self._sort_formats(formats)
+
+ comments = [{
+ 'timestamp': parse_iso8601(c.get('date'), delimiter=' '),
+ 'id': c['id'],
+ 'author': c['name'],
+ 'html': c['content'],
+ 'parent': 'root' if c.get('parent', 0) == 0 else c['parent'],
+ } for c in info.get('comments', [])]
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'comments': comments,
+ 'title': info['title'],
+ 'age_limit': int_or_none(custom_fields.get('FSK')[0]),
+ 'timestamp': parse_iso8601(info.get('date'), delimiter=' '),
+ 'description': clean_html(info.get('content')),
+ 'thumbnail': info.get('thumbnail'),
+ 'playlist_title': api_info.get('title'),
+ 'playlist_id': category_id,
+ }
diff --git a/youtube_dl/extractor/nfb.py b/youtube_dl/extractor/nfb.py
index 7ce1d481d..ea077254b 100644
--- a/youtube_dl/extractor/nfb.py
+++ b/youtube_dl/extractor/nfb.py
@@ -1,9 +1,7 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_request,
compat_urllib_parse,
)
@@ -12,7 +10,7 @@ from ..utils import (
class NFBIE(InfoExtractor):
IE_NAME = 'nfb'
IE_DESC = 'National Film Board of Canada'
- _VALID_URL = r'https?://(?:www\.)?(nfb|onf)\.ca/film/(?P<id>[\da-z_-]+)'
+ _VALID_URL = r'https?://(?:www\.)?(?:nfb|onf)\.ca/film/(?P<id>[\da-z_-]+)'
_TEST = {
'url': 'https://www.nfb.ca/film/qallunaat_why_white_people_are_funny',
@@ -32,10 +30,10 @@ class NFBIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- page = self._download_webpage('https://www.nfb.ca/film/%s' % video_id, video_id, 'Downloading film page')
+ video_id = self._match_id(url)
+ page = self._download_webpage(
+ 'https://www.nfb.ca/film/%s' % video_id, video_id,
+ 'Downloading film page')
uploader_id = self._html_search_regex(r'<a class="director-link" href="/explore-all-directors/([^/]+)/"',
page, 'director id', fatal=False)
diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py
index cc7c921c3..606e2294e 100644
--- a/youtube_dl/extractor/nfl.py
+++ b/youtube_dl/extractor/nfl.py
@@ -4,9 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse_urlparse,
+)
from ..utils import (
ExtractorError,
- compat_urllib_parse_urlparse,
int_or_none,
remove_end,
)
diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py
index 0244368e9..d3a4fc513 100644
--- a/youtube_dl/extractor/nhl.py
+++ b/youtube_dl/extractor/nhl.py
@@ -2,6 +2,7 @@ from __future__ import unicode_literals
import re
import json
+import os
from .common import InfoExtractor
from ..compat import (
@@ -26,7 +27,8 @@ class NHLBaseInfoExtractor(InfoExtractor):
initial_video_url = info['publishPoint']
if info['formats'] == '1':
parsed_url = compat_urllib_parse_urlparse(initial_video_url)
- path = parsed_url.path.replace('.', '_sd.', 1)
+ filename, ext = os.path.splitext(parsed_url.path)
+ path = '%s_sd%s' % (filename, ext)
data = compat_urllib_parse.urlencode({
'type': 'fvod',
'path': compat_urlparse.urlunparse(parsed_url[:2] + (path,) + parsed_url[3:])
@@ -52,7 +54,7 @@ class NHLBaseInfoExtractor(InfoExtractor):
class NHLIE(NHLBaseInfoExtractor):
IE_NAME = 'nhl.com'
- _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/console(?:\?(?:.*?[?&])?)id=(?P<id>[0-9a-z-]+)'
+ _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/console(?:\?(?:.*?[?&])?)id=(?P<id>[-0-9a-zA-Z]+)'
_TESTS = [{
'url': 'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614',
diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py
index 1d9c1a096..4c1890416 100644
--- a/youtube_dl/extractor/niconico.py
+++ b/youtube_dl/extractor/niconico.py
@@ -5,14 +5,16 @@ import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
compat_urlparse,
- unified_strdate,
- parse_duration,
- int_or_none,
+)
+from ..utils import (
ExtractorError,
+ int_or_none,
+ parse_duration,
+ unified_strdate,
)
diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py
index 16a02ad79..7f842b5c2 100644
--- a/youtube_dl/extractor/ninegag.py
+++ b/youtube_dl/extractor/ninegag.py
@@ -23,6 +23,9 @@ class NineGagIE(InfoExtractor):
"ext": "mp4",
"description": "This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)",
"title": "\"People Are Awesome 2013\" Is Absolutely Awesome",
+ 'uploader_id': 'UCdEH6EjDKwtTe-sO2f0_1XA',
+ 'uploader': 'CompilationChannel',
+ 'upload_date': '20131110',
"view_count": int,
"thumbnail": "re:^https?://",
},
@@ -35,6 +38,9 @@ class NineGagIE(InfoExtractor):
'display_id': 'alternate-banned-opening-scene-of-gravity',
"description": "While Gravity was a pretty awesome movie already, YouTuber Krishna Shenoi came up with a way to improve upon it, introducing a much better solution to Sandra Bullock's seemingly endless tumble in space. The ending is priceless.",
'title': "Banned Opening Scene Of \"Gravity\" That Changes The Whole Movie",
+ 'uploader': 'Krishna Shenoi',
+ 'upload_date': '20140401',
+ 'uploader_id': 'krishnashenoi93',
},
}]
diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py
index 7d2ff7b9a..251e6da07 100644
--- a/youtube_dl/extractor/noco.py
+++ b/youtube_dl/extractor/noco.py
@@ -6,13 +6,15 @@ import time
import hashlib
from .common import InfoExtractor
-from ..utils import (
- compat_urllib_request,
+from ..compat import (
+ compat_str,
compat_urllib_parse,
- ExtractorError,
+ compat_urllib_request,
+)
+from ..utils import (
clean_html,
+ ExtractorError,
unified_strdate,
- compat_str,
)
diff --git a/youtube_dl/extractor/normalboots.py b/youtube_dl/extractor/normalboots.py
index 3d35b11ac..c13ff0d65 100644
--- a/youtube_dl/extractor/normalboots.py
+++ b/youtube_dl/extractor/normalboots.py
@@ -22,7 +22,11 @@ class NormalbootsIE(InfoExtractor):
'description': 'Jon is late for Christmas. Typical. Thanks to: Paul Ritchey for Co-Writing/Filming: http://www.youtube.com/user/ContinueShow Michael Azzi for Christmas Intro Animation: http://michafrar.tumblr.com/ Jerrod Waters for Christmas Intro Music: http://www.youtube.com/user/xXJerryTerryXx Casey Ormond for ‘Tense Battle Theme’:\xa0http://www.youtube.com/Kiamet/',
'uploader': 'JonTron',
'upload_date': '20140125',
- }
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
}
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/nosvideo.py b/youtube_dl/extractor/nosvideo.py
index f3be8f552..f5ef856db 100644
--- a/youtube_dl/extractor/nosvideo.py
+++ b/youtube_dl/extractor/nosvideo.py
@@ -4,9 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_request,
+)
from ..utils import (
ExtractorError,
- compat_urllib_request,
urlencode_postdata,
xpath_text,
xpath_with_ns,
@@ -32,8 +34,7 @@ class NosVideoIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
fields = {
'id': video_id,
diff --git a/youtube_dl/extractor/novamov.py b/youtube_dl/extractor/novamov.py
index 38d05e466..04d779890 100644
--- a/youtube_dl/extractor/novamov.py
+++ b/youtube_dl/extractor/novamov.py
@@ -3,9 +3,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_urlparse,
+)
from ..utils import (
ExtractorError,
- compat_urlparse
)
diff --git a/youtube_dl/extractor/nowvideo.py b/youtube_dl/extractor/nowvideo.py
index ecb38de2d..dec09cdfe 100644
--- a/youtube_dl/extractor/nowvideo.py
+++ b/youtube_dl/extractor/nowvideo.py
@@ -7,7 +7,7 @@ class NowVideoIE(NovaMovIE):
IE_NAME = 'nowvideo'
IE_DESC = 'NowVideo'
- _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'nowvideo\.(?:ch|sx|eu|at|ag|co)'}
+ _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'nowvideo\.(?:ch|sx|eu|at|ag|co|li)'}
_HOST = 'www.nowvideo.ch'
diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py
index ce31694a5..8da76ae45 100644
--- a/youtube_dl/extractor/npo.py
+++ b/youtube_dl/extractor/npo.py
@@ -9,6 +9,7 @@ from ..utils import (
qualities,
strip_jsonp,
url_basename,
+ fix_xml_ampersands,
)
@@ -51,7 +52,21 @@ class NPOIE(InfoExtractor):
'upload_date': '20130225',
'duration': 3000,
},
- }
+ },
+ {
+ 'url': 'http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706',
+ 'info_dict': {
+ 'id': 'WO_VPRO_043706',
+ 'ext': 'wmv',
+ 'title': 'De nieuwe mens - Deel 1',
+ 'description': 'md5:518ae51ba1293ffb80d8d8ce90b74e4b',
+ 'duration': 4680,
+ },
+ 'params': {
+ # mplayer mms download
+ 'skip_download': True,
+ }
+ },
]
def _real_extract(self, url):
@@ -74,31 +89,58 @@ class NPOIE(InfoExtractor):
token = self._search_regex(r'npoplayer\.token = "(.+?)"', token_page, 'token')
formats = []
- quality = qualities(['adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std'])
- for format_id in metadata['pubopties']:
- format_info = self._download_json(
- 'http://ida.omroep.nl/odi/?prid=%s&puboptions=%s&adaptive=yes&token=%s' % (video_id, format_id, token),
- video_id, 'Downloading %s JSON' % format_id)
- if format_info.get('error_code', 0) or format_info.get('errorcode', 0):
- continue
- streams = format_info.get('streams')
- if streams:
- video_info = self._download_json(
- streams[0] + '&type=json',
- video_id, 'Downloading %s stream JSON' % format_id)
- else:
- video_info = format_info
- video_url = video_info.get('url')
- if not video_url:
- continue
- if format_id == 'adaptive':
- formats.extend(self._extract_m3u8_formats(video_url, video_id))
- else:
+
+ pubopties = metadata.get('pubopties')
+ if pubopties:
+ quality = qualities(['adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std'])
+ for format_id in pubopties:
+ format_info = self._download_json(
+ 'http://ida.omroep.nl/odi/?prid=%s&puboptions=%s&adaptive=yes&token=%s'
+ % (video_id, format_id, token),
+ video_id, 'Downloading %s JSON' % format_id)
+ if format_info.get('error_code', 0) or format_info.get('errorcode', 0):
+ continue
+ streams = format_info.get('streams')
+ if streams:
+ video_info = self._download_json(
+ streams[0] + '&type=json',
+ video_id, 'Downloading %s stream JSON' % format_id)
+ else:
+ video_info = format_info
+ video_url = video_info.get('url')
+ if not video_url:
+ continue
+ if format_id == 'adaptive':
+ formats.extend(self._extract_m3u8_formats(video_url, video_id))
+ else:
+ formats.append({
+ 'url': video_url,
+ 'format_id': format_id,
+ 'quality': quality(format_id),
+ })
+
+ streams = metadata.get('streams')
+ if streams:
+ for i, stream in enumerate(streams):
+ stream_url = stream.get('url')
+ if not stream_url:
+ continue
+ asx = self._download_xml(
+ stream_url, video_id,
+ 'Downloading stream %d ASX playlist' % i,
+ transform_source=fix_xml_ampersands)
+ ref = asx.find('./ENTRY/Ref')
+ if ref is None:
+ continue
+ video_url = ref.get('href')
+ if not video_url:
+ continue
formats.append({
'url': video_url,
- 'format_id': format_id,
- 'quality': quality(format_id),
+ 'ext': stream.get('formaat', 'asf'),
+ 'quality': stream.get('kwaliteit'),
})
+
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py
index 96f0ae1eb..321ce5ce7 100644
--- a/youtube_dl/extractor/nrk.py
+++ b/youtube_dl/extractor/nrk.py
@@ -72,12 +72,12 @@ class NRKIE(InfoExtractor):
class NRKTVIE(InfoExtractor):
- _VALID_URL = r'http://tv\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})'
+ _VALID_URL = r'http://tv\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'
_TESTS = [
{
'url': 'http://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
- 'md5': '7b96112fbae1faf09a6f9ae1aff6cb84',
+ 'md5': 'adf2c5454fa2bf032f47a9f8fb351342',
'info_dict': {
'id': 'MUHH48000314',
'ext': 'flv',
@@ -85,11 +85,11 @@ class NRKTVIE(InfoExtractor):
'description': 'md5:bdea103bc35494c143c6a9acdd84887a',
'upload_date': '20140523',
'duration': 1741.52,
- }
+ },
},
{
'url': 'http://tv.nrk.no/program/mdfp15000514',
- 'md5': 'af01795a31f1cf7265c8657534d8077b',
+ 'md5': '383650ece2b25ecec996ad7b5bb2a384',
'info_dict': {
'id': 'mdfp15000514',
'ext': 'flv',
@@ -97,39 +97,119 @@ class NRKTVIE(InfoExtractor):
'description': 'md5:654c12511f035aed1e42bdf5db3b206a',
'upload_date': '20140524',
'duration': 4605.0,
- }
+ },
},
+ {
+ # single playlist video
+ 'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2',
+ 'md5': 'adbd1dbd813edaf532b0a253780719c2',
+ 'info_dict': {
+ 'id': 'MSPO40010515-part2',
+ 'ext': 'flv',
+ 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)',
+ 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+ 'upload_date': '20150106',
+ },
+ 'skip': 'Only works from Norway',
+ },
+ {
+ 'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015',
+ 'playlist': [
+ {
+ 'md5': '9480285eff92d64f06e02a5367970a7a',
+ 'info_dict': {
+ 'id': 'MSPO40010515-part1',
+ 'ext': 'flv',
+ 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)',
+ 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+ 'upload_date': '20150106',
+ },
+ },
+ {
+ 'md5': 'adbd1dbd813edaf532b0a253780719c2',
+ 'info_dict': {
+ 'id': 'MSPO40010515-part2',
+ 'ext': 'flv',
+ 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)',
+ 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+ 'upload_date': '20150106',
+ },
+ },
+ ],
+ 'info_dict': {
+ 'id': 'MSPO40010515',
+ 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn',
+ 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+ 'upload_date': '20150106',
+ 'duration': 6947.5199999999995,
+ },
+ 'skip': 'Only works from Norway',
+ }
]
+ def _extract_f4m(self, manifest_url, video_id):
+ return self._extract_f4m_formats(manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id)
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
-
- page = self._download_webpage(url, video_id)
-
- title = self._html_search_meta('title', page, 'title')
- description = self._html_search_meta('description', page, 'description')
- thumbnail = self._html_search_regex(r'data-posterimage="([^"]+)"', page, 'thumbnail', fatal=False)
- upload_date = unified_strdate(self._html_search_meta('rightsfrom', page, 'upload date', fatal=False))
- duration = float_or_none(
- self._html_search_regex(r'data-duration="([^"]+)"', page, 'duration', fatal=False))
+ part_id = mobj.group('part_id')
+
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_search_meta(
+ 'title', webpage, 'title')
+ description = self._html_search_meta(
+ 'description', webpage, 'description')
+
+ thumbnail = self._html_search_regex(
+ r'data-posterimage="([^"]+)"',
+ webpage, 'thumbnail', fatal=False)
+ upload_date = unified_strdate(self._html_search_meta(
+ 'rightsfrom', webpage, 'upload date', fatal=False))
+ duration = float_or_none(self._html_search_regex(
+ r'data-duration="([^"]+)"',
+ webpage, 'duration', fatal=False))
+
+ # playlist
+ parts = re.findall(
+ r'<a href="#del=(\d+)"[^>]+data-argument="([^"]+)">([^<]+)</a>', webpage)
+ if parts:
+ entries = []
+ for current_part_id, stream_url, part_title in parts:
+ if part_id and current_part_id != part_id:
+ continue
+ video_part_id = '%s-part%s' % (video_id, current_part_id)
+ formats = self._extract_f4m(stream_url, video_part_id)
+ entries.append({
+ 'id': video_part_id,
+ 'title': part_title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'formats': formats,
+ })
+ if part_id:
+ if entries:
+ return entries[0]
+ else:
+ playlist = self.playlist_result(entries, video_id, title, description)
+ playlist.update({
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'duration': duration,
+ })
+ return playlist
formats = []
- f4m_url = re.search(r'data-media="([^"]+)"', page)
+ f4m_url = re.search(r'data-media="([^"]+)"', webpage)
if f4m_url:
- formats.append({
- 'url': f4m_url.group(1) + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124',
- 'format_id': 'f4m',
- 'ext': 'flv',
- })
+ formats.extend(self._extract_f4m(f4m_url.group(1), video_id))
- m3u8_url = re.search(r'data-hls-media="([^"]+)"', page)
+ m3u8_url = re.search(r'data-hls-media="([^"]+)"', webpage)
if m3u8_url:
- formats.append({
- 'url': m3u8_url.group(1),
- 'format_id': 'm3u8',
- })
+ formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4'))
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/ntv.py b/youtube_dl/extractor/ntv.py
index 13c8d79cd..ee740cd9c 100644
--- a/youtube_dl/extractor/ntv.py
+++ b/youtube_dl/extractor/ntv.py
@@ -130,7 +130,7 @@ class NTVIE(InfoExtractor):
'rtmp_conn': 'B:1',
'player_url': 'http://www.ntv.ru/swf/vps1.swf?update=20131128',
'page_url': 'http://www.ntv.ru',
- 'flash_ver': 'LNX 11,2,202,341',
+ 'flash_version': 'LNX 11,2,202,341',
'rtmp_live': True,
'ext': 'flv',
'filesize': int(size.text),
diff --git a/youtube_dl/extractor/nuvid.py b/youtube_dl/extractor/nuvid.py
index 449c8a6a3..57928f2ae 100644
--- a/youtube_dl/extractor/nuvid.py
+++ b/youtube_dl/extractor/nuvid.py
@@ -3,15 +3,17 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_request,
+)
from ..utils import (
parse_duration,
unified_strdate,
- compat_urllib_request,
)
class NuvidIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:www|m)\.nuvid\.com/video/(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www|m)\.nuvid\.com/video/(?P<id>[0-9]+)'
_TEST = {
'url': 'http://m.nuvid.com/video/1310741/',
'md5': 'eab207b7ac4fccfb4e23c86201f11277',
@@ -26,8 +28,7 @@ class NuvidIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
formats = []
diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py
index f17a52858..d5b05c18f 100644
--- a/youtube_dl/extractor/ooyala.py
+++ b/youtube_dl/extractor/ooyala.py
@@ -16,7 +16,6 @@ class OoyalaIE(InfoExtractor):
{
# From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video
'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
- 'md5': '3f5cceb3a7bf461d6c29dc466cf8033c',
'info_dict': {
'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
'ext': 'mp4',
@@ -26,7 +25,6 @@ class OoyalaIE(InfoExtractor):
}, {
# Only available for ipad
'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
- 'md5': '4b9754921fddb68106e48c142e2a01e6',
'info_dict': {
'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
'ext': 'mp4',
diff --git a/youtube_dl/extractor/openfilm.py b/youtube_dl/extractor/openfilm.py
new file mode 100644
index 000000000..2249657eb
--- /dev/null
+++ b/youtube_dl/extractor/openfilm.py
@@ -0,0 +1,70 @@
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_iso8601,
+ compat_urllib_parse,
+ parse_age_limit,
+ int_or_none,
+)
+
+
+class OpenFilmIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)openfilm\.com/videos/(?P<id>.+)'
+ _TEST = {
+ 'url': 'http://www.openfilm.com/videos/human-resources-remastered',
+ 'md5': '42bcd88c2f3ec13b65edf0f8ad1cac37',
+ 'info_dict': {
+ 'id': '32736',
+ 'display_id': 'human-resources-remastered',
+ 'ext': 'mp4',
+ 'title': 'Human Resources (Remastered)',
+ 'description': 'Social Engineering in the 20th Century.',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 7164,
+ 'timestamp': 1334756988,
+ 'upload_date': '20120418',
+ 'uploader_id': '41117',
+ 'view_count': int,
+ 'age_limit': 0,
+ },
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ player = compat_urllib_parse.unquote_plus(
+ self._og_search_video_url(webpage))
+
+ video = json.loads(self._search_regex(
+ r'\bp=({.+?})(?:&|$)', player, 'video JSON'))
+
+ video_url = '%s1.mp4' % video['location']
+ video_id = video.get('video_id')
+ display_id = video.get('alias') or display_id
+ title = video.get('title')
+ description = video.get('description')
+ thumbnail = video.get('main_thumb')
+ duration = int_or_none(video.get('duration'))
+ timestamp = parse_iso8601(video.get('dt_published'), ' ')
+ uploader_id = video.get('user_id')
+ view_count = int_or_none(video.get('views_count'))
+ age_limit = parse_age_limit(video.get('age_limit'))
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'url': video_url,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'uploader_id': uploader_id,
+ 'view_count': view_count,
+ 'age_limit': age_limit,
+ }
diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py
index 572a234ad..4fed83bd6 100644
--- a/youtube_dl/extractor/orf.py
+++ b/youtube_dl/extractor/orf.py
@@ -17,24 +17,39 @@ from ..utils import (
class ORFTVthekIE(InfoExtractor):
IE_NAME = 'orf:tvthek'
IE_DESC = 'ORF TVthek'
- _VALID_URL = r'https?://tvthek\.orf\.at/(?:programs/.+?/episodes|topics/.+?|program/[^/]+)/(?P<id>\d+)'
-
- _TEST = {
- 'url': 'http://tvthek.orf.at/program/matinee-Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7317210/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319746/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319747',
- 'file': '7319747.mp4',
- 'md5': 'bd803c5d8c32d3c64a0ea4b4eeddf375',
- 'info_dict': {
- 'title': 'Was Sie schon immer über Klassik wissen wollten',
- 'description': 'md5:0ddf0d5f0060bd53f744edaa5c2e04a4',
- 'duration': 3508,
- 'upload_date': '20140105',
- },
- 'skip': 'Blocked outside of Austria',
- }
+ _VALID_URL = r'https?://tvthek\.orf\.at/(?:programs/.+?/episodes|topics?/.+?|program/[^/]+)/(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'http://tvthek.orf.at/program/Aufgetischt/2745173/Aufgetischt-Mit-der-Steirischen-Tafelrunde/8891389',
+ 'playlist': [{
+ 'md5': '2942210346ed779588f428a92db88712',
+ 'info_dict': {
+ 'id': '8896777',
+ 'ext': 'mp4',
+ 'title': 'Aufgetischt: Mit der Steirischen Tafelrunde',
+ 'description': 'md5:c1272f0245537812d4e36419c207b67d',
+ 'duration': 2668,
+ 'upload_date': '20141208',
+ },
+ }],
+ 'skip': 'Blocked outside of Austria / Germany',
+ }, {
+ 'url': 'http://tvthek.orf.at/topic/Im-Wandel-der-Zeit/8002126/Best-of-Ingrid-Thurnher/7982256',
+ 'playlist': [{
+ 'md5': '68f543909aea49d621dfc7703a11cfaf',
+ 'info_dict': {
+ 'id': '7982259',
+ 'ext': 'mp4',
+ 'title': 'Best of Ingrid Thurnher',
+ 'upload_date': '20140527',
+ 'description': 'Viele Jahre war Ingrid Thurnher das "Gesicht" der ZIB 2. Vor ihrem Wechsel zur ZIB 2 im jahr 1995 moderierte sie unter anderem "Land und Leute", "Österreich-Bild" und "Niederösterreich heute".',
+ }
+ }],
+ '_skip': 'Blocked outside of Austria / Germany',
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- playlist_id = mobj.group('id')
+ playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
data_json = self._search_regex(
@@ -43,7 +58,9 @@ class ORFTVthekIE(InfoExtractor):
def get_segments(all_data):
for data in all_data:
- if data['name'] == 'Tracker::EPISODE_DETAIL_PAGE_OVER_PROGRAM':
+ if data['name'] in (
+ 'Tracker::EPISODE_DETAIL_PAGE_OVER_PROGRAM',
+ 'Tracker::EPISODE_DETAIL_PAGE_OVER_TOPIC'):
return data['values']['segments']
sdata = get_segments(all_data)
@@ -120,9 +137,7 @@ class ORFOE1IE(InfoExtractor):
_VALID_URL = r'http://oe1\.orf\.at/programm/(?P<id>[0-9]+)'
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- show_id = mobj.group('id')
-
+ show_id = self._match_id(url)
data = self._download_json(
'http://oe1.orf.at/programm/%s/konsole' % show_id,
show_id
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index 6118ed5c2..afce732e1 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -4,6 +4,7 @@ import re
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
unified_strdate,
US_RATINGS,
)
@@ -151,6 +152,19 @@ class PBSIE(InfoExtractor):
info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id
info = self._download_json(info_url, display_id)
+ redirect_url = info['alternate_encoding']['url']
+ redirect_info = self._download_json(
+ redirect_url + '?format=json', display_id,
+ 'Downloading video url info')
+ if redirect_info['status'] == 'error':
+ if redirect_info['http_code'] == 403:
+ message = (
+ 'The video is not available in your region due to '
+ 'right restrictions')
+ else:
+ message = redirect_info['message']
+ raise ExtractorError(message, expected=True)
+
rating_str = info.get('rating')
if rating_str is not None:
rating_str = rating_str.rpartition('-')[2]
@@ -160,7 +174,7 @@ class PBSIE(InfoExtractor):
'id': video_id,
'display_id': display_id,
'title': info['title'],
- 'url': info['alternate_encoding']['url'],
+ 'url': redirect_info['url'],
'ext': 'mp4',
'description': info['program'].get('description'),
'thumbnail': info.get('image_url'),
diff --git a/youtube_dl/extractor/photobucket.py b/youtube_dl/extractor/photobucket.py
index b4389e0b6..c66db3cdc 100644
--- a/youtube_dl/extractor/photobucket.py
+++ b/youtube_dl/extractor/photobucket.py
@@ -4,16 +4,17 @@ import json
import re
from .common import InfoExtractor
-from ..utils import compat_urllib_parse
+from ..compat import compat_urllib_parse
class PhotobucketIE(InfoExtractor):
_VALID_URL = r'http://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
_TEST = {
'url': 'http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0',
- 'file': 'zpsc0c3b9fa.mp4',
'md5': '7dabfb92b0a31f6c16cebc0f8e60ff99',
'info_dict': {
+ 'id': 'zpsc0c3b9fa',
+ 'ext': 'mp4',
'timestamp': 1367669341,
'upload_date': '20130504',
'uploader': 'rachaneronas',
diff --git a/youtube_dl/extractor/played.py b/youtube_dl/extractor/played.py
index 17880471d..45716c75d 100644
--- a/youtube_dl/extractor/played.py
+++ b/youtube_dl/extractor/played.py
@@ -5,11 +5,13 @@ import re
import os.path
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
)
+from ..utils import (
+ ExtractorError,
+)
class PlayedIE(InfoExtractor):
@@ -24,11 +26,11 @@ class PlayedIE(InfoExtractor):
'ext': 'flv',
'title': 'youtube-dl_test_video.mp4',
},
+ 'skip': 'Removed for copyright infringement.', # oh wow
}
def _real_extract(self, url):
video_id = self._match_id(url)
-
orig_webpage = self._download_webpage(url, video_id)
m_error = re.search(
diff --git a/youtube_dl/extractor/playfm.py b/youtube_dl/extractor/playfm.py
index ebc046804..9576aed0e 100644
--- a/youtube_dl/extractor/playfm.py
+++ b/youtube_dl/extractor/playfm.py
@@ -4,9 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
+)
+from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
diff --git a/youtube_dl/extractor/playvid.py b/youtube_dl/extractor/playvid.py
index cd3905acb..c3e667e9e 100644
--- a/youtube_dl/extractor/playvid.py
+++ b/youtube_dl/extractor/playvid.py
@@ -3,31 +3,31 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+)
from ..utils import (
- ExtractorError,
clean_html,
- compat_urllib_parse,
+ ExtractorError,
)
class PlayvidIE(InfoExtractor):
- _VALID_URL = r'^https?://www\.playvid\.com/watch(\?v=|/)(?P<id>.+?)(?:#|$)'
+ _VALID_URL = r'https?://www\.playvid\.com/watch(\?v=|/)(?P<id>.+?)(?:#|$)'
_TEST = {
- 'url': 'http://www.playvid.com/watch/agbDDi7WZTV',
- 'md5': '44930f8afa616efdf9482daf4fe53e1e',
+ 'url': 'http://www.playvid.com/watch/RnmBNgtrrJu',
+ 'md5': 'ffa2f6b2119af359f544388d8c01eb6c',
'info_dict': {
- 'id': 'agbDDi7WZTV',
+ 'id': 'RnmBNgtrrJu',
'ext': 'mp4',
- 'title': 'Michelle Lewin in Miami Beach',
- 'duration': 240,
+ 'title': 'md5:9256d01c6317e3f703848b5906880dc8',
+ 'duration': 82,
'age_limit': 18,
}
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
m_error = re.search(
diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py
index bac484c67..954dfccb7 100644
--- a/youtube_dl/extractor/pornhd.py
+++ b/youtube_dl/extractor/pornhd.py
@@ -8,7 +8,6 @@ from ..utils import (
int_or_none,
js_to_json,
qualities,
- determine_ext,
)
@@ -45,13 +44,18 @@ class PornHdIE(InfoExtractor):
thumbnail = self._search_regex(
r"'poster'\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False)
- quality = qualities(['SD', 'HD'])
- formats = [{
- 'url': source['file'],
- 'format_id': '%s-%s' % (source['label'], determine_ext(source['file'])),
- 'quality': quality(source['label']),
- } for source in json.loads(js_to_json(self._search_regex(
- r"(?s)'sources'\s*:\s*(\[.+?\])", webpage, 'sources')))]
+ quality = qualities(['sd', 'hd'])
+ sources = json.loads(js_to_json(self._search_regex(
+ r"(?s)'sources'\s*:\s*(\{.+?\})\s*\}\);", webpage, 'sources')))
+ formats = []
+ for container, s in sources.items():
+ for qname, video_url in s.items():
+ formats.append({
+ 'url': video_url,
+ 'container': container,
+ 'format_id': '%s-%s' % (container, qname),
+ 'quality': quality(qname),
+ })
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index 2ca15b717..634142d0d 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -4,10 +4,12 @@ import os
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
+ compat_urllib_parse,
compat_urllib_parse_urlparse,
compat_urllib_request,
- compat_urllib_parse,
+)
+from ..utils import (
str_to_int,
)
from ..aes import (
@@ -16,7 +18,7 @@ from ..aes import (
class PornHubIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:www\.)?pornhub\.com/view_video\.php\?viewkey=(?P<id>[0-9a-f]+)'
+ _VALID_URL = r'https?://(?:www\.)?pornhub\.com/view_video\.php\?viewkey=(?P<id>[0-9a-f]+)'
_TEST = {
'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
'md5': '882f488fa1f0026f023f33576004a2ed',
diff --git a/youtube_dl/extractor/pornotube.py b/youtube_dl/extractor/pornotube.py
index 5253aa3d3..34735c51e 100644
--- a/youtube_dl/extractor/pornotube.py
+++ b/youtube_dl/extractor/pornotube.py
@@ -1,56 +1,94 @@
from __future__ import unicode_literals
-import re
+import json
from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_request,
+)
from ..utils import (
- compat_urllib_parse,
-
- unified_strdate,
+ int_or_none,
)
class PornotubeIE(InfoExtractor):
- _VALID_URL = r'https?://(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
+ _VALID_URL = r'https?://(?:\w+\.)?pornotube\.com/(?:[^?#]*?)/video/(?P<id>[0-9]+)'
_TEST = {
- 'url': 'http://pornotube.com/c/173/m/1689755/Marilyn-Monroe-Bathing',
- 'md5': '374dd6dcedd24234453b295209aa69b6',
+ 'url': 'http://www.pornotube.com/orientation/straight/video/4964/title/weird-hot-and-wet-science',
+ 'md5': '60fc5a4f0d93a97968fc7999d98260c9',
'info_dict': {
- 'id': '1689755',
- 'ext': 'flv',
- 'upload_date': '20090708',
- 'title': 'Marilyn-Monroe-Bathing',
- 'age_limit': 18
+ 'id': '4964',
+ 'ext': 'mp4',
+ 'upload_date': '20141203',
+ 'title': 'Weird Hot and Wet Science',
+ 'description': 'md5:a8304bef7ef06cb4ab476ca6029b01b0',
+ 'categories': ['Adult Humor', 'Blondes'],
+ 'uploader': 'Alpha Blue Archives',
+ 'thumbnail': 're:^https?://.*\\.jpg$',
+ 'timestamp': 1417582800,
+ 'age_limit': 18,
}
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ video_id = self._match_id(url)
- video_id = mobj.group('videoid')
- video_title = mobj.group('title')
+ # Fetch origin token
+ js_config = self._download_webpage(
+ 'http://www.pornotube.com/assets/src/app/config.js', video_id,
+ note='Download JS config')
+ originAuthenticationSpaceKey = self._search_regex(
+ r"constant\('originAuthenticationSpaceKey',\s*'([^']+)'",
+ js_config, 'originAuthenticationSpaceKey')
+
+ # Fetch actual token
+ token_req_data = {
+ 'authenticationSpaceKey': originAuthenticationSpaceKey,
+ 'credentials': 'Clip Application',
+ }
+ token_req = compat_urllib_request.Request(
+ 'https://api.aebn.net/auth/v1/token/primal',
+ data=json.dumps(token_req_data).encode('utf-8'))
+ token_req.add_header('Content-Type', 'application/json')
+ token_req.add_header('Origin', 'http://www.pornotube.com')
+ token_answer = self._download_json(
+ token_req, video_id, note='Requesting primal token')
+ token = token_answer['tokenKey']
- # Get webpage content
- webpage = self._download_webpage(url, video_id)
+ # Get video URL
+ delivery_req = compat_urllib_request.Request(
+ 'https://api.aebn.net/delivery/v1/clips/%s/MP4' % video_id)
+ delivery_req.add_header('Authorization', token)
+ delivery_info = self._download_json(
+ delivery_req, video_id, note='Downloading delivery information')
+ video_url = delivery_info['mediaUrl']
- # Get the video URL
- VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
- video_url = self._search_regex(VIDEO_URL_RE, webpage, 'video url')
- video_url = compat_urllib_parse.unquote(video_url)
+ # Get additional info (title etc.)
+ info_req = compat_urllib_request.Request(
+ 'https://api.aebn.net/content/v1/clips/%s?expand='
+ 'title,description,primaryImageNumber,startSecond,endSecond,'
+ 'movie.title,movie.MovieId,movie.boxCoverFront,movie.stars,'
+ 'movie.studios,stars.name,studios.name,categories.name,'
+ 'clipActive,movieActive,publishDate,orientations' % video_id)
+ info_req.add_header('Authorization', token)
+ info = self._download_json(
+ info_req, video_id, note='Downloading metadata')
- # Get the uploaded date
- VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
- upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, 'upload date', fatal=False)
- if upload_date:
- upload_date = unified_strdate(upload_date)
- age_limit = self._rta_search(webpage)
+ timestamp = int_or_none(info.get('publishDate'), scale=1000)
+ uploader = info.get('studios', [{}])[0].get('name')
+ movie_id = info['movie']['movieId']
+ thumbnail = 'http://pic.aebn.net/dis/t/%s/%s_%08d.jpg' % (
+ movie_id, movie_id, info['primaryImageNumber'])
+ categories = [c['name'] for c in info.get('categories')]
return {
'id': video_id,
'url': video_url,
- 'upload_date': upload_date,
- 'title': video_title,
- 'ext': 'flv',
- 'format': 'flv',
- 'age_limit': age_limit,
+ 'title': info['title'],
+ 'description': info.get('description'),
+ 'timestamp': timestamp,
+ 'uploader': uploader,
+ 'thumbnail': thumbnail,
+ 'categories': categories,
+ 'age_limit': 18,
}
diff --git a/youtube_dl/extractor/promptfile.py b/youtube_dl/extractor/promptfile.py
index 7fcde086c..f536e6e6c 100644
--- a/youtube_dl/extractor/promptfile.py
+++ b/youtube_dl/extractor/promptfile.py
@@ -4,12 +4,14 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- determine_ext,
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
)
+from ..utils import (
+ determine_ext,
+ ExtractorError,
+)
class PromptFileIE(InfoExtractor):
diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py
index 32d747ede..385681d06 100644
--- a/youtube_dl/extractor/prosiebensat1.py
+++ b/youtube_dl/extractor/prosiebensat1.py
@@ -5,8 +5,10 @@ import re
from hashlib import sha1
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
+)
+from ..utils import (
unified_strdate,
)
@@ -85,7 +87,7 @@ class ProSiebenSat1IE(InfoExtractor):
'ext': 'mp4',
'title': 'Im Interview: Kai Wiesinger',
'description': 'md5:e4e5370652ec63b95023e914190b4eb9',
- 'upload_date': '20140225',
+ 'upload_date': '20140203',
'duration': 522.56,
},
'params': {
@@ -100,7 +102,7 @@ class ProSiebenSat1IE(InfoExtractor):
'ext': 'mp4',
'title': 'Jagd auf Fertigkost im Elsthal - Teil 2',
'description': 'md5:2669cde3febe9bce13904f701e774eb6',
- 'upload_date': '20140225',
+ 'upload_date': '20141014',
'duration': 2410.44,
},
'params': {
@@ -152,12 +154,22 @@ class ProSiebenSat1IE(InfoExtractor):
'skip_download': True,
},
},
+ {
+ 'url': 'http://www.prosieben.de/tv/joko-gegen-klaas/videos/playlists/episode-8-ganze-folge-playlist',
+ 'info_dict': {
+ 'id': '439664',
+ 'title': 'Episode 8 - Ganze Folge - Playlist',
+ 'description': 'md5:63b8963e71f481782aeea877658dec84',
+ },
+ 'playlist_count': 2,
+ },
]
_CLIPID_REGEXES = [
r'"clip_id"\s*:\s+"(\d+)"',
r'clipid: "(\d+)"',
r'clip[iI]d=(\d+)',
+ r"'itemImageUrl'\s*:\s*'/dynamic/thumbnails/full/\d+/(\d+)",
]
_TITLE_REGEXES = [
r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>',
@@ -178,11 +190,19 @@ class ProSiebenSat1IE(InfoExtractor):
r'<span style="padding-left: 4px;line-height:20px; color:#404040">(\d{2}\.\d{2}\.\d{4})</span>',
r'(\d{2}\.\d{2}\.\d{4}) \| \d{2}:\d{2} Min<br/>',
]
+ _PAGE_TYPE_REGEXES = [
+ r'<meta name="page_type" content="([^"]+)">',
+ r"'itemType'\s*:\s*'([^']*)'",
+ ]
+ _PLAYLIST_ID_REGEXES = [
+ r'content[iI]d=(\d+)',
+ r"'itemId'\s*:\s*'([^']*)'",
+ ]
+ _PLAYLIST_CLIP_REGEXES = [
+ r'(?s)data-qvt=.+?<a href="([^"]+)"',
+ ]
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
+ def _extract_clip(self, url, webpage):
clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id')
access_token = 'testclient'
@@ -281,3 +301,31 @@ class ProSiebenSat1IE(InfoExtractor):
'duration': duration,
'formats': formats,
}
+
+ def _extract_playlist(self, url, webpage):
+ playlist_id = self._html_search_regex(
+ self._PLAYLIST_ID_REGEXES, webpage, 'playlist id')
+ for regex in self._PLAYLIST_CLIP_REGEXES:
+ playlist_clips = re.findall(regex, webpage)
+ if playlist_clips:
+ title = self._html_search_regex(
+ self._TITLE_REGEXES, webpage, 'title')
+ description = self._html_search_regex(
+ self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False)
+ entries = [
+ self.url_result(
+ re.match('(.+?//.+?)/', url).group(1) + clip_path,
+ 'ProSiebenSat1')
+ for clip_path in playlist_clips]
+ return self.playlist_result(entries, playlist_id, title, description)
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ page_type = self._search_regex(
+ self._PAGE_TYPE_REGEXES, webpage,
+ 'page type', default='clip').lower()
+ if page_type == 'clip':
+ return self._extract_clip(url, webpage)
+ elif page_type == 'playlist':
+ return self._extract_playlist(url, webpage)
diff --git a/youtube_dl/extractor/quickvid.py b/youtube_dl/extractor/quickvid.py
index 3bc78060d..af7d76cf4 100644
--- a/youtube_dl/extractor/quickvid.py
+++ b/youtube_dl/extractor/quickvid.py
@@ -3,8 +3,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urlparse,
+)
+from ..utils import (
determine_ext,
int_or_none,
)
diff --git a/youtube_dl/extractor/radiobremen.py b/youtube_dl/extractor/radiobremen.py
new file mode 100644
index 000000000..0d706312e
--- /dev/null
+++ b/youtube_dl/extractor/radiobremen.py
@@ -0,0 +1,63 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import parse_duration
+
+
+class RadioBremenIE(InfoExtractor):
+ _VALID_URL = r'http?://(?:www\.)?radiobremen\.de/mediathek/(?:index\.html)?\?id=(?P<id>[0-9]+)'
+ IE_NAME = 'radiobremen'
+
+ _TEST = {
+ 'url': 'http://www.radiobremen.de/mediathek/index.html?id=114720',
+ 'info_dict': {
+ 'id': '114720',
+ 'ext': 'mp4',
+ 'duration': 1685,
+ 'width': 512,
+ 'title': 'buten un binnen vom 22. Dezember',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'description': 'Unter anderem mit diesen Themen: 45 Flüchtlinge sind in Worpswede angekommen +++ Freies Internet für alle: Bremer arbeiten an einem flächendeckenden W-Lan-Netzwerk +++ Aktivisten kämpfen für das Unibad +++ So war das Wetter 2014 +++',
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ meta_url = "http://www.radiobremen.de/apps/php/mediathek/metadaten.php?id=%s" % video_id
+ meta_doc = self._download_webpage(
+ meta_url, video_id, 'Downloading metadata')
+ title = self._html_search_regex(
+ r"<h1.*>(?P<title>.+)</h1>", meta_doc, "title")
+ description = self._html_search_regex(
+ r"<p>(?P<description>.*)</p>", meta_doc, "description", fatal=False)
+ duration = parse_duration(self._html_search_regex(
+ r"L&auml;nge:</td>\s+<td>(?P<duration>[0-9]+:[0-9]+)</td>",
+ meta_doc, "duration", fatal=False))
+
+ page_doc = self._download_webpage(
+ url, video_id, 'Downloading video information')
+ mobj = re.search(
+ r"ardformatplayerclassic\(\'playerbereich\',\'(?P<width>[0-9]+)\',\'.*\',\'(?P<video_id>[0-9]+)\',\'(?P<secret>[0-9]+)\',\'(?P<thumbnail>.+)\',\'\'\)",
+ page_doc)
+ video_url = (
+ "http://dl-ondemand.radiobremen.de/mediabase/%s/%s_%s_%s.mp4" %
+ (video_id, video_id, mobj.group("secret"), mobj.group('width')))
+
+ formats = [{
+ 'url': video_url,
+ 'ext': 'mp4',
+ 'width': int(mobj.group("width")),
+ }]
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'formats': formats,
+ 'thumbnail': mobj.group('thumbnail'),
+ }
diff --git a/youtube_dl/extractor/radiode.py b/youtube_dl/extractor/radiode.py
new file mode 100644
index 000000000..f95bc9454
--- /dev/null
+++ b/youtube_dl/extractor/radiode.py
@@ -0,0 +1,55 @@
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+
+
+class RadioDeIE(InfoExtractor):
+ IE_NAME = 'radio.de'
+ _VALID_URL = r'https?://(?P<id>.+?)\.(?:radio\.(?:de|at|fr|pt|es|pl|it)|rad\.io)'
+ _TEST = {
+ 'url': 'http://ndr2.radio.de/',
+ 'md5': '3b4cdd011bc59174596b6145cda474a4',
+ 'info_dict': {
+ 'id': 'ndr2',
+ 'ext': 'mp3',
+ 'title': 're:^NDR 2 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'description': 'md5:591c49c702db1a33751625ebfb67f273',
+ 'thumbnail': 're:^https?://.*\.png',
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }
+
+ def _real_extract(self, url):
+ radio_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, radio_id)
+
+ broadcast = json.loads(self._search_regex(
+ r'_getBroadcast\s*=\s*function\(\s*\)\s*{\s*return\s+({.+?})\s*;\s*}',
+ webpage, 'broadcast'))
+
+ title = self._live_title(broadcast['name'])
+ description = broadcast.get('description') or broadcast.get('shortDescription')
+ thumbnail = broadcast.get('picture4Url') or broadcast.get('picture4TransUrl')
+
+ formats = [{
+ 'url': stream['streamUrl'],
+ 'ext': stream['streamContentFormat'].lower(),
+ 'acodec': stream['streamContentFormat'],
+ 'abr': stream['bitRate'],
+ 'asr': stream['sampleRate']
+ } for stream in broadcast['streamUrls']]
+ self._sort_formats(formats)
+
+ return {
+ 'id': radio_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'is_live': True,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py
index 2d39ecfe4..aa26b7e0b 100644
--- a/youtube_dl/extractor/rai.py
+++ b/youtube_dl/extractor/rai.py
@@ -3,10 +3,12 @@ from __future__ import unicode_literals
import re
from .subtitles import SubtitlesInfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+)
from ..utils import (
parse_duration,
unified_strdate,
- compat_urllib_parse,
)
diff --git a/youtube_dl/extractor/restudy.py b/youtube_dl/extractor/restudy.py
new file mode 100644
index 000000000..b17c2bfc0
--- /dev/null
+++ b/youtube_dl/extractor/restudy.py
@@ -0,0 +1,40 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class RestudyIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?restudy\.dk/video/play/id/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'https://www.restudy.dk/video/play/id/1637',
+        'info_dict': {
+            'id': '1637',
+            'ext': 'flv',
+            'title': 'Leiden-frosteffekt',
+            'description': 'Denne video er et eksperiment med flydende kvælstof.',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }
+
+    def _real_extract(self, url):
+        """Collect title, description and SMIL formats for one video page."""
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._og_search_title(webpage).strip()
+        # Guard: the og:description helper is presumably non-fatal and may return None
+        description = self._og_search_description(webpage)
+        if description:
+            description = description.strip()
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'formats': self._extract_smil_formats(
+                'https://www.restudy.dk/awsmedia/SmilDirectory/video_%s.xml' % video_id, video_id),
+        }
diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py
index d029b0ec5..a3ca79f2c 100644
--- a/youtube_dl/extractor/rtlnl.py
+++ b/youtube_dl/extractor/rtlnl.py
@@ -8,7 +8,7 @@ from ..utils import parse_duration
class RtlXlIE(InfoExtractor):
IE_NAME = 'rtlxl.nl'
- _VALID_URL = r'https?://www\.rtlxl\.nl/#!/[^/]+/(?P<uuid>[^/?]+)'
+ _VALID_URL = r'https?://(www\.)?rtlxl\.nl/#!/[^/]+/(?P<uuid>[^/?]+)'
_TEST = {
'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677',
diff --git a/youtube_dl/extractor/rtp.py b/youtube_dl/extractor/rtp.py
new file mode 100644
index 000000000..7736cabba
--- /dev/null
+++ b/youtube_dl/extractor/rtp.py
@@ -0,0 +1,60 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..utils import js_to_json
+
+
+class RTPIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)/?'
+    _TESTS = [{
+        'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas',
+        'info_dict': {
+            'id': 'e174042',
+            'ext': 'mp3',
+            'title': 'Paixões Cruzadas',
+            'description': 'As paixões musicais de António Cartaxo e António Macedo',
+            'thumbnail': r're:^https?://.*\.jpg',
+        },
+        'params': {
+            'skip_download': True,  # RTMP download
+        },
+    }, {
+        'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Build a single RTMP format from the page's RTPPLAY player config."""
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        title = self._html_search_meta(
+            'twitter:title', webpage, display_name='title', fatal=True)
+        description = self._html_search_meta('description', webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        player_config = self._search_regex(
+            r'(?s)RTPPLAY\.player\.newPlayer\(\s*(\{.*?\})\s*\)', webpage, 'player config')
+        config = json.loads(js_to_json(player_config))
+        # Split 'file' into path and extension; the play path is '<ext>:<path>'
+        path, ext = config.get('file').rsplit('.', 1)
+        formats = [{
+            'app': config.get('application'),
+            'play_path': '{ext:s}:{path:s}'.format(ext=ext, path=path),
+            'page_url': url,
+            'url': 'rtmp://{streamer:s}/{application:s}'.format(**config),
+            'rtmp_live': config.get('live', False),
+            'ext': ext,
+            'vcodec': 'none' if config.get('type') == 'audio' else None,
+            'player_url': 'http://programas.rtp.pt/play/player.swf?v3',
+        }]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'description': description,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py
index dc59a5e5c..5e84c1098 100644
--- a/youtube_dl/extractor/rts.py
+++ b/youtube_dl/extractor/rts.py
@@ -4,12 +4,14 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_str,
+)
from ..utils import (
int_or_none,
parse_duration,
parse_iso8601,
unescapeHTML,
- compat_str,
)
diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py
index 6941d96fb..5b1c3577a 100644
--- a/youtube_dl/extractor/rutube.py
+++ b/youtube_dl/extractor/rutube.py
@@ -5,10 +5,12 @@ import re
import itertools
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_str,
- unified_strdate,
+)
+from ..utils import (
ExtractorError,
+ unified_strdate,
)
@@ -36,9 +38,7 @@ class RutubeIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
video = self._download_json(
'http://rutube.ru/api/video/%s/?format=json' % video_id,
video_id, 'Downloading video JSON')
@@ -70,6 +70,37 @@ class RutubeIE(InfoExtractor):
}
+class RutubeEmbedIE(InfoExtractor):
+    IE_NAME = 'rutube:embed'
+    IE_DESC = 'Rutube embedded videos'
+    _VALID_URL = r'https?://rutube\.ru/video/embed/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=',
+        'info_dict': {
+            'id': 'a10e53b86e8f349080f718582ce4c661',
+            'ext': 'mp4',
+            'upload_date': '20131223',
+            'uploader_id': '297833',
+            'description': 'Видео группы ★http://vk.com/foxkidsreset★ музей Fox Kids и Jetix<br/><br/> восстановлено и сделано в шикоформате subziro89 http://vk.com/subziro89',
+            'uploader': 'subziro89 ILya',
+            'title': 'Мистический городок Эйри в Индиан 5 серия озвучка subziro89',
+        },
+        'params': {
+            'skip_download': 'Requires ffmpeg',
+        },
+    }
+
+    def _real_extract(self, url):
+        """Resolve an embed URL to the canonical URL named in the embed page."""
+        embed_id = self._match_id(url)
+        webpage = self._download_webpage(url, embed_id)
+
+        canonical_url = self._html_search_regex(
+            r'<link\s+rel="canonical"\s+href="([^"]+?)"', webpage,
+            'Canonical URL')
+        return self.url_result(canonical_url, 'Rutube')
+
+
class RutubeChannelIE(InfoExtractor):
IE_NAME = 'rutube:channel'
IE_DESC = 'Rutube channels'
@@ -114,8 +145,7 @@ class RutubeMovieIE(RutubeChannelIE):
_PAGE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/video?page=%s&format=json'
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- movie_id = mobj.group('id')
+ movie_id = self._match_id(url)
movie = self._download_json(
self._MOVIE_TEMPLATE % movie_id, movie_id,
'Downloading movie JSON')
diff --git a/youtube_dl/extractor/screencast.py b/youtube_dl/extractor/screencast.py
index c145f6fc7..dfd897ba3 100644
--- a/youtube_dl/extractor/screencast.py
+++ b/youtube_dl/extractor/screencast.py
@@ -1,14 +1,14 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
+from ..compat import (
compat_parse_qs,
compat_urllib_request,
)
+from ..utils import (
+ ExtractorError,
+)
class ScreencastIE(InfoExtractor):
@@ -57,8 +57,7 @@ class ScreencastIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_url = self._html_search_regex(
diff --git a/youtube_dl/extractor/screencastomatic.py b/youtube_dl/extractor/screencastomatic.py
new file mode 100644
index 000000000..05337421c
--- /dev/null
+++ b/youtube_dl/extractor/screencastomatic.py
@@ -0,0 +1,49 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ ExtractorError,
+ js_to_json,
+)
+
+
+class ScreencastOMaticIE(InfoExtractor):
+    _VALID_URL = r'https?://screencast-o-matic\.com/watch/(?P<id>[0-9a-zA-Z]+)'
+    _TEST = {
+        'url': 'http://screencast-o-matic.com/watch/c2lD3BeOPl',
+        'md5': '483583cb80d92588f15ccbedd90f0c18',
+        'info_dict': {
+            'id': 'c2lD3BeOPl',
+            'ext': 'mp4',
+            'title': 'Welcome to 3-4 Philosophy @ DECV!',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': 'as the title says! also: some general info re 1) VCE philosophy and 2) distance learning.',
+        }
+    }
+
+    def _real_extract(self, url):
+        """Read the jwplayer setup object and return the html5 video entry."""
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        setup_js = self._search_regex(
+            r"(?s)jwplayer\('mp4Player'\)\.setup\((\{.*?\})\);",
+            webpage, 'setup code')
+        data = self._parse_json(setup_js, video_id, transform_source=js_to_json)
+        # Use the first player mode whose type is 'html5'
+        video_data = next(
+            (m for m in data['modes'] if m.get('type') == 'html5'), None)
+        if video_data is None:
+            raise ExtractorError('Could not find any video entries!')
+        video_url = compat_urlparse.urljoin(url, video_data['config']['file'])
+
+        return {
+            'id': video_id,
+            'title': self._og_search_title(webpage),
+            'description': self._og_search_description(webpage),
+            'url': video_url,
+            'ext': 'mp4',
+            'thumbnail': data.get('image'),
+        }
diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/screenwavemedia.py
index b7fa73c3b..6c9fdb7c1 100644
--- a/youtube_dl/extractor/cinemassacre.py
+++ b/youtube_dl/extractor/screenwavemedia.py
@@ -5,61 +5,27 @@ import re
from .common import InfoExtractor
from ..utils import (
- ExtractorError,
int_or_none,
+ unified_strdate,
)
-class CinemassacreIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/(?P<display_id>[^?#/]+)'
- _TESTS = [
- {
- 'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
- 'md5': 'fde81fbafaee331785f58cd6c0d46190',
- 'info_dict': {
- 'id': '19911',
- 'ext': 'mp4',
- 'upload_date': '20121110',
- 'title': '“Angry Video Game Nerd: The Movie” – Trailer',
- 'description': 'md5:fb87405fcb42a331742a0dce2708560b',
- },
- },
- {
- 'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
- 'md5': 'd72f10cd39eac4215048f62ab477a511',
- 'info_dict': {
- 'id': '521be8ef82b16',
- 'ext': 'mp4',
- 'upload_date': '20131002',
- 'title': 'The Mummy’s Hand (1940)',
- },
- }
- ]
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('display_id')
-
- webpage = self._download_webpage(url, display_id)
- video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d')
- mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?P<full_video_id>(?:Cinemassacre-)?(?P<video_id>.+?)))"', webpage)
- if not mobj:
- raise ExtractorError('Can\'t extract embed url and video id')
- playerdata_url = mobj.group('embed_url')
- video_id = mobj.group('video_id')
- full_video_id = mobj.group('full_video_id')
+class ScreenwaveMediaIE(InfoExtractor):
+ _VALID_URL = r'http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?P<id>.+)'
- video_title = self._html_search_regex(
- r'<title>(?P<title>.+?)\|', webpage, 'title')
- video_description = self._html_search_regex(
- r'<div class="entry-content">(?P<description>.+?)</div>',
- webpage, 'description', flags=re.DOTALL, fatal=False)
- video_thumbnail = self._og_search_thumbnail(webpage)
+ _TESTS = [{
+ 'url': 'http://player.screenwavemedia.com/play/play.php?playerdiv=videoarea&companiondiv=squareAd&id=Cinemassacre-19911',
+ 'only_matching': True,
+ }]
- playerdata = self._download_webpage(playerdata_url, video_id, 'Downloading player webpage')
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ playerdata = self._download_webpage(url, video_id, 'Downloading player webpage')
+ vidtitle = self._search_regex(
+ r'\'vidtitle\'\s*:\s*"([^"]+)"', playerdata, 'vidtitle').replace('\\/', '/')
vidurl = self._search_regex(
- r'\'vidurl\'\s*:\s*"([^\']+)"', playerdata, 'vidurl').replace('\\/', '/')
+ r'\'vidurl\'\s*:\s*"([^"]+)"', playerdata, 'vidurl').replace('\\/', '/')
videolist_url = None
@@ -67,7 +33,7 @@ class CinemassacreIE(InfoExtractor):
if mobj:
videoserver = mobj.group('videoserver')
mobj = re.search(r'\'vidid\'\s*:\s*"(?P<vidid>[^\']+)"', playerdata)
- vidid = mobj.group('vidid') if mobj else full_video_id
+ vidid = mobj.group('vidid') if mobj else video_id
videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid)
else:
mobj = re.search(r"file\s*:\s*'(?P<smil>http.+?/jwplayer\.smil)'", playerdata)
@@ -85,34 +51,128 @@ class CinemassacreIE(InfoExtractor):
file_ = src.partition(':')[-1]
width = int_or_none(video.get('width'))
height = int_or_none(video.get('height'))
- bitrate = int_or_none(video.get('system-bitrate'))
+ bitrate = int_or_none(video.get('system-bitrate'), scale=1000)
format = {
'url': baseurl + file_,
'format_id': src.rpartition('.')[0].rpartition('_')[-1],
}
if width or height:
format.update({
- 'tbr': bitrate // 1000 if bitrate else None,
+ 'tbr': bitrate,
'width': width,
'height': height,
})
else:
format.update({
- 'abr': bitrate // 1000 if bitrate else None,
+ 'abr': bitrate,
'vcodec': 'none',
})
formats.append(format)
- self._sort_formats(formats)
else:
formats = [{
'url': vidurl,
}]
+ self._sort_formats(formats)
return {
'id': video_id,
- 'title': video_title,
+ 'title': vidtitle,
'formats': formats,
+ }
+
+
+class CinemassacreIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?cinemassacre\.com/(?P<date_y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/(?P<display_id>[^?#/]+)'
+    _TESTS = [
+        {
+            'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
+            'md5': 'fde81fbafaee331785f58cd6c0d46190',
+            'info_dict': {
+                'id': 'Cinemassacre-19911',
+                'ext': 'mp4',
+                'upload_date': '20121110',
+                'title': '“Angry Video Game Nerd: The Movie” – Trailer',
+                'description': 'md5:fb87405fcb42a331742a0dce2708560b',
+            },
+        },
+        {
+            'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
+            'md5': 'd72f10cd39eac4215048f62ab477a511',
+            'info_dict': {
+                'id': 'Cinemassacre-521be8ef82b16',
+                'ext': 'mp4',
+                'upload_date': '20131002',
+                'title': 'The Mummy’s Hand (1940)',
+            },
+        }
+    ]
+
+    def _real_extract(self, url):
+        """Return page metadata plus the embedded player URL as url_transparent."""
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('display_id')
+        video_date = mobj.group('date_y') + mobj.group('date_m') + mobj.group('date_d')
+        webpage = self._download_webpage(url, display_id)
+
+        playerdata_url = self._search_regex(
+            r'src="(http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
+            webpage, 'player data URL')
+        video_title = self._html_search_regex(
+            r'<title>(?P<title>.+?)\|', webpage, 'title')
+        video_description = self._html_search_regex(
+            r'<div class="entry-content">(?P<description>.+?)</div>',
+            webpage, 'description', flags=re.DOTALL, fatal=False)
+        video_thumbnail = self._og_search_thumbnail(webpage)
+
+        return {
+            '_type': 'url_transparent',
+            'display_id': display_id,
+            'title': video_title,
+            'description': video_description,
+            'upload_date': video_date,
+            'thumbnail': video_thumbnail,
+            'url': playerdata_url,
+        }
+
+
+class TeamFourIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?teamfourstar\.com/video/(?P<id>[a-z0-9\-]+)/?'
+ _TEST = {
+ 'url': 'http://teamfourstar.com/video/a-moment-with-tfs-episode-4/',
+ 'info_dict': {
+ 'id': 'TeamFourStar-5292a02f20bfa',
+ 'ext': 'mp4',
+ 'upload_date': '20130401',
+ 'description': 'Check out this and more on our website: http://teamfourstar.com\nTFS Store: http://sharkrobot.com/team-four-star\nFollow on Twitter: http://twitter.com/teamfourstar\nLike on FB: http://facebook.com/teamfourstar',
+ 'title': 'A Moment With TFS Episode 4',
+ }
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ playerdata_url = self._search_regex(
+ r'src="(http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
+ webpage, 'player data URL')
+
+ video_title = self._html_search_regex(
+ r'<div class="heroheadingtitle">(?P<title>.+?)</div>',
+ webpage, 'title')
+ video_date = unified_strdate(self._html_search_regex(
+ r'<div class="heroheadingdate">(?P<date>.+?)</div>',
+ webpage, 'date', fatal=False))
+ video_description = self._html_search_regex(
+ r'(?s)<div class="postcontent">(?P<description>.+?)</div>',
+ webpage, 'description', fatal=False)
+ video_thumbnail = self._og_search_thumbnail(webpage)
+
+ return {
+ '_type': 'url_transparent',
+ 'display_id': display_id,
+ 'title': video_title,
'description': video_description,
'upload_date': video_date,
'thumbnail': video_thumbnail,
+ 'url': playerdata_url,
}
diff --git a/youtube_dl/extractor/sexykarma.py b/youtube_dl/extractor/sexykarma.py
index c833fc8ee..6446d26dc 100644
--- a/youtube_dl/extractor/sexykarma.py
+++ b/youtube_dl/extractor/sexykarma.py
@@ -24,7 +24,7 @@ class SexyKarmaIE(InfoExtractor):
'title': 'Taking a quick pee.',
'thumbnail': 're:^https?://.*\.jpg$',
'uploader': 'wildginger7',
- 'upload_date': '20141007',
+ 'upload_date': '20141008',
'duration': 22,
'view_count': int,
'comment_count': int,
@@ -45,6 +45,7 @@ class SexyKarmaIE(InfoExtractor):
'view_count': int,
'comment_count': int,
'categories': list,
+ 'age_limit': 18,
}
}, {
'url': 'http://www.watchindianporn.net/video/desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number-dW2mtctxJfs.html',
@@ -61,6 +62,7 @@ class SexyKarmaIE(InfoExtractor):
'view_count': int,
'comment_count': int,
'categories': list,
+ 'age_limit': 18,
}
}]
@@ -114,4 +116,5 @@ class SexyKarmaIE(InfoExtractor):
'view_count': view_count,
'comment_count': comment_count,
'categories': categories,
+ 'age_limit': 18,
}
diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py
index fdc31603a..26ced716e 100644
--- a/youtube_dl/extractor/shared.py
+++ b/youtube_dl/extractor/shared.py
@@ -4,10 +4,12 @@ import re
import base64
from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+ compat_urllib_request,
+)
from ..utils import (
ExtractorError,
- compat_urllib_request,
- compat_urllib_parse,
int_or_none,
)
@@ -26,26 +28,30 @@ class SharedIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- page = self._download_webpage(url, video_id)
-
- if re.search(r'>File does not exist<', page) is not None:
- raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
- download_form = dict(re.findall(r'<input type="hidden" name="([^"]+)" value="([^"]*)"', page))
+ if '>File does not exist<' in webpage:
+ raise ExtractorError(
+ 'Video %s does not exist' % video_id, expected=True)
- request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(download_form))
+ download_form = dict(re.findall(
+ r'<input type="hidden" name="([^"]+)" value="([^"]*)"', webpage))
+ request = compat_urllib_request.Request(
+ url, compat_urllib_parse.urlencode(download_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
- video_page = self._download_webpage(request, video_id, 'Downloading video page')
+ video_page = self._download_webpage(
+ request, video_id, 'Downloading video page')
- video_url = self._html_search_regex(r'data-url="([^"]+)"', video_page, 'video URL')
- title = base64.b64decode(self._html_search_meta('full:title', page, 'title')).decode('utf-8')
- filesize = int_or_none(self._html_search_meta('full:size', page, 'file size', fatal=False))
+ video_url = self._html_search_regex(
+ r'data-url="([^"]+)"', video_page, 'video URL')
+ title = base64.b64decode(self._html_search_meta(
+ 'full:title', webpage, 'title')).decode('utf-8')
+ filesize = int_or_none(self._html_search_meta(
+ 'full:size', webpage, 'file size', fatal=False))
thumbnail = self._html_search_regex(
- r'data-poster="([^"]+)"', video_page, 'thumbnail', fatal=False, default=None)
+ r'data-poster="([^"]+)"', video_page, 'thumbnail', default=None)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/sharesix.py b/youtube_dl/extractor/sharesix.py
index 7531e8325..ac3e3adf2 100644
--- a/youtube_dl/extractor/sharesix.py
+++ b/youtube_dl/extractor/sharesix.py
@@ -4,9 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
+)
+from ..utils import (
parse_duration,
)
diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py
index 5eadbb7ea..a63d126d4 100644
--- a/youtube_dl/extractor/sina.py
+++ b/youtube_dl/extractor/sina.py
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_request,
compat_urllib_parse,
)
diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py
index 5864b9936..9f79ff5c1 100644
--- a/youtube_dl/extractor/slideshare.py
+++ b/youtube_dl/extractor/slideshare.py
@@ -4,8 +4,10 @@ import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urlparse,
+)
+from ..utils import (
ExtractorError,
)
@@ -28,7 +30,7 @@ class SlideshareIE(InfoExtractor):
page_title = mobj.group('title')
webpage = self._download_webpage(url, page_title)
slideshare_obj = self._search_regex(
- r'var slideshare_object = ({.*?}); var user_info =',
+ r'var\s+slideshare_object\s*=\s*({.*?});\s*var\s+user_info\s*=',
webpage, 'slideshare object')
info = json.loads(slideshare_obj)
if info['slideshow']['type'] != 'video':
@@ -39,7 +41,7 @@ class SlideshareIE(InfoExtractor):
ext = info['jsplayer']['video_extension']
video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext)
description = self._html_search_regex(
- r'<p\s+(?:style="[^"]*"\s+)?class=".*?description.*?"[^>]*>(.*?)</p>', webpage,
+ r'(?s)<p[^>]+itemprop="description"[^>]*>(.+?)</p>', webpage,
'description', fatal=False)
return {
diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py
index 0751efc61..baef3daa0 100644
--- a/youtube_dl/extractor/smotri.py
+++ b/youtube_dl/extractor/smotri.py
@@ -7,9 +7,11 @@ import hashlib
import uuid
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
+)
+from ..utils import (
ExtractorError,
int_or_none,
unified_strdate,
@@ -67,6 +69,7 @@ class SmotriIE(InfoExtractor):
'params': {
'videopassword': 'qwerty',
},
+ 'skip': 'Video is not approved by moderator',
},
# age limit + video-password
{
@@ -84,7 +87,8 @@ class SmotriIE(InfoExtractor):
},
'params': {
'videopassword': '333'
- }
+ },
+ 'skip': 'Video is not approved by moderator',
},
# swf player
{
@@ -274,15 +278,18 @@ class SmotriBroadcastIE(InfoExtractor):
broadcast_page = self._download_webpage(broadcast_url, broadcast_id, 'Downloading broadcast page')
if re.search('>Режиссер с логином <br/>"%s"<br/> <span>не существует<' % broadcast_id, broadcast_page) is not None:
- raise ExtractorError('Broadcast %s does not exist' % broadcast_id, expected=True)
+ raise ExtractorError(
+ 'Broadcast %s does not exist' % broadcast_id, expected=True)
# Adult content
if re.search('EroConfirmText">', broadcast_page) is not None:
(username, password) = self._get_login_info()
if username is None:
- raise ExtractorError('Erotic broadcasts allowed only for registered users, '
- 'use --username and --password options to provide account credentials.', expected=True)
+ raise ExtractorError(
+ 'Erotic broadcasts allowed only for registered users, '
+ 'use --username and --password options to provide account credentials.',
+ expected=True)
login_form = {
'login-hint53': '1',
@@ -291,9 +298,11 @@ class SmotriBroadcastIE(InfoExtractor):
'password': password,
}
- request = compat_urllib_request.Request(broadcast_url + '/?no_redirect=1', compat_urllib_parse.urlencode(login_form))
+ request = compat_urllib_request.Request(
+ broadcast_url + '/?no_redirect=1', compat_urllib_parse.urlencode(login_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
- broadcast_page = self._download_webpage(request, broadcast_id, 'Logging in and confirming age')
+ broadcast_page = self._download_webpage(
+ request, broadcast_id, 'Logging in and confirming age')
if re.search('>Неверный логин или пароль<', broadcast_page) is not None:
raise ExtractorError('Unable to log in: bad username or password', expected=True)
@@ -303,7 +312,7 @@ class SmotriBroadcastIE(InfoExtractor):
adult_content = False
ticket = self._html_search_regex(
- 'window\.broadcast_control\.addFlashVar\\(\'file\', \'([^\']+)\'\\);',
+ r"window\.broadcast_control\.addFlashVar\('file'\s*,\s*'([^']+)'\)",
broadcast_page, 'broadcast ticket')
url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket
@@ -312,26 +321,31 @@ class SmotriBroadcastIE(InfoExtractor):
if broadcast_password:
url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest()
- broadcast_json_page = self._download_webpage(url, broadcast_id, 'Downloading broadcast JSON')
+ broadcast_json_page = self._download_webpage(
+ url, broadcast_id, 'Downloading broadcast JSON')
try:
broadcast_json = json.loads(broadcast_json_page)
protected_broadcast = broadcast_json['_pass_protected'] == 1
if protected_broadcast and not broadcast_password:
- raise ExtractorError('This broadcast is protected by a password, use the --video-password option', expected=True)
+ raise ExtractorError(
+ 'This broadcast is protected by a password, use the --video-password option',
+ expected=True)
broadcast_offline = broadcast_json['is_play'] == 0
if broadcast_offline:
raise ExtractorError('Broadcast %s is offline' % broadcast_id, expected=True)
rtmp_url = broadcast_json['_server']
- if not rtmp_url.startswith('rtmp://'):
+ mobj = re.search(r'^rtmp://[^/]+/(?P<app>.+)/?$', rtmp_url)
+ if not mobj:
raise ExtractorError('Unexpected broadcast rtmp URL')
broadcast_playpath = broadcast_json['_streamName']
+ broadcast_app = '%s/%s' % (mobj.group('app'), broadcast_json['_vidURL'])
broadcast_thumbnail = broadcast_json['_imgURL']
- broadcast_title = broadcast_json['title']
+ broadcast_title = self._live_title(broadcast_json['title'])
broadcast_description = broadcast_json['description']
broadcaster_nick = broadcast_json['nick']
broadcaster_login = broadcast_json['login']
@@ -352,6 +366,9 @@ class SmotriBroadcastIE(InfoExtractor):
'age_limit': 18 if adult_content else 0,
'ext': 'flv',
'play_path': broadcast_playpath,
+ 'player_url': 'http://pics.smotri.com/broadcast_play.swf',
+ 'app': broadcast_app,
'rtmp_live': True,
- 'rtmp_conn': rtmp_conn
+ 'rtmp_conn': rtmp_conn,
+ 'is_live': True,
}
diff --git a/youtube_dl/extractor/sockshare.py b/youtube_dl/extractor/sockshare.py
index c663e56d4..7d3c0e937 100644
--- a/youtube_dl/extractor/sockshare.py
+++ b/youtube_dl/extractor/sockshare.py
@@ -1,13 +1,16 @@
# coding: utf-8
from __future__ import unicode_literals
-from ..utils import (
- ExtractorError,
+import re
+
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
+)
+from ..utils import (
determine_ext,
+ ExtractorError,
)
-import re
from .common import InfoExtractor
@@ -27,9 +30,7 @@ class SockshareIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
url = 'http://sockshare.com/file/%s' % video_id
webpage = self._download_webpage(url, video_id)
diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py
index 07f514a46..c04791997 100644
--- a/youtube_dl/extractor/sohu.py
+++ b/youtube_dl/extractor/sohu.py
@@ -1,11 +1,10 @@
# encoding: utf-8
from __future__ import unicode_literals
-import json
import re
from .common import InfoExtractor
-from ..utils import ExtractorError
+from .common import compat_str
class SohuIE(InfoExtractor):
@@ -29,60 +28,73 @@ class SohuIE(InfoExtractor):
base_data_url = 'http://my.tv.sohu.com/play/videonew.do?vid='
else:
base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid='
- data_url = base_data_url + str(vid_id)
- data_json = self._download_webpage(
- data_url, video_id,
- note='Downloading JSON data for ' + str(vid_id))
- return json.loads(data_json)
+
+ return self._download_json(
+ base_data_url + vid_id, video_id,
+ 'Downloading JSON data for %s' % vid_id)
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
mytv = mobj.group('mytv') is not None
webpage = self._download_webpage(url, video_id)
- raw_title = self._html_search_regex(r'(?s)<title>(.+?)</title>',
- webpage, 'video title')
+ raw_title = self._html_search_regex(
+ r'(?s)<title>(.+?)</title>',
+ webpage, 'video title')
title = raw_title.partition('-')[0].strip()
- vid = self._html_search_regex(r'var vid ?= ?["\'](\d+)["\']', webpage,
- 'video path')
- data = _fetch_data(vid, mytv)
-
- QUALITIES = ('ori', 'super', 'high', 'nor')
- vid_ids = [data['data'][q + 'Vid']
- for q in QUALITIES
- if data['data'][q + 'Vid'] != 0]
- if not vid_ids:
- raise ExtractorError('No formats available for this video')
+ vid = self._html_search_regex(
+ r'var vid ?= ?["\'](\d+)["\']',
+ webpage, 'video path')
+ vid_data = _fetch_data(vid, mytv)
- # For now, we just pick the highest available quality
- vid_id = vid_ids[-1]
+ formats_json = {}
+ for format_id in ('nor', 'high', 'super', 'ori', 'h2644k', 'h2654k'):
+ vid_id = vid_data['data'].get('%sVid' % format_id)
+ if not vid_id:
+ continue
+ vid_id = compat_str(vid_id)
+ formats_json[format_id] = vid_data if vid == vid_id else _fetch_data(vid_id, mytv)
- format_data = data if vid == vid_id else _fetch_data(vid_id, mytv)
- part_count = format_data['data']['totalBlocks']
- allot = format_data['allot']
- prot = format_data['prot']
- clipsURL = format_data['data']['clipsURL']
- su = format_data['data']['su']
+ part_count = vid_data['data']['totalBlocks']
playlist = []
for i in range(part_count):
- part_url = ('http://%s/?prot=%s&file=%s&new=%s' %
- (allot, prot, clipsURL[i], su[i]))
- part_str = self._download_webpage(
- part_url, video_id,
- note='Downloading part %d of %d' % (i + 1, part_count))
-
- part_info = part_str.split('|')
- video_url = '%s%s?key=%s' % (part_info[0], su[i], part_info[3])
-
- video_info = {
- 'id': '%s_part%02d' % (video_id, i + 1),
+ formats = []
+ for format_id, format_data in formats_json.items():
+ allot = format_data['allot']
+ prot = format_data['prot']
+
+ data = format_data['data']
+ clips_url = data['clipsURL']
+ su = data['su']
+
+ part_str = self._download_webpage(
+ 'http://%s/?prot=%s&file=%s&new=%s' %
+ (allot, prot, clips_url[i], su[i]),
+ video_id,
+ 'Downloading %s video URL part %d of %d'
+ % (format_id, i + 1, part_count))
+
+ part_info = part_str.split('|')
+ video_url = '%s%s?key=%s' % (part_info[0], su[i], part_info[3])
+
+ formats.append({
+ 'url': video_url,
+ 'format_id': format_id,
+ 'filesize': data['clipsBytes'][i],
+ 'width': data['width'],
+ 'height': data['height'],
+ 'fps': data['fps'],
+ })
+ self._sort_formats(formats)
+
+ playlist.append({
+ 'id': '%s_part%d' % (video_id, i + 1),
'title': title,
- 'url': video_url,
- 'ext': 'mp4',
- }
- playlist.append(video_info)
+ 'duration': vid_data['data']['clipsDuration'][i],
+ 'formats': formats,
+ })
if len(playlist) == 1:
info = playlist[0]
diff --git a/youtube_dl/extractor/soulanime.py b/youtube_dl/extractor/soulanime.py
new file mode 100644
index 000000000..feef33e27
--- /dev/null
+++ b/youtube_dl/extractor/soulanime.py
@@ -0,0 +1,80 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ HEADRequest,
+ urlhandle_detect_ext,
+)
+
+
+class SoulAnimeWatchingIE(InfoExtractor):
+ IE_NAME = "soulanime:watching"
+ IE_DESC = "SoulAnime video"
+ _TEST = {
+ 'url': 'http://www.soul-anime.net/watching/seirei-tsukai-no-blade-dance-episode-9/',
+ 'md5': '05fae04abf72298098b528e98abf4298',
+ 'info_dict': {
+ 'id': 'seirei-tsukai-no-blade-dance-episode-9',
+ 'ext': 'mp4',
+ 'title': 'seirei-tsukai-no-blade-dance-episode-9',
+ 'description': 'seirei-tsukai-no-blade-dance-episode-9'
+ }
+ }
+ _VALID_URL = r'http://[w.]*soul-anime\.(?P<domain>[^/]+)/watch[^/]*/(?P<id>[^/]+)'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ domain = mobj.group('domain')
+
+ page = self._download_webpage(url, video_id)
+
+ video_url_encoded = self._html_search_regex(
+ r'<div id="download">[^<]*<a href="(?P<url>[^"]+)"', page, 'url')
+ video_url = "http://www.soul-anime." + domain + video_url_encoded
+
+ ext_req = HEADRequest(video_url)
+ ext_handle = self._request_webpage(
+ ext_req, video_id, note='Determining extension')
+ ext = urlhandle_detect_ext(ext_handle)
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'ext': ext,
+ 'title': video_id,
+ 'description': video_id
+ }
+
+
+class SoulAnimeSeriesIE(InfoExtractor):
+ IE_NAME = "soulanime:series"
+ IE_DESC = "SoulAnime Series"
+
+ _VALID_URL = r'http://[w.]*soul-anime\.(?P<domain>[^/]+)/anime./(?P<id>[^/]+)'
+
+ _EPISODE_REGEX = r'<option value="(/watch[^/]*/[^"]+)">[^<]*</option>'
+
+ _TEST = {
+ 'url': 'http://www.soul-anime.net/anime1/black-rock-shooter-tv/',
+ 'info_dict': {
+ 'id': 'black-rock-shooter-tv'
+ },
+ 'playlist_count': 8
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ series_id = mobj.group('id')
+ domain = mobj.group('domain')
+
+ pattern = re.compile(self._EPISODE_REGEX)
+
+ page = self._download_webpage(url, series_id, "Downloading series page")
+ mobj = pattern.findall(page)
+
+ entries = [self.url_result("http://www.soul-anime." + domain + obj) for obj in mobj]
+
+ return self.playlist_result(entries, series_id)
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index ab9483d2d..5d60c4939 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -5,11 +5,12 @@ import re
import itertools
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_str,
compat_urlparse,
compat_urllib_parse,
-
+)
+from ..utils import (
ExtractorError,
int_or_none,
unified_strdate,
diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py
index 94602e89e..b936202f6 100644
--- a/youtube_dl/extractor/spankwire.py
+++ b/youtube_dl/extractor/spankwire.py
@@ -3,12 +3,14 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
+ compat_urllib_parse,
compat_urllib_parse_urlparse,
compat_urllib_request,
- compat_urllib_parse,
- unified_strdate,
+)
+from ..utils import (
str_to_int,
+ unified_strdate,
)
from ..aes import aes_decrypt_text
diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py
index 057ef5251..1a57aebf1 100644
--- a/youtube_dl/extractor/sportdeutschland.py
+++ b/youtube_dl/extractor/sportdeutschland.py
@@ -4,8 +4,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_request,
+)
+from ..utils import (
parse_iso8601,
)
@@ -58,9 +60,10 @@ class SportDeutschlandIE(InfoExtractor):
categories = list(data.get('section', {}).get('tags', {}).values())
asset = data['asset']
+ assets_info = self._download_json(asset['url'], video_id)
formats = []
- smil_url = asset['video']
+ smil_url = assets_info['video']
if '.smil' in smil_url:
m3u8_url = smil_url.replace('.smil', '.m3u8')
formats.extend(
diff --git a/youtube_dl/extractor/streamcloud.py b/youtube_dl/extractor/streamcloud.py
index c1178f26d..d4e134015 100644
--- a/youtube_dl/extractor/streamcloud.py
+++ b/youtube_dl/extractor/streamcloud.py
@@ -2,10 +2,9 @@
from __future__ import unicode_literals
import re
-import time
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
)
@@ -40,8 +39,7 @@ class StreamcloudIE(InfoExtractor):
''', orig_webpage)
post = compat_urllib_parse.urlencode(fields)
- self.to_screen('%s: Waiting for timeout' % video_id)
- time.sleep(12)
+ self._sleep(12, video_id)
headers = {
b'Content-Type': b'application/x-www-form-urlencoded',
}
diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py
index 73efe9542..c3ceb5f76 100644
--- a/youtube_dl/extractor/streamcz.py
+++ b/youtube_dl/extractor/streamcz.py
@@ -1,18 +1,14 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
-import re
-import json
-
from .common import InfoExtractor
from ..utils import (
int_or_none,
- compat_str,
)
class StreamCZIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?stream\.cz/.+/(?P<videoid>.+)'
+ _VALID_URL = r'https?://(?:www\.)?stream\.cz/.+/(?P<id>[0-9]+)'
_TESTS = [{
'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti',
@@ -21,61 +17,63 @@ class StreamCZIE(InfoExtractor):
'id': '765767',
'ext': 'mp4',
'title': 'Peklo na talíři: Éčka pro děti',
- 'description': 'md5:49ace0df986e95e331d0fe239d421519',
- 'thumbnail': 'http://im.stream.cz/episode/52961d7e19d423f8f06f0100',
+ 'description': 'Taška s grónskou pomazánkou a další pekelnosti ZDE',
+ 'thumbnail': 're:^http://im.stream.cz/episode/52961d7e19d423f8f06f0100',
'duration': 256,
},
}, {
'url': 'http://www.stream.cz/blanik/10002447-tri-roky-pro-mazanka',
- 'md5': '246272e753e26bbace7fcd9deca0650c',
+ 'md5': 'e54a254fb8b871968fd8403255f28589',
'info_dict': {
'id': '10002447',
'ext': 'mp4',
'title': 'Kancelář Blaník: Tři roky pro Mazánka',
- 'description': 'md5:9177695a8b756a0a8ab160de4043b392',
- 'thumbnail': 'http://im.stream.cz/episode/537f838c50c11f8d21320000',
+ 'description': 'md5:3862a00ba7bf0b3e44806b544032c859',
+ 'thumbnail': 're:^http://im.stream.cz/episode/537f838c50c11f8d21320000',
'duration': 368,
},
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('videoid')
-
- webpage = self._download_webpage(url, video_id)
-
- data = self._html_search_regex(r'Stream\.Data\.Episode\((.+?)\);', webpage, 'stream data')
-
- jsonData = json.loads(data)
+ video_id = self._match_id(url)
+ data = self._download_json(
+ 'http://www.stream.cz/API/episode/%s' % video_id, video_id)
formats = []
- for video in jsonData['instances']:
- for video_format in video['instances']:
- format_id = video_format['quality']
-
- if format_id == '240p':
- quality = 0
- elif format_id == '360p':
- quality = 1
- elif format_id == '480p':
- quality = 2
- elif format_id == '720p':
- quality = 3
-
+ for quality, video in enumerate(data['video_qualities']):
+ for f in video['formats']:
+ typ = f['type'].partition('/')[2]
+ qlabel = video.get('quality_label')
formats.append({
- 'format_id': '%s-%s' % (video_format['type'].split('/')[1], format_id),
- 'url': video_format['source'],
+ 'format_note': '%s-%s' % (qlabel, typ) if qlabel else typ,
+ 'format_id': '%s-%s' % (typ, f['quality']),
+ 'url': f['source'],
+ 'height': int_or_none(f['quality'].rstrip('p')),
'quality': quality,
})
-
self._sort_formats(formats)
+ image = data.get('image')
+ if image:
+ thumbnail = self._proto_relative_url(
+ image.replace('{width}', '1240').replace('{height}', '697'),
+ scheme='http:',
+ )
+ else:
+ thumbnail = None
+
+ stream = data.get('_embedded', {}).get('stream:show', {}).get('name')
+ if stream:
+ title = '%s: %s' % (stream, data['name'])
+ else:
+ title = data['name']
+
return {
- 'id': compat_str(jsonData['episode_id']),
- 'title': self._og_search_title(webpage),
- 'thumbnail': jsonData['episode_image_original_url'].replace('//', 'http://'),
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
'formats': formats,
- 'description': self._og_search_description(webpage),
- 'duration': int_or_none(jsonData['duration']),
- 'view_count': int_or_none(jsonData['stats_total']),
+ 'description': data.get('web_site_text'),
+ 'duration': int_or_none(data.get('duration')),
+ 'view_count': int_or_none(data.get('views')),
}
diff --git a/youtube_dl/extractor/sunporno.py b/youtube_dl/extractor/sunporno.py
index 263f09b46..8a333f1d2 100644
--- a/youtube_dl/extractor/sunporno.py
+++ b/youtube_dl/extractor/sunporno.py
@@ -28,23 +28,27 @@ class SunPornoIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title')
- description = self._html_search_meta('description', webpage, 'description')
+ title = self._html_search_regex(
+ r'<title>([^<]+)</title>', webpage, 'title')
+ description = self._html_search_meta(
+ 'description', webpage, 'description')
thumbnail = self._html_search_regex(
r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False)
duration = parse_duration(self._search_regex(
- r'Duration:\s*(\d+:\d+)\s*<', webpage, 'duration', fatal=False))
+ r'itemprop="duration">\s*(\d+:\d+)\s*<',
+ webpage, 'duration', fatal=False))
view_count = int_or_none(self._html_search_regex(
- r'class="views">\s*(\d+)\s*<', webpage, 'view count', fatal=False))
+ r'class="views">\s*(\d+)\s*<',
+ webpage, 'view count', fatal=False))
comment_count = int_or_none(self._html_search_regex(
- r'(\d+)</b> Comments?', webpage, 'comment count', fatal=False))
+ r'(\d+)</b> Comments?',
+ webpage, 'comment count', fatal=False))
formats = []
quality = qualities(['mp4', 'flv'])
diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py
index b87047451..bfe07b024 100644
--- a/youtube_dl/extractor/tagesschau.py
+++ b/youtube_dl/extractor/tagesschau.py
@@ -4,10 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import parse_filesize
class TagesschauIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/video/video(?P<id>-?[0-9]+)\.html'
+ _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:sendung/ts|video/video)(?P<id>-?[0-9]+)\.html'
_TESTS = [{
'url': 'http://www.tagesschau.de/multimedia/video/video1399128.html',
@@ -19,6 +20,16 @@ class TagesschauIE(InfoExtractor):
'description': 'md5:69da3c61275b426426d711bde96463ab',
'thumbnail': 're:^http:.*\.jpg$',
},
+ }, {
+ 'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
+ 'md5': '3c54c1f6243d279b706bde660ceec633',
+ 'info_dict': {
+ 'id': '5727',
+ 'ext': 'mp4',
+ 'description': 'md5:695c01bfd98b7e313c501386327aea59',
+ 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr',
+ 'thumbnail': 're:^http:.*\.jpg$',
+ }
}]
_FORMATS = {
@@ -28,42 +39,82 @@ class TagesschauIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- if video_id.startswith('-'):
- display_id = video_id.strip('-')
- else:
- display_id = video_id
-
+ video_id = self._match_id(url)
+ display_id = video_id.lstrip('-')
webpage = self._download_webpage(url, display_id)
- playerpage = self._download_webpage(
- 'http://www.tagesschau.de/multimedia/video/video%s~player_autoplay-true.html' % video_id,
- display_id, 'Downloading player page')
-
- medias = re.findall(
- r'"(http://media.+?)", type:"video/(.+?)", quality:"(.+?)"',
- playerpage)
+ player_url = self._html_search_meta(
+ 'twitter:player', webpage, 'player URL', default=None)
+ if player_url:
+ playerpage = self._download_webpage(
+ player_url, display_id, 'Downloading player page')
- formats = []
- for url, ext, res in medias:
- f = {
- 'format_id': res + '_' + ext,
- 'url': url,
- 'ext': ext,
- }
- f.update(self._FORMATS.get(res, {}))
- formats.append(f)
+ medias = re.findall(
+ r'"(http://media.+?)", type:"video/(.+?)", quality:"(.+?)"',
+ playerpage)
+ formats = []
+ for url, ext, res in medias:
+ f = {
+ 'format_id': res + '_' + ext,
+ 'url': url,
+ 'ext': ext,
+ }
+ f.update(self._FORMATS.get(res, {}))
+ formats.append(f)
+ thumbnail_fn = re.findall(r'"(/multimedia/.+?\.jpg)"', playerpage)[-1]
+ title = self._og_search_title(webpage).strip()
+ description = self._og_search_description(webpage).strip()
+ else:
+ download_text = self._search_regex(
+ r'(?s)<p>Wir bieten dieses Video in folgenden Formaten zum Download an:</p>\s*<div class="controls">(.*?)</div>\s*<p>',
+ webpage, 'download links')
+ links = re.finditer(
+ r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>',
+ download_text)
+ formats = []
+ for l in links:
+ format_id = self._search_regex(
+ r'.*/[^/.]+\.([^/]+)\.[^/.]+', l.group('url'), 'format ID')
+ format = {
+ 'format_id': format_id,
+ 'url': l.group('url'),
+ 'format_name': l.group('name'),
+ }
+ m = re.match(
+ r'''(?x)
+ Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10;
+ (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10;
+ (?P<vbr>[0-9]+)kbps&\#10;
+ Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10;
+ Gr&ouml;&szlig;e:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''',
+ l.group('title'))
+ if m:
+ format.update({
+ 'format_note': m.group('audio_desc'),
+ 'vcodec': m.group('vcodec'),
+ 'width': int(m.group('width')),
+ 'height': int(m.group('height')),
+ 'abr': int(m.group('abr')),
+ 'vbr': int(m.group('vbr')),
+ 'filesize_approx': parse_filesize(m.group('filesize_approx')),
+ })
+ formats.append(format)
+ thumbnail_fn = self._search_regex(
+ r'(?s)<img alt="Sendungsbild".*?src="([^"]+)"',
+ webpage, 'thumbnail', fatal=False)
+ description = self._html_search_regex(
+ r'(?s)<p class="teasertext">(.*?)</p>',
+ webpage, 'description', fatal=False)
+ title = self._html_search_regex(
+ r'<span class="headline".*?>(.*?)</span>', webpage, 'title')
self._sort_formats(formats)
-
- thumbnail = re.findall(r'"(/multimedia/.+?\.jpg)"', playerpage)[-1]
+ thumbnail = 'http://www.tagesschau.de' + thumbnail_fn
return {
'id': display_id,
- 'title': self._og_search_title(webpage).strip(),
- 'thumbnail': 'http://www.tagesschau.de' + thumbnail,
+ 'title': title,
+ 'thumbnail': thumbnail,
'formats': formats,
- 'description': self._og_search_description(webpage).strip(),
+ 'description': description,
}
diff --git a/youtube_dl/extractor/tapely.py b/youtube_dl/extractor/tapely.py
index 283e11350..f1f43d0a7 100644
--- a/youtube_dl/extractor/tapely.py
+++ b/youtube_dl/extractor/tapely.py
@@ -4,10 +4,12 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_request,
+)
from ..utils import (
- ExtractorError,
clean_html,
- compat_urllib_request,
+ ExtractorError,
float_or_none,
parse_iso8601,
)
diff --git a/youtube_dl/extractor/teachertube.py b/youtube_dl/extractor/teachertube.py
index 6c3445d79..82675431f 100644
--- a/youtube_dl/extractor/teachertube.py
+++ b/youtube_dl/extractor/teachertube.py
@@ -57,9 +57,7 @@ class TeacherTubeIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._html_search_meta('title', webpage, 'title', fatal=True)
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index 72160503c..10b3b706a 100644
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -5,7 +5,7 @@ import re
from .subtitles import SubtitlesInfoExtractor
-from ..utils import (
+from ..compat import (
compat_str,
)
@@ -13,7 +13,7 @@ from ..utils import (
class TEDIE(SubtitlesInfoExtractor):
_VALID_URL = r'''(?x)
(?P<proto>https?://)
- (?P<type>www|embed)(?P<urlmain>\.ted\.com/
+ (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
(
(?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
|
@@ -98,7 +98,7 @@ class TEDIE(SubtitlesInfoExtractor):
def _real_extract(self, url):
m = re.match(self._VALID_URL, url, re.VERBOSE)
- if m.group('type') == 'embed':
+ if m.group('type').startswith('embed'):
desktop_url = m.group('proto') + 'www' + m.group('urlmain')
return self.url_result(desktop_url, 'TED')
name = m.group('name')
diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py
index 2a2fff5e1..be3f72df7 100644
--- a/youtube_dl/extractor/telecinco.py
+++ b/youtube_dl/extractor/telecinco.py
@@ -6,7 +6,7 @@ from .mitele import MiTeleIE
class TelecincoIE(MiTeleIE):
IE_NAME = 'telecinco.es'
- _VALID_URL = r'https?://www\.telecinco\.es/[^/]+/[^/]+/[^/]+/(?P<episode>.*?)\.html'
+ _VALID_URL = r'https?://www\.telecinco\.es/[^/]+/[^/]+/[^/]+/(?P<id>.*?)\.html'
_TEST = {
'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html',
diff --git a/youtube_dl/extractor/teletask.py b/youtube_dl/extractor/teletask.py
new file mode 100644
index 000000000..e54145105
--- /dev/null
+++ b/youtube_dl/extractor/teletask.py
@@ -0,0 +1,53 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import unified_strdate
+
+
+class TeleTaskIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?tele-task\.de/archive/video/html5/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.tele-task.de/archive/video/html5/26168/',
+ 'info_dict': {
+ 'title': 'Duplicate Detection',
+ },
+ 'playlist': [{
+ 'md5': '290ef69fb2792e481169c3958dbfbd57',
+ 'info_dict': {
+ 'id': '26168-speaker',
+ 'ext': 'mp4',
+ 'title': 'Duplicate Detection',
+ 'upload_date': '20141218',
+ }
+ }, {
+ 'md5': 'e1e7218c5f0e4790015a437fcf6c71b4',
+ 'info_dict': {
+ 'id': '26168-slides',
+ 'ext': 'mp4',
+ 'title': 'Duplicate Detection',
+ 'upload_date': '20141218',
+ }
+ }]
+ }
+
+ def _real_extract(self, url):
+ lecture_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, lecture_id)
+
+ title = self._html_search_regex(
+ r'itemprop="name">([^<]+)</a>', webpage, 'title')
+ upload_date = unified_strdate(self._html_search_regex(
+ r'Date:</td><td>([^<]+)</td>', webpage, 'date', fatal=False))
+
+ entries = [{
+ 'id': '%s-%s' % (lecture_id, format_id),
+ 'url': video_url,
+ 'title': title,
+ 'upload_date': upload_date,
+ } for format_id, video_url in re.findall(
+ r'<video class="([^"]+)"[^>]*>\s*<source src="([^"]+)"', webpage)]
+
+ return self.playlist_result(entries, lecture_id, title)
diff --git a/youtube_dl/extractor/tenplay.py b/youtube_dl/extractor/tenplay.py
index 81ba169fb..466155ef8 100644
--- a/youtube_dl/extractor/tenplay.py
+++ b/youtube_dl/extractor/tenplay.py
@@ -8,7 +8,6 @@ class TenPlayIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?ten(play)?\.com\.au/.+'
_TEST = {
'url': 'http://tenplay.com.au/ten-insider/extra/season-2013/tenplay-tv-your-way',
- #'md5': 'd68703d9f73dc8fccf3320ab34202590',
'info_dict': {
'id': '2695695426001',
'ext': 'flv',
diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py
index 6e61cc9e2..025d0877c 100644
--- a/youtube_dl/extractor/tf1.py
+++ b/youtube_dl/extractor/tf1.py
@@ -1,15 +1,13 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
class TF1IE(InfoExtractor):
"""TF1 uses the wat.tv player."""
- _VALID_URL = r'http://videos\.tf1\.fr/.*-(?P<id>.*?)\.html'
- _TEST = {
+ _VALID_URL = r'http://(?:videos\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html'
+ _TESTS = {
'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
'info_dict': {
'id': '10635995',
@@ -21,14 +19,26 @@ class TF1IE(InfoExtractor):
# Sometimes wat serves the whole file with the --test option
'skip_download': True,
},
+ }, {
+ 'url': 'http://www.tfou.fr/chuggington/videos/le-grand-mysterioso-chuggington-7085291-739.html',
+ 'info_dict': {
+ 'id': '12043945',
+ 'ext': 'mp4',
+ 'title': 'Le grand Mystérioso - Chuggington',
+ 'description': 'Le grand Mystérioso - Emery rêve qu\'un article lui soit consacré dans le journal.',
+ 'upload_date': '20150103',
+ },
+ 'params': {
+ # Sometimes wat serves the whole file with the --test option
+ 'skip_download': True,
+ },
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
embed_url = self._html_search_regex(
- r'"(https://www.wat.tv/embedframe/.*?)"', webpage, 'embed url')
+ r'["\'](https?://www.wat.tv/embedframe/.*?)["\']', webpage, 'embed url')
embed_page = self._download_webpage(embed_url, video_id,
'Downloading embed player page')
wat_id = self._search_regex(r'UVID=(.*?)&', embed_page, 'wat id')
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index e2653d62d..110ed976d 100644
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -3,9 +3,11 @@ from __future__ import unicode_literals
import re
import json
-from .common import InfoExtractor
-from ..utils import (
+from .subtitles import SubtitlesInfoExtractor
+from ..compat import (
compat_str,
+)
+from ..utils import (
determine_ext,
ExtractorError,
xpath_with_ns,
@@ -14,7 +16,7 @@ from ..utils import (
_x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language'})
-class ThePlatformIE(InfoExtractor):
+class ThePlatformIE(SubtitlesInfoExtractor):
_VALID_URL = r'''(?x)
(?:https?://(?:link|player)\.theplatform\.com/[sp]/[^/]+/
(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/)?
@@ -64,6 +66,20 @@ class ThePlatformIE(InfoExtractor):
info_json = self._download_webpage(info_url, video_id)
info = json.loads(info_json)
+ subtitles = {}
+ captions = info.get('captions')
+ if isinstance(captions, list):
+ for caption in captions:
+ lang, src = caption.get('lang'), caption.get('src')
+ if lang and src:
+ subtitles[lang] = src
+
+ if self._downloader.params.get('listsubtitles', False):
+ self._list_available_subtitles(video_id, subtitles)
+ return
+
+ subtitles = self.extract_subtitles(video_id, subtitles)
+
head = meta.find(_x('smil:head'))
body = meta.find(_x('smil:body'))
@@ -115,6 +131,7 @@ class ThePlatformIE(InfoExtractor):
return {
'id': video_id,
'title': info['title'],
+ 'subtitles': subtitles,
'formats': formats,
'description': info['description'],
'thumbnail': info['defaultThumbnailUrl'],
diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py
index 66d159e99..9f9e388c5 100644
--- a/youtube_dl/extractor/tlc.py
+++ b/youtube_dl/extractor/tlc.py
@@ -5,7 +5,7 @@ import re
from .common import InfoExtractor
from .brightcove import BrightcoveIE
from .discovery import DiscoveryIE
-from ..utils import compat_urlparse
+from ..compat import compat_urlparse
class TlcIE(DiscoveryIE):
diff --git a/youtube_dl/extractor/tmz.py b/youtube_dl/extractor/tmz.py
index 827aa08a4..c5c6fdc51 100644
--- a/youtube_dl/extractor/tmz.py
+++ b/youtube_dl/extractor/tmz.py
@@ -15,7 +15,7 @@ class TMZIE(InfoExtractor):
'ext': 'mp4',
'title': 'Kim Kardashian\'s Boobs Unlock a Mystery!',
'description': 'Did Kim Kardasain try to one-up Khloe by one-upping Kylie??? Or is she just showing off her amazing boobs?',
- 'thumbnail': 'http://cdnbakmi.kaltura.com/p/591531/sp/59153100/thumbnail/entry_id/0_okj015ty/version/100002/acv/182/width/640',
+ 'thumbnail': r're:http://cdnbakmi\.kaltura\.com/.*thumbnail.*',
}
}
diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py
index 0ecd695f8..d48cbbf14 100644
--- a/youtube_dl/extractor/tnaflix.py
+++ b/youtube_dl/extractor/tnaflix.py
@@ -12,7 +12,7 @@ from ..utils import (
class TNAFlixIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?tnaflix\.com/(?P<cat_id>[\w-]+)/(?P<display_id>[\w-]+)/video(?P<id>\d+)'
- _TITLE_REGEX = None
+ _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>'
_DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>'
_CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"'
@@ -49,8 +49,8 @@ class TNAFlixIE(InfoExtractor):
if duration:
duration = parse_duration(duration[1:])
- cfg_url = self._html_search_regex(
- self._CONFIG_REGEX, webpage, 'flashvars.config')
+ cfg_url = self._proto_relative_url(self._html_search_regex(
+ self._CONFIG_REGEX, webpage, 'flashvars.config'), 'http:')
cfg_xml = self._download_xml(
cfg_url, display_id, note='Downloading metadata',
diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py
index 64a1e9030..d73ad3762 100644
--- a/youtube_dl/extractor/tube8.py
+++ b/youtube_dl/extractor/tube8.py
@@ -4,9 +4,11 @@ import json
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse_urlparse,
compat_urllib_request,
+)
+from ..utils import (
int_or_none,
str_to_int,
)
diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py
index 161e47624..c89de5ba4 100644
--- a/youtube_dl/extractor/tudou.py
+++ b/youtube_dl/extractor/tudou.py
@@ -9,7 +9,7 @@ from .common import InfoExtractor
class TudouIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs|albumplay)/(?:view|(.+?))/(?:([^/]+)|([^/]+))(?:\.html)?'
+ _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/.*?/(?P<id>[^/?#]+?)(?:\.html)?/?(?:$|[?#])'
_TESTS = [{
'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
'md5': '140a49ed444bd22f93330985d8475fcb',
@@ -27,13 +27,6 @@ class TudouIE(InfoExtractor):
'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012',
'thumbnail': 're:^https?://.*\.jpg$',
}
- }, {
- 'url': 'http://www.tudou.com/albumplay/TenTw_JgiPM/PzsAs5usU9A.html',
- 'info_dict': {
- 'title': 'todo.mp4',
- },
- 'add_ie': ['Youku'],
- 'skip': 'Only works from China'
}]
def _url_for_id(self, id, quality=None):
@@ -45,8 +38,7 @@ class TudouIE(InfoExtractor):
return final_url
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(2)
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
m = re.search(r'vcode:\s*[\'"](.+?)[\'"]', webpage)
@@ -87,4 +79,9 @@ class TudouIE(InfoExtractor):
}
result.append(part_info)
- return result
+ return {
+ '_type': 'multi_video',
+ 'entries': result,
+ 'id': video_id,
+ 'title': title,
+ }
diff --git a/youtube_dl/extractor/tunein.py b/youtube_dl/extractor/tunein.py
index 4ce5aeeba..b6b1f2568 100644
--- a/youtube_dl/extractor/tunein.py
+++ b/youtube_dl/extractor/tunein.py
@@ -24,7 +24,7 @@ class TuneInIE(InfoExtractor):
_INFO_DICT = {
'id': '34682',
'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2',
- 'ext': 'AAC',
+ 'ext': 'aac',
'thumbnail': 're:^https?://.*\.png$',
'location': 'Tacoma, WA',
}
@@ -78,14 +78,21 @@ class TuneInIE(InfoExtractor):
for stream in streams:
if stream.get('Type') == 'Live':
is_live = True
+ reliability = stream.get('Reliability')
+ format_note = (
+ 'Reliability: %d%%' % reliability
+ if reliability is not None else None)
formats.append({
+ 'preference': (
+ 0 if reliability is None or reliability > 90
+ else 1),
'abr': stream.get('Bandwidth'),
- 'ext': stream.get('MediaType'),
+ 'ext': stream.get('MediaType').lower(),
'acodec': stream.get('MediaType'),
'vcodec': 'none',
'url': stream.get('Url'),
- # Sometimes streams with the highest quality do not exist
- 'preference': stream.get('Reliability'),
+ 'source_preference': reliability,
+ 'format_note': format_note,
})
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py
index d516b6427..4de0aac52 100644
--- a/youtube_dl/extractor/tutv.py
+++ b/youtube_dl/extractor/tutv.py
@@ -1,10 +1,9 @@
from __future__ import unicode_literals
import base64
-import re
from .common import InfoExtractor
-from ..utils import compat_parse_qs
+from ..compat import compat_parse_qs
class TutvIE(InfoExtractor):
@@ -20,10 +19,9 @@ class TutvIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+
internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, 'internal video ID')
data_content = self._download_webpage(
diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py
index d81d1d1a6..ba65996dc 100644
--- a/youtube_dl/extractor/tvigle.py
+++ b/youtube_dl/extractor/tvigle.py
@@ -1,32 +1,30 @@
# encoding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..utils import (
float_or_none,
- str_to_int,
+ parse_age_limit,
)
class TvigleIE(InfoExtractor):
IE_NAME = 'tvigle'
IE_DESC = 'Интернет-телевидение Tvigle.ru'
- _VALID_URL = r'http://(?:www\.)?tvigle\.ru/(?:[^/]+/)+(?P<display_id>[^/]+)/$'
+ _VALID_URL = r'http://(?:www\.)?tvigle\.ru/(?:[^/]+/)+(?P<id>[^/]+)/$'
_TESTS = [
{
- 'url': 'http://www.tvigle.ru/video/brat/',
- 'md5': 'ff4344a4894b0524441fb6f8218dc716',
+ 'url': 'http://www.tvigle.ru/video/sokrat/',
+ 'md5': '36514aed3657d4f70b4b2cef8eb520cd',
'info_dict': {
- 'id': '5118490',
- 'display_id': 'brat',
- 'ext': 'mp4',
- 'title': 'Брат',
- 'description': 'md5:d16ac7c0b47052ea51fddb92c4e413eb',
- 'duration': 5722.6,
- 'age_limit': 16,
+ 'id': '1848932',
+ 'display_id': 'sokrat',
+ 'ext': 'flv',
+ 'title': 'Сократ',
+ 'description': 'md5:a05bd01be310074d5833efc6743be95e',
+ 'duration': 6586,
+ 'age_limit': 0,
},
},
{
@@ -44,8 +42,7 @@ class TvigleIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('display_id')
+ display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
@@ -60,8 +57,8 @@ class TvigleIE(InfoExtractor):
title = item['title']
description = item['description']
thumbnail = item['thumbnail']
- duration = float_or_none(item['durationMilliseconds'], 1000)
- age_limit = str_to_int(item['ageRestrictions'])
+ duration = float_or_none(item.get('durationMilliseconds'), 1000)
+ age_limit = parse_age_limit(item.get('ageRestrictions'))
formats = []
for vcodec, fmts in item['videos'].items():
diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py
index eb9473754..9a53a3c74 100644
--- a/youtube_dl/extractor/tvplay.py
+++ b/youtube_dl/extractor/tvplay.py
@@ -6,7 +6,6 @@ import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
- ExtractorError,
parse_iso8601,
qualities,
)
@@ -182,8 +181,8 @@ class TVPlayIE(InfoExtractor):
'http://playapi.mtgx.tv/v1/videos/%s' % video_id, video_id, 'Downloading video JSON')
if video['is_geo_blocked']:
- raise ExtractorError(
- 'This content is not available in your country due to copyright reasons', expected=True)
+ self.report_warning(
+ 'This content might not be available in your country due to copyright reasons')
streams = self._download_json(
'http://playapi.mtgx.tv/v1/videos/stream/%s' % video_id, video_id, 'Downloading streams JSON')
diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py
index 36aa1ad6e..b11a1d561 100644
--- a/youtube_dl/extractor/twitch.py
+++ b/youtube_dl/extractor/twitch.py
@@ -1,9 +1,14 @@
+# coding: utf-8
from __future__ import unicode_literals
import itertools
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+ compat_urllib_request,
+)
from ..utils import (
ExtractorError,
parse_iso8601,
@@ -17,6 +22,7 @@ class TwitchIE(InfoExtractor):
_VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?twitch\.tv/
(?:
(?P<channelid>[^/]+)|
+ (?:(?:[^/]+)/v/(?P<vodid>[^/]+))|
(?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
(?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
)
@@ -24,6 +30,7 @@ class TwitchIE(InfoExtractor):
"""
_PAGE_LIMIT = 100
_API_BASE = 'https://api.twitch.tv'
+ _LOGIN_URL = 'https://secure.twitch.tv/user/login'
_TESTS = [{
'url': 'http://www.twitch.tv/riotgames/b/577357806',
'info_dict': {
@@ -64,11 +71,24 @@ class TwitchIE(InfoExtractor):
def _extract_media(self, item, item_id):
ITEMS = {
'a': 'video',
+ 'v': 'vod',
'c': 'chapter',
}
info = self._extract_info(self._download_json(
'%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
'Downloading %s info JSON' % ITEMS[item]))
+
+ if item == 'v':
+ access_token = self._download_json(
+ '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
+ 'Downloading %s access token' % ITEMS[item])
+ formats = self._extract_m3u8_formats(
+ 'http://usher.twitch.tv/vod/%s?nauth=%s&nauthsig=%s'
+ % (item_id, access_token['token'], access_token['sig']),
+ item_id, 'mp4')
+ info['formats'] = formats
+ return info
+
response = self._download_json(
'%s/api/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
'Downloading %s playlist JSON' % ITEMS[item])
@@ -109,6 +129,44 @@ class TwitchIE(InfoExtractor):
'view_count': info['views'],
}
+ def _real_initialize(self):
+ self._login()
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ return
+
+ login_page = self._download_webpage(
+ self._LOGIN_URL, None, 'Downloading login page')
+
+ authenticity_token = self._search_regex(
+ r'<input name="authenticity_token" type="hidden" value="([^"]+)"',
+ login_page, 'authenticity token')
+
+ login_form = {
+ 'utf8': '✓'.encode('utf-8'),
+ 'authenticity_token': authenticity_token,
+ 'redirect_on_login': '',
+ 'embed_form': 'false',
+ 'mp_source_action': '',
+ 'follow': '',
+ 'user[login]': username,
+ 'user[password]': password,
+ }
+
+ request = compat_urllib_request.Request(
+ self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+ request.add_header('Referer', self._LOGIN_URL)
+ response = self._download_webpage(
+ request, None, 'Logging in as %s' % username)
+
+ m = re.search(
+ r"id=([\"'])login_error_message\1[^>]*>(?P<msg>[^<]+)", response)
+ if m:
+ raise ExtractorError(
+ 'Unable to login: %s' % m.group('msg').strip(), expected=True)
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
if mobj.group('chapterid'):
@@ -165,6 +223,8 @@ class TwitchIE(InfoExtractor):
"""
elif mobj.group('videoid'):
return self._extract_media('a', mobj.group('videoid'))
+ elif mobj.group('vodid'):
+ return self._extract_media('v', mobj.group('vodid'))
elif mobj.group('channelid'):
channel_id = mobj.group('channelid')
info = self._download_json(
diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py
index 0e4d386a8..4667ed83b 100644
--- a/youtube_dl/extractor/udemy.py
+++ b/youtube_dl/extractor/udemy.py
@@ -3,9 +3,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
+)
+from ..utils import (
ExtractorError,
)
@@ -97,11 +99,8 @@ class UdemyIE(InfoExtractor):
if 'returnUrl' not in response:
raise ExtractorError('Unable to log in')
-
-
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- lecture_id = mobj.group('id')
+ lecture_id = self._match_id(url)
lecture = self._download_json(
'https://www.udemy.com/api-1.1/lectures/%s' % lecture_id,
diff --git a/youtube_dl/extractor/urort.py b/youtube_dl/extractor/urort.py
index 5d06fcc9e..8872cfcb2 100644
--- a/youtube_dl/extractor/urort.py
+++ b/youtube_dl/extractor/urort.py
@@ -1,11 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
+)
+from ..utils import (
unified_strdate,
)
@@ -18,11 +18,10 @@ class UrortIE(InfoExtractor):
'url': 'https://urort.p3.no/#!/Band/Gerilja',
'md5': '5ed31a924be8a05e47812678a86e127b',
'info_dict': {
- 'id': '33124-4',
+ 'id': '33124-24',
'ext': 'mp3',
'title': 'The Bomb',
'thumbnail': 're:^https?://.+\.jpg',
- 'like_count': int,
'uploader': 'Gerilja',
'uploader_id': 'Gerilja',
'upload_date': '20100323',
@@ -33,25 +32,31 @@ class UrortIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- playlist_id = mobj.group('id')
+ playlist_id = self._match_id(url)
fstr = compat_urllib_parse.quote("InternalBandUrl eq '%s'" % playlist_id)
- json_url = 'http://urort.p3.no/breeze/urort/TrackDtos?$filter=' + fstr
+ json_url = 'http://urort.p3.no/breeze/urort/TrackDTOViews?$filter=%s&$orderby=Released%%20desc&$expand=Tags%%2CFiles' % fstr
songs = self._download_json(json_url, playlist_id)
- print(songs[0])
-
- entries = [{
- 'id': '%d-%s' % (s['BandId'], s['$id']),
- 'title': s['Title'],
- 'url': s['TrackUrl'],
- 'ext': 'mp3',
- 'uploader_id': playlist_id,
- 'uploader': s.get('BandName', playlist_id),
- 'like_count': s.get('LikeCount'),
- 'thumbnail': 'http://urort.p3.no/cloud/images/%s' % s['Image'],
- 'upload_date': unified_strdate(s.get('Released')),
- } for s in songs]
+ entries = []
+ for s in songs:
+ formats = [{
+ 'tbr': f.get('Quality'),
+ 'ext': f['FileType'],
+ 'format_id': '%s-%s' % (f['FileType'], f.get('Quality', '')),
+ 'url': 'http://p3urort.blob.core.windows.net/tracks/%s' % f['FileRef'],
+ 'preference': 3 if f['FileType'] == 'mp3' else 2,
+ } for f in s['Files']]
+ self._sort_formats(formats)
+ e = {
+ 'id': '%d-%s' % (s['BandId'], s['$id']),
+ 'title': s['Title'],
+ 'uploader_id': playlist_id,
+ 'uploader': s.get('BandName', playlist_id),
+ 'thumbnail': 'http://urort.p3.no/cloud/images/%s' % s['Image'],
+ 'upload_date': unified_strdate(s.get('Released')),
+ 'formats': formats,
+ }
+ entries.append(e)
return {
'_type': 'playlist',
diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py
index 53dc3a496..68d03b999 100644
--- a/youtube_dl/extractor/ustream.py
+++ b/youtube_dl/extractor/ustream.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urlparse,
)
diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py
index 455b6d9da..dd026748d 100644
--- a/youtube_dl/extractor/vbox7.py
+++ b/youtube_dl/extractor/vbox7.py
@@ -1,19 +1,18 @@
# encoding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
-
+)
+from ..utils import (
ExtractorError,
)
class Vbox7IE(InfoExtractor):
- _VALID_URL = r'http://(www\.)?vbox7\.com/play:(?P<id>[^/]+)'
+ _VALID_URL = r'http://(?:www\.)?vbox7\.com/play:(?P<id>[^/]+)'
_TEST = {
'url': 'http://vbox7.com/play:249bb972c2',
'md5': '99f65c0c9ef9b682b97313e052734c3f',
@@ -25,8 +24,7 @@ class Vbox7IE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
redirect_page, urlh = self._download_webpage_handle(url, video_id)
new_location = self._search_regex(r'window\.location = \'(.*)\';',
diff --git a/youtube_dl/extractor/veehd.py b/youtube_dl/extractor/veehd.py
index 94647d1c8..815f58468 100644
--- a/youtube_dl/extractor/veehd.py
+++ b/youtube_dl/extractor/veehd.py
@@ -4,10 +4,12 @@ import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urlparse,
- get_element_by_id,
+)
+from ..utils import (
clean_html,
+ get_element_by_id,
)
@@ -26,8 +28,7 @@ class VeeHDIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
# VeeHD seems to send garbage on the first request.
# See https://github.com/rg3/youtube-dl/issues/2102
diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py
index a7953a7e7..01e258e32 100644
--- a/youtube_dl/extractor/veoh.py
+++ b/youtube_dl/extractor/veoh.py
@@ -4,8 +4,10 @@ import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_request,
+)
+from ..utils import (
int_or_none,
ExtractorError,
)
diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py
index c912c3cbe..43f6b029d 100644
--- a/youtube_dl/extractor/vevo.py
+++ b/youtube_dl/extractor/vevo.py
@@ -4,8 +4,10 @@ import re
import xml.etree.ElementTree
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_request,
+)
+from ..utils import (
ExtractorError,
)
diff --git a/youtube_dl/extractor/videodetective.py b/youtube_dl/extractor/videodetective.py
index ac6c25537..0ffc7ff7d 100644
--- a/youtube_dl/extractor/videodetective.py
+++ b/youtube_dl/extractor/videodetective.py
@@ -1,10 +1,8 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
+from ..compat import compat_urlparse
from .internetvideoarchive import InternetVideoArchiveIE
-from ..utils import compat_urlparse
class VideoDetectiveIE(InfoExtractor):
@@ -17,13 +15,12 @@ class VideoDetectiveIE(InfoExtractor):
'ext': 'mp4',
'title': 'KICK-ASS 2',
'description': 'md5:65ba37ad619165afac7d432eaded6013',
- 'duration': 135,
+ 'duration': 138,
},
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
og_video = self._og_search_video_url(webpage)
query = compat_urlparse.urlparse(og_video).query
diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py
index 29c4e0101..7a78f0d26 100644
--- a/youtube_dl/extractor/videomega.py
+++ b/youtube_dl/extractor/videomega.py
@@ -1,11 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
+)
+from ..utils import (
remove_start,
)
@@ -27,9 +27,7 @@ class VideoMegaIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
url = 'http://videomega.tv/iframe.php?ref={0:}'.format(video_id)
webpage = self._download_webpage(url, video_id)
diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py
new file mode 100644
index 000000000..619039e51
--- /dev/null
+++ b/youtube_dl/extractor/vier.py
@@ -0,0 +1,118 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class VierIE(InfoExtractor):
+ IE_NAME = 'vier'
+ _VALID_URL = r'https?://(?:www\.)?vier\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))'
+ _TESTS = [{
+ 'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129',
+ 'info_dict': {
+ 'id': '16129',
+ 'display_id': 'het-wordt-warm-de-moestuin',
+ 'ext': 'mp4',
+ 'title': 'Het wordt warm in De Moestuin',
+ 'description': 'De vele uren werk eisen hun tol. Wim droomt van assistentie...',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.vier.be/planb/videos/mieren-herders-van-de-bladluizen',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.vier.be/video/v3/embed/16129',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ embed_id = mobj.group('embed_id')
+ display_id = mobj.group('display_id') or embed_id
+
+ webpage = self._download_webpage(url, display_id)
+
+ video_id = self._search_regex(
+ r'"nid"\s*:\s*"(\d+)"', webpage, 'video id')
+ application = self._search_regex(
+ r'"application"\s*:\s*"([^"]+)"', webpage, 'application', default='vier_vod')
+ filename = self._search_regex(
+ r'"filename"\s*:\s*"([^"]+)"', webpage, 'filename')
+
+ playlist_url = 'http://vod.streamcloud.be/%s/mp4:_definst_/%s.mp4/playlist.m3u8' % (application, filename)
+ formats = self._extract_m3u8_formats(playlist_url, display_id, 'mp4')
+
+ title = self._og_search_title(webpage, default=display_id)
+ description = self._og_search_description(webpage, default=None)
+ thumbnail = self._og_search_thumbnail(webpage, default=None)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
+
+
+class VierVideosIE(InfoExtractor):
+ IE_NAME = 'vier:videos'
+ _VALID_URL = r'https?://(?:www\.)?vier\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)'
+ _TESTS = [{
+ 'url': 'http://www.vier.be/demoestuin/videos',
+ 'info_dict': {
+ 'id': 'demoestuin',
+ },
+ 'playlist_mincount': 153,
+ }, {
+ 'url': 'http://www.vier.be/demoestuin/videos?page=6',
+ 'info_dict': {
+ 'id': 'demoestuin-page6',
+ },
+ 'playlist_mincount': 20,
+ }, {
+ 'url': 'http://www.vier.be/demoestuin/videos?page=7',
+ 'info_dict': {
+ 'id': 'demoestuin-page7',
+ },
+ 'playlist_mincount': 13,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ program = mobj.group('program')
+
+ webpage = self._download_webpage(url, program)
+
+ page_id = mobj.group('page')
+ if page_id:
+ page_id = int(page_id)
+ start_page = page_id
+ last_page = start_page + 1
+ playlist_id = '%s-page%d' % (program, page_id)
+ else:
+ start_page = 0
+ last_page = int(self._search_regex(
+ r'videos\?page=(\d+)">laatste</a>',
+ webpage, 'last page', default=0)) + 1
+ playlist_id = program
+
+ entries = []
+ for current_page_id in range(start_page, last_page):
+ current_page = self._download_webpage(
+ 'http://www.vier.be/%s/videos?page=%d' % (program, current_page_id),
+ program,
+ 'Downloading page %d' % (current_page_id + 1)) if current_page_id != page_id else webpage
+ page_entries = [
+ self.url_result('http://www.vier.be' + video_url, 'Vier')
+ for video_url in re.findall(
+ r'<h3><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)]
+ entries.extend(page_entries)
+
+ return self.playlist_result(entries, playlist_id)
diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py
index 15f315298..944901e14 100644
--- a/youtube_dl/extractor/viki.py
+++ b/youtube_dl/extractor/viki.py
@@ -17,7 +17,6 @@ class VikiIE(SubtitlesInfoExtractor):
_VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)'
_TEST = {
'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',
- 'md5': 'a21454021c2646f5433514177e2caa5f',
'info_dict': {
'id': '1023585v',
'ext': 'mp4',
@@ -31,8 +30,7 @@ class VikiIE(SubtitlesInfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1)
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._og_search_title(webpage)
diff --git a/youtube_dl/extractor/vimple.py b/youtube_dl/extractor/vimple.py
index 33d370e1c..ee3d86117 100644
--- a/youtube_dl/extractor/vimple.py
+++ b/youtube_dl/extractor/vimple.py
@@ -14,28 +14,17 @@ class VimpleIE(InfoExtractor):
IE_DESC = 'Vimple.ru'
_VALID_URL = r'https?://(player.vimple.ru/iframe|vimple.ru)/(?P<id>[a-f0-9]{10,})'
_TESTS = [
- # Quality: Large, from iframe
{
- 'url': 'http://player.vimple.ru/iframe/b132bdfd71b546d3972f9ab9a25f201c',
+ 'url': 'http://vimple.ru/c0f6b1687dcd4000a97ebe70068039cf',
+ 'md5': '2e750a330ed211d3fd41821c6ad9a279',
'info_dict': {
- 'id': 'b132bdfd71b546d3972f9ab9a25f201c',
- 'title': 'great-escape-minecraft.flv',
+ 'id': 'c0f6b1687dcd4000a97ebe70068039cf',
'ext': 'mp4',
- 'duration': 352,
- 'webpage_url': 'http://vimple.ru/b132bdfd71b546d3972f9ab9a25f201c',
+ 'title': 'Sunset',
+ 'duration': 20,
+ 'thumbnail': 're:https?://.*?\.jpg',
},
},
- # Quality: Medium, from mainpage
- {
- 'url': 'http://vimple.ru/a15950562888453b8e6f9572dc8600cd',
- 'info_dict': {
- 'id': 'a15950562888453b8e6f9572dc8600cd',
- 'title': 'DB 01',
- 'ext': 'flv',
- 'duration': 1484,
- 'webpage_url': 'http://vimple.ru/a15950562888453b8e6f9572dc8600cd',
- }
- },
]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py
index 42995226e..0b58fe0fe 100644
--- a/youtube_dl/extractor/vine.py
+++ b/youtube_dl/extractor/vine.py
@@ -17,6 +17,7 @@ class VineIE(InfoExtractor):
'id': 'b9KOOWX7HUx',
'ext': 'mp4',
'title': 'Chicken.',
+ 'alt_title': 'Vine by Jack Dorsey',
'description': 'Chicken.',
'upload_date': '20130519',
'uploader': 'Jack Dorsey',
@@ -25,30 +26,26 @@ class VineIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)
data = json.loads(self._html_search_regex(
r'window\.POST_DATA = { %s: ({.+?}) }' % video_id, webpage, 'vine data'))
- formats = [
- {
- 'url': data['videoLowURL'],
- 'ext': 'mp4',
- 'format_id': 'low',
- },
- {
- 'url': data['videoUrl'],
- 'ext': 'mp4',
- 'format_id': 'standard',
- }
- ]
+ formats = [{
+ 'url': data['videoLowURL'],
+ 'ext': 'mp4',
+ 'format_id': 'low',
+ }, {
+ 'url': data['videoUrl'],
+ 'ext': 'mp4',
+ 'format_id': 'standard',
+ }]
return {
'id': video_id,
'title': self._og_search_title(webpage),
+ 'alt_title': self._og_search_description(webpage),
'description': data['description'],
'thumbnail': data['thumbnailUrl'],
'upload_date': unified_strdate(data['created']),
@@ -63,29 +60,36 @@ class VineIE(InfoExtractor):
class VineUserIE(InfoExtractor):
IE_NAME = 'vine:user'
- _VALID_URL = r'(?:https?://)?vine\.co/(?P<user>[^/]+)/?(\?.*)?$'
+ _VALID_URL = r'(?:https?://)?vine\.co/(?P<u>u/)?(?P<user>[^/]+)/?(\?.*)?$'
_VINE_BASE_URL = "https://vine.co/"
- _TEST = {
- 'url': 'https://vine.co/Visa',
- 'info_dict': {
- 'id': 'Visa',
+ _TESTS = [
+ {
+ 'url': 'https://vine.co/Visa',
+ 'info_dict': {
+ 'id': 'Visa',
+ },
+ 'playlist_mincount': 46,
},
- 'playlist_mincount': 46,
- }
+ {
+ 'url': 'https://vine.co/u/941705360593584128',
+ 'only_matching': True,
+ },
+ ]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
user = mobj.group('user')
+ u = mobj.group('u')
- profile_url = "%sapi/users/profiles/vanity/%s" % (
- self._VINE_BASE_URL, user)
+ profile_url = "%sapi/users/profiles/%s%s" % (
+ self._VINE_BASE_URL, 'vanity/' if not u else '', user)
profile_data = self._download_json(
profile_url, user, note='Downloading user profile data')
user_id = profile_data['data']['userId']
timeline_data = []
for pagenum in itertools.count(1):
- timeline_url = "%sapi/timelines/users/%s?page=%s" % (
+ timeline_url = "%sapi/timelines/users/%s?page=%s&size=100" % (
self._VINE_BASE_URL, user_id, pagenum)
timeline_page = self._download_json(
timeline_url, user, note='Downloading page %d' % pagenum)
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
index ca6b0d5b3..81e02a624 100644
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -5,14 +5,17 @@ import re
import json
from .common import InfoExtractor
+from ..compat import (
+ compat_str,
+ compat_urllib_parse,
+ compat_urllib_request,
+)
from ..utils import (
ExtractorError,
- compat_urllib_request,
- compat_urllib_parse,
- compat_str,
+ orderedSet,
unescapeHTML,
unified_strdate,
- orderedSet)
+)
class VKIE(InfoExtractor):
@@ -161,6 +164,14 @@ class VKIE(InfoExtractor):
self.to_screen('Youtube video detected')
return self.url_result(m_yt.group(1), 'Youtube')
+ m_rutube = re.search(
+ r'\ssrc="((?:https?:)?//rutube\.ru\\?/video\\?/embed(?:.*?))\\?"', info_page)
+ if m_rutube is not None:
+ self.to_screen('rutube video detected')
+ rutube_url = self._proto_relative_url(
+ m_rutube.group(1).replace('\\', ''))
+ return self.url_result(rutube_url)
+
m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.*?});', info_page)
if m_opts:
m_opts_url = re.search(r"url\s*:\s*'([^']+)", m_opts.group(1))
diff --git a/youtube_dl/extractor/vodlocker.py b/youtube_dl/extractor/vodlocker.py
index affef6507..1c0966a79 100644
--- a/youtube_dl/extractor/vodlocker.py
+++ b/youtube_dl/extractor/vodlocker.py
@@ -2,8 +2,9 @@
from __future__ import unicode_literals
import re
+
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
)
@@ -24,8 +25,7 @@ class VodlockerIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
fields = dict(re.findall(r'''(?x)<input\s+
diff --git a/youtube_dl/extractor/vube.py b/youtube_dl/extractor/vube.py
index 1b2f731e9..405cb9db4 100644
--- a/youtube_dl/extractor/vube.py
+++ b/youtube_dl/extractor/vube.py
@@ -3,9 +3,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_str,
+)
from ..utils import (
int_or_none,
- compat_str,
ExtractorError,
)
diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py
index ec3c010ad..c3fde53f5 100644
--- a/youtube_dl/extractor/vuclip.py
+++ b/youtube_dl/extractor/vuclip.py
@@ -3,8 +3,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse_urlparse,
+)
+from ..utils import (
ExtractorError,
parse_duration,
qualities,
@@ -25,10 +27,9 @@ class VuClipIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+
ad_m = re.search(
r'''value="No.*?" onClick="location.href='([^"']+)'"''', webpage)
if ad_m:
diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py
index 88bbbb219..c17bebd6e 100644
--- a/youtube_dl/extractor/washingtonpost.py
+++ b/youtube_dl/extractor/washingtonpost.py
@@ -10,14 +10,14 @@ from ..utils import (
class WashingtonPostIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])'
+ _VALID_URL = r'https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])'
_TEST = {
'url': 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/',
'info_dict': {
'title': 'Sinkhole of bureaucracy',
},
'playlist': [{
- 'md5': 'c3f4b4922ffa259243f68e928db2db8c',
+ 'md5': '79132cc09ec5309fa590ae46e4cc31bc',
'info_dict': {
'id': 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f',
'ext': 'mp4',
@@ -29,7 +29,7 @@ class WashingtonPostIE(InfoExtractor):
'upload_date': '20140322',
},
}, {
- 'md5': 'f645a07652c2950cd9134bb852c5f5eb',
+ 'md5': 'e1d5734c06865cc504ad99dc2de0d443',
'info_dict': {
'id': '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f',
'ext': 'mp4',
@@ -44,10 +44,9 @@ class WashingtonPostIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- page_id = mobj.group('id')
-
+ page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id)
+
title = self._og_search_title(webpage)
uuids = re.findall(r'data-video-uuid="([^"]+)"', webpage)
entries = []
diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py
index 93a6e6454..45466e31b 100644
--- a/youtube_dl/extractor/wdr.py
+++ b/youtube_dl/extractor/wdr.py
@@ -1,12 +1,15 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
+import itertools
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_parse_qs,
compat_urlparse,
+)
+from ..utils import (
determine_ext,
unified_strdate,
)
@@ -65,6 +68,10 @@ class WDRIE(InfoExtractor):
'upload_date': '20140717',
},
},
+ {
+ 'url': 'http://www1.wdr.de/mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100.html',
+ 'playlist_mincount': 146,
+ }
]
def _real_extract(self, url):
@@ -79,6 +86,27 @@ class WDRIE(InfoExtractor):
self.url_result(page_url + href, 'WDR')
for href in re.findall(r'<a href="/?(.+?%s\.html)" rel="nofollow"' % self._PLAYER_REGEX, webpage)
]
+
+ if entries: # Playlist page
+ return self.playlist_result(entries, page_id)
+
+ # Overview page
+ entries = []
+ for page_num in itertools.count(2):
+ hrefs = re.findall(
+ r'<li class="mediathekvideo"\s*>\s*<img[^>]*>\s*<a href="(/mediathek/video/[^"]+)"',
+ webpage)
+ entries.extend(
+ self.url_result(page_url + href, 'WDR')
+ for href in hrefs)
+ next_url_m = re.search(
+ r'<li class="nextToLast">\s*<a href="([^"]+)"', webpage)
+ if not next_url_m:
+ break
+ next_url = page_url + next_url_m.group(1)
+ webpage = self._download_webpage(
+ next_url, page_id,
+ note='Downloading playlist page %d' % page_num)
return self.playlist_result(entries, page_id)
flashvars = compat_parse_qs(
@@ -141,7 +169,6 @@ class WDRMobileIE(InfoExtractor):
'title': mobj.group('title'),
'age_limit': int(mobj.group('age_limit')),
'url': url,
- 'ext': determine_ext(url),
'user_agent': 'mobile',
}
@@ -171,8 +198,7 @@ class WDRMausIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
param_code = self._html_search_regex(
@@ -223,5 +249,3 @@ class WDRMausIE(InfoExtractor):
'thumbnail': thumbnail,
'upload_date': upload_date,
}
-
-# TODO test _1
diff --git a/youtube_dl/extractor/webofstories.py b/youtube_dl/extractor/webofstories.py
new file mode 100644
index 000000000..396cf4e83
--- /dev/null
+++ b/youtube_dl/extractor/webofstories.py
@@ -0,0 +1,102 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class WebOfStoriesIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?webofstories\.com/play/(?:[^/]+/)?(?P<id>[0-9]+)'
+ _VIDEO_DOMAIN = 'http://eu-mobile.webofstories.com/'
+ _GREAT_LIFE_STREAMER = 'rtmp://eu-cdn1.webofstories.com/cfx/st/'
+ _USER_STREAMER = 'rtmp://eu-users.webofstories.com/cfx/st/'
+ _TESTS = [
+ {
+ 'url': 'http://www.webofstories.com/play/hans.bethe/71',
+ 'md5': '373e4dd915f60cfe3116322642ddf364',
+ 'info_dict': {
+ 'id': '4536',
+ 'ext': 'mp4',
+ 'title': 'The temperature of the sun',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'description': 'Hans Bethe talks about calculating the temperature of the sun',
+ 'duration': 238,
+ }
+ },
+ {
+ 'url': 'http://www.webofstories.com/play/55908',
+ 'md5': '2985a698e1fe3211022422c4b5ed962c',
+ 'info_dict': {
+ 'id': '55908',
+ 'ext': 'mp4',
+ 'title': 'The story of Gemmata obscuriglobus',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'description': 'Planctomycete talks about The story of Gemmata obscuriglobus',
+ 'duration': 169,
+ }
+ },
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+ title = self._og_search_title(webpage)
+ description = self._html_search_meta('description', webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ story_filename = self._search_regex(
+ r'\.storyFileName\("([^"]+)"\)', webpage, 'story filename')
+ speaker_id = self._search_regex(
+ r'\.speakerId\("([^"]+)"\)', webpage, 'speaker ID')
+ story_id = self._search_regex(
+ r'\.storyId\((\d+)\)', webpage, 'story ID')
+ speaker_type = self._search_regex(
+ r'\.speakerType\("([^"]+)"\)', webpage, 'speaker type')
+ great_life = self._search_regex(
+ r'isGreatLifeStory\s*=\s*(true|false)', webpage, 'great life story')
+ is_great_life_series = great_life == 'true'
+ duration = int_or_none(self._search_regex(
+ r'\.duration\((\d+)\)', webpage, 'duration', fatal=False))
+
+ # URL building, see: http://www.webofstories.com/scripts/player.js
+ ms_prefix = ''
+ if speaker_type.lower() == 'ms':
+ ms_prefix = 'mini_sites/'
+
+ if is_great_life_series:
+ mp4_url = '{0:}lives/{1:}/{2:}.mp4'.format(
+ self._VIDEO_DOMAIN, speaker_id, story_filename)
+ rtmp_ext = 'flv'
+ streamer = self._GREAT_LIFE_STREAMER
+ play_path = 'stories/{0:}/{1:}'.format(
+ speaker_id, story_filename)
+ else:
+ mp4_url = '{0:}{1:}{2:}/{3:}.mp4'.format(
+ self._VIDEO_DOMAIN, ms_prefix, speaker_id, story_filename)
+ rtmp_ext = 'mp4'
+ streamer = self._USER_STREAMER
+ play_path = 'mp4:{0:}{1:}/{2}.mp4'.format(
+ ms_prefix, speaker_id, story_filename)
+
+ formats = [{
+ 'format_id': 'mp4_sd',
+ 'url': mp4_url,
+ }, {
+ 'format_id': 'rtmp_sd',
+ 'page_url': url,
+ 'url': streamer,
+ 'ext': rtmp_ext,
+ 'play_path': play_path,
+ }]
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': story_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ 'description': description,
+ 'duration': duration,
+ }
diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py
index 748443f81..13a079151 100644
--- a/youtube_dl/extractor/wistia.py
+++ b/youtube_dl/extractor/wistia.py
@@ -1,9 +1,8 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import ExtractorError, compat_urllib_request
+from ..compat import compat_urllib_request
+from ..utils import ExtractorError
class WistiaIE(InfoExtractor):
@@ -22,8 +21,7 @@ class WistiaIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
request = compat_urllib_request.Request(self._API_URL.format(video_id))
request.add_header('Referer', url) # Some videos require this.
diff --git a/youtube_dl/extractor/xbef.py b/youtube_dl/extractor/xbef.py
index 1b4e88365..80c48c37d 100644
--- a/youtube_dl/extractor/xbef.py
+++ b/youtube_dl/extractor/xbef.py
@@ -1,9 +1,7 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
)
@@ -23,10 +21,9 @@ class XBefIE(InfoExtractor):
}
def _real_extract(self, url):
- m = re.match(self._VALID_URL, url)
- video_id = m.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+
title = self._html_search_regex(
r'<h1[^>]*>(.*?)</h1>', webpage, 'title')
diff --git a/youtube_dl/extractor/xboxclips.py b/youtube_dl/extractor/xboxclips.py
index a9aa72e73..9cf867807 100644
--- a/youtube_dl/extractor/xboxclips.py
+++ b/youtube_dl/extractor/xboxclips.py
@@ -1,46 +1,42 @@
# encoding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..utils import (
- parse_iso8601,
- float_or_none,
int_or_none,
+ parse_filesize,
+ unified_strdate,
)
class XboxClipsIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?xboxclips\.com/video\.php\?.*vid=(?P<id>[\w-]{36})'
+ _VALID_URL = r'https?://(?:www\.)?xboxclips\.com/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\w-]{36})'
_TEST = {
'url': 'https://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325',
'md5': 'fbe1ec805e920aeb8eced3c3e657df5d',
'info_dict': {
'id': '074a69a9-5faf-46aa-b93b-9909c1720325',
'ext': 'mp4',
- 'title': 'Iabdulelah playing Upload Studio',
- 'filesize_approx': 28101836.8,
- 'timestamp': 1407388500,
+ 'title': 'Iabdulelah playing Titanfall',
+ 'filesize_approx': 26800000,
'upload_date': '20140807',
'duration': 56,
}
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_url = self._html_search_regex(
- r'>Link: <a href="([^"]+)">', webpage, 'video URL')
+ r'>(?:Link|Download): <a href="([^"]+)">', webpage, 'video URL')
title = self._html_search_regex(
r'<title>XboxClips \| ([^<]+)</title>', webpage, 'title')
- timestamp = parse_iso8601(self._html_search_regex(
+ upload_date = unified_strdate(self._html_search_regex(
r'>Recorded: ([^<]+)<', webpage, 'upload date', fatal=False))
- filesize = float_or_none(self._html_search_regex(
- r'>Size: ([\d\.]+)MB<', webpage, 'file size', fatal=False), invscale=1024 * 1024)
+ filesize = parse_filesize(self._html_search_regex(
+ r'>Size: ([^<]+)<', webpage, 'file size', fatal=False))
duration = int_or_none(self._html_search_regex(
r'>Duration: (\d+) Seconds<', webpage, 'duration', fatal=False))
view_count = int_or_none(self._html_search_regex(
@@ -50,7 +46,7 @@ class XboxClipsIE(InfoExtractor):
'id': video_id,
'url': video_url,
'title': title,
- 'timestamp': timestamp,
+ 'upload_date': upload_date,
'filesize_approx': filesize,
'duration': duration,
'view_count': view_count,
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py
index 6b37bcbc9..4527567f8 100644
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -14,7 +14,7 @@ from ..utils import (
class XHamsterIE(InfoExtractor):
"""Information Extractor for xHamster"""
- _VALID_URL = r'http://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
+ _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
_TESTS = [
{
'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
@@ -39,7 +39,11 @@ class XHamsterIE(InfoExtractor):
'duration': 200,
'age_limit': 18,
}
- }
+ },
+ {
+ 'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html',
+ 'only_matching': True,
+ },
]
def _real_extract(self, url):
@@ -57,7 +61,8 @@ class XHamsterIE(InfoExtractor):
video_id = mobj.group('id')
seo = mobj.group('seo')
- mrss_url = 'http://xhamster.com/movies/%s/%s.html' % (video_id, seo)
+ proto = mobj.group('proto')
+ mrss_url = '%s://xhamster.com/movies/%s/%s.html' % (proto, video_id, seo)
webpage = self._download_webpage(mrss_url, video_id)
title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage, 'title')
diff --git a/youtube_dl/extractor/xminus.py b/youtube_dl/extractor/xminus.py
index f7e2e8ac9..8c6241aed 100644
--- a/youtube_dl/extractor/xminus.py
+++ b/youtube_dl/extractor/xminus.py
@@ -1,6 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..compat import (
compat_chr,
@@ -25,6 +27,7 @@ class XMinusIE(InfoExtractor):
'tbr': 320,
'filesize_approx': 5900000,
'view_count': int,
+ 'description': 'md5:03238c5b663810bc79cf42ef3c03e371',
}
}
@@ -48,6 +51,11 @@ class XMinusIE(InfoExtractor):
view_count = int_or_none(self._html_search_regex(
r'<div class="quality.*?► ([0-9]+)',
webpage, 'view count', fatal=False))
+ description = self._html_search_regex(
+ r'(?s)<div id="song_texts">(.*?)</div><br',
+ webpage, 'song lyrics', fatal=False)
+ if description:
+ description = re.sub(' *\r *', '\n', description)
enc_token = self._html_search_regex(
r'minus_track\.tkn="(.+?)"', webpage, 'enc_token')
@@ -64,4 +72,5 @@ class XMinusIE(InfoExtractor):
'filesize_approx': filesize_approx,
'tbr': tbr,
'view_count': view_count,
+ 'description': description,
}
diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py
index 53ed7ef5a..79ed6c744 100644
--- a/youtube_dl/extractor/xnxx.py
+++ b/youtube_dl/extractor/xnxx.py
@@ -1,10 +1,8 @@
# encoding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
)
@@ -23,10 +21,7 @@ class XNXXIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- # Get webpage content
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(r'flv_url=(.*?)&amp;',
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py
index 38448e7c0..e8490b028 100644
--- a/youtube_dl/extractor/xtube.py
+++ b/youtube_dl/extractor/xtube.py
@@ -1,18 +1,20 @@
from __future__ import unicode_literals
import re
-import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_request,
+ compat_urllib_parse,
+)
+from ..utils import (
parse_duration,
str_to_int,
)
class XTubeIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))'
+ _VALID_URL = r'https?://(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<id>[^/?&#]+))'
_TEST = {
'url': 'http://www.xtube.com/watch.php?v=kVTUy_G222_',
'md5': '092fbdd3cbe292c920ef6fc6a8a9cdab',
@@ -28,41 +30,49 @@ class XTubeIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('videoid')
- url = 'http://www.' + mobj.group('url')
+ video_id = self._match_id(url)
req = compat_urllib_request.Request(url)
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
- video_title = self._html_search_regex(r'<p class="title">([^<]+)', webpage, 'title')
+ video_title = self._html_search_regex(
+ r'<p class="title">([^<]+)', webpage, 'title')
video_uploader = self._html_search_regex(
- r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, 'uploader', fatal=False)
+ [r"var\s+contentOwnerId\s*=\s*'([^']+)",
+ r'By:\s*<a href="/community/profile\.php\?user=([^"]+)'],
+ webpage, 'uploader', fatal=False)
video_description = self._html_search_regex(
- r'<p class="fieldsDesc">([^<]+)', webpage, 'description', fatal=False)
+ r'<p class="fieldsDesc">([^<]+)',
+ webpage, 'description', fatal=False)
duration = parse_duration(self._html_search_regex(
- r'<span class="bold">Runtime:</span> ([^<]+)</p>', webpage, 'duration', fatal=False))
- view_count = self._html_search_regex(
- r'<span class="bold">Views:</span> ([\d,\.]+)</p>', webpage, 'view count', fatal=False)
- if view_count:
- view_count = str_to_int(view_count)
- comment_count = self._html_search_regex(
- r'<div id="commentBar">([\d,\.]+) Comments</div>', webpage, 'comment count', fatal=False)
- if comment_count:
- comment_count = str_to_int(comment_count)
-
- player_quality_option = json.loads(self._html_search_regex(
- r'playerQualityOption = ({.+?});', webpage, 'player quality option'))
-
- QUALITIES = ['3gp', 'mp4_normal', 'mp4_high', 'flv', 'mp4_ultra', 'mp4_720', 'mp4_1080']
- formats = [
- {
- 'url': furl,
+ r'<span class="bold">Runtime:</span> ([^<]+)</p>',
+ webpage, 'duration', fatal=False))
+ view_count = str_to_int(self._html_search_regex(
+ r'<span class="bold">Views:</span> ([\d,\.]+)</p>',
+ webpage, 'view count', fatal=False))
+ comment_count = str_to_int(self._html_search_regex(
+ r'<div id="commentBar">([\d,\.]+) Comments</div>',
+ webpage, 'comment count', fatal=False))
+
+ formats = []
+ for format_id, video_url in re.findall(
+ r'flashvars\.quality_(.+?)\s*=\s*"([^"]+)"', webpage):
+ fmt = {
+ 'url': compat_urllib_parse.unquote(video_url),
'format_id': format_id,
- 'preference': QUALITIES.index(format_id) if format_id in QUALITIES else -1,
- } for format_id, furl in player_quality_option.items()
- ]
+ }
+ m = re.search(r'^(?P<height>\d+)[pP]', format_id)
+ if m:
+ fmt['height'] = int(m.group('height'))
+ formats.append(fmt)
+
+ if not formats:
+ video_url = compat_urllib_parse.unquote(self._search_regex(
+ r'flashvars\.video_url\s*=\s*"([^"]+)"',
+ webpage, 'video URL'))
+ formats.append({'url': video_url})
+
self._sort_formats(formats)
return {
@@ -85,6 +95,7 @@ class XTubeUserIE(InfoExtractor):
'url': 'http://www.xtube.com/community/profile.php?user=greenshowers',
'info_dict': {
'id': 'greenshowers',
+ 'age_limit': 18,
},
'playlist_mincount': 155,
}
@@ -114,6 +125,7 @@ class XTubeUserIE(InfoExtractor):
return {
'_type': 'playlist',
'id': username,
+ 'age_limit': 18,
'entries': [{
'_type': 'url',
'url': eurl,
diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py
index 7e0044824..2a45dc574 100644
--- a/youtube_dl/extractor/xvideos.py
+++ b/youtube_dl/extractor/xvideos.py
@@ -3,15 +3,17 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
- ExtractorError,
+)
+from ..utils import (
clean_html,
+ ExtractorError,
)
class XVideosIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
+ _VALID_URL = r'https?://(?:www\.)?xvideos\.com/video(?P<id>[0-9]+)(?:.*)'
_TEST = {
'url': 'http://www.xvideos.com/video4588838/biker_takes_his_girl',
'md5': '4b46ae6ea5e6e9086e714d883313c0c9',
@@ -24,37 +26,25 @@ class XVideosIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1)
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- self.report_extraction(video_id)
-
mobj = re.search(r'<h1 class="inlineError">(.+?)</h1>', webpage)
if mobj:
raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), expected=True)
- # Extract video URL
video_url = compat_urllib_parse.unquote(
self._search_regex(r'flv_url=(.+?)&', webpage, 'video URL'))
-
- # Extract title
video_title = self._html_search_regex(
r'<title>(.*?)\s+-\s+XVID', webpage, 'title')
-
- # Extract video thumbnail
video_thumbnail = self._search_regex(
r'url_bigthumb=(.+?)&amp', webpage, 'thumbnail', fatal=False)
return {
'id': video_id,
'url': video_url,
- 'uploader': None,
- 'upload_date': None,
'title': video_title,
'ext': 'flv',
'thumbnail': video_thumbnail,
- 'description': None,
'age_limit': 18,
}
diff --git a/youtube_dl/extractor/xxxymovies.py b/youtube_dl/extractor/xxxymovies.py
new file mode 100644
index 000000000..5c8f17eb2
--- /dev/null
+++ b/youtube_dl/extractor/xxxymovies.py
@@ -0,0 +1,81 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ int_or_none,
+)
+
+
+class XXXYMoviesIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?xxxymovies\.com/videos/(?P<id>\d+)/(?P<display_id>[^/]+)'
+ _TEST = {
+ 'url': 'http://xxxymovies.com/videos/138669/ecstatic-orgasm-sofcore/',
+ 'md5': '810b1bdbbffff89dd13bdb369fe7be4b',
+ 'info_dict': {
+ 'id': '138669',
+ 'display_id': 'ecstatic-orgasm-sofcore',
+ 'ext': 'mp4',
+ 'title': 'Ecstatic Orgasm Sofcore',
+ 'duration': 931,
+ 'categories': list,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'age_limit': 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
+
+ webpage = self._download_webpage(url, display_id)
+
+ video_url = self._search_regex(
+ r"video_url\s*:\s*'([^']+)'", webpage, 'video URL')
+
+ title = self._html_search_regex(
+ [r'<div class="block_header">\s*<h1>([^<]+)</h1>',
+ r'<title>(.*?)\s*-\s*XXXYMovies\.com</title>'],
+ webpage, 'title')
+
+ thumbnail = self._search_regex(
+ r"preview_url\s*:\s*'([^']+)'",
+ webpage, 'thumbnail', fatal=False)
+
+ categories = self._html_search_meta(
+ 'keywords', webpage, 'categories', default='').split(',')
+
+ duration = parse_duration(self._search_regex(
+ r'<span>Duration:</span>\s*(\d+:\d+)',
+ webpage, 'duration', fatal=False))
+
+ view_count = int_or_none(self._html_search_regex(
+ r'<div class="video_views">\s*(\d+)',
+ webpage, 'view count', fatal=False))
+ like_count = int_or_none(self._search_regex(
+ r'>\s*Likes? <b>\((\d+)\)',
+ webpage, 'like count', fatal=False))
+ dislike_count = int_or_none(self._search_regex(
+ r'>\s*Dislike <b>\((\d+)\)</b>',
+ webpage, 'dislike count', fatal=False))
+
+ age_limit = self._rta_search(webpage)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'url': video_url,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'categories': categories,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'like_count': like_count,
+ 'dislike_count': dislike_count,
+ 'age_limit': age_limit,
+ }
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index 0fdb12243..f8e7041a0 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -6,11 +6,14 @@ import json
import re
from .common import InfoExtractor, SearchInfoExtractor
-from ..utils import (
- ExtractorError,
+from ..compat import (
compat_urllib_parse,
compat_urlparse,
+)
+from ..utils import (
clean_html,
+ unescapeHTML,
+ ExtractorError,
int_or_none,
)
@@ -53,14 +56,14 @@ class YahooIE(InfoExtractor):
}
},
{
- 'url': 'https://tw.screen.yahoo.com/taipei-opinion-poll/選情站報-街頭民調-台北市篇-102823042.html',
- 'md5': '92a7fdd8a08783c68a174d7aa067dde8',
+ 'url': 'https://tw.screen.yahoo.com/election-2014-askmayor/敢問市長-黃秀霜批賴清德-非常高傲-033009720.html',
+ 'md5': '3a09cf59349cfaddae1797acc3c087fc',
'info_dict': {
- 'id': '7a23b569-7bea-36cb-85b9-bd5301a0a1fb',
+ 'id': 'cac903b3-fcf4-3c14-b632-643ab541712f',
'ext': 'mp4',
- 'title': '選情站報 街頭民調 台北市篇',
- 'description': '選情站報 街頭民調 台北市篇',
- 'duration': 429,
+ 'title': '敢問市長/黃秀霜批賴清德「非常高傲」',
+ 'description': '直言台南沒捷運 交通居五都之末',
+ 'duration': 396,
}
},
{
@@ -85,14 +88,14 @@ class YahooIE(InfoExtractor):
'duration': 121,
}
}, {
- 'url': 'https://ca.finance.yahoo.com/news/20-most-valuable-brands-world-112600775.html',
- 'md5': '3e401e4eed6325aa29d9b96125fd5b4f',
+ 'url': 'https://ca.finance.yahoo.com/news/hackers-sony-more-trouble-well-154609075.html',
+ 'md5': '226a895aae7e21b0129e2a2006fe9690',
'info_dict': {
- 'id': 'c1b4c09c-8ed8-3b65-8b05-169c55358a83',
+ 'id': 'e624c4bc-3389-34de-9dfc-025f74943409',
'ext': 'mp4',
- 'title': "Apple Is The World's Most Valuable Brand",
- 'description': 'md5:73eabc1a11c6f59752593b2ceefa1262',
- 'duration': 21,
+ 'title': '\'The Interview\' TV Spot: War',
+ 'description': 'The Interview',
+ 'duration': 30,
}
}, {
'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
@@ -115,6 +118,16 @@ class YahooIE(InfoExtractor):
'duration': 201,
}
}, {
+ 'url': 'https://www.yahoo.com/movies/v/true-story-trailer-173000497.html',
+ 'md5': '989396ae73d20c6f057746fb226aa215',
+ 'info_dict': {
+ 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1',
+ 'ext': 'mp4',
+ 'title': '\'True Story\' Trailer',
+ 'description': 'True Story',
+ 'duration': 150,
+ },
+ }, {
'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html',
'only_matching': True,
}
@@ -123,6 +136,7 @@ class YahooIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('display_id')
+ page_id = mobj.group('id')
url = mobj.group('url')
host = mobj.group('host')
webpage = self._download_webpage(url, display_id)
@@ -147,6 +161,7 @@ class YahooIE(InfoExtractor):
r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"',
r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"',
r'"first_videoid"\s*:\s*"([^"]+)"',
+ r'%s[^}]*"ccm_id"\s*:\s*"([^"]+)"' % re.escape(page_id),
]
video_id = self._search_regex(CONTENT_ID_REGEXES, webpage, 'content ID')
else:
@@ -161,17 +176,15 @@ class YahooIE(InfoExtractor):
region = self._search_regex(
r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"',
webpage, 'region', fatal=False, default='US')
- query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"'
- ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="%s"'
- ' AND protocol="http"' % (video_id, region))
data = compat_urllib_parse.urlencode({
- 'q': query,
- 'env': 'prod',
- 'format': 'json',
+ 'protocol': 'http',
+ 'region': region,
})
+ query_url = (
+ 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/'
+ '{id}?{data}'.format(id=video_id, data=data))
query_result = self._download_json(
- 'http://video.query.yahoo.com/v1/public/yql?' + data,
- display_id, 'Downloading video info')
+ query_url, display_id, 'Downloading video info')
info = query_result['query']['results']['mediaObj'][0]
meta = info.get('meta')
@@ -209,7 +222,7 @@ class YahooIE(InfoExtractor):
return {
'id': video_id,
'display_id': display_id,
- 'title': meta['title'],
+ 'title': unescapeHTML(meta['title']),
'formats': formats,
'description': clean_html(meta['description']),
'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage),
diff --git a/youtube_dl/extractor/yesjapan.py b/youtube_dl/extractor/yesjapan.py
new file mode 100644
index 000000000..112a6c030
--- /dev/null
+++ b/youtube_dl/extractor/yesjapan.py
@@ -0,0 +1,62 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ HEADRequest,
+ get_element_by_attribute,
+ parse_iso8601,
+)
+
+
+class YesJapanIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?yesjapan\.com/video/(?P<slug>[A-Za-z0-9\-]*)_(?P<id>[A-Za-z0-9]+)\.html'
+ _TEST = {
+ 'url': 'http://www.yesjapan.com/video/japanese-in-5-20-wa-and-ga-particle-usages_726497834.html',
+ 'md5': 'f0be416314e5be21a12b499b330c21cf',
+ 'info_dict': {
+ 'id': '726497834',
+ 'title': 'Japanese in 5! #20 - WA And GA Particle Usages',
+ 'description': 'This should clear up some issues most students of Japanese encounter with WA and GA....',
+ 'ext': 'mp4',
+ 'timestamp': 1416391590,
+ 'upload_date': '20141119',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+ title = self._og_search_title(webpage)
+ video_url = self._og_search_video_url(webpage)
+ description = self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ timestamp = None
+ submit_info = get_element_by_attribute('class', 'pm-submit-data', webpage)
+ if submit_info:
+ timestamp = parse_iso8601(self._search_regex(
+ r'datetime="([^"]+)"', submit_info, 'upload date', fatal=False, default=None))
+
+ # attempt to resolve the final URL in order to get a proper extension
+ redirect_req = HEADRequest(video_url)
+ req = self._request_webpage(
+ redirect_req, video_id, note='Resolving final URL', errnote='Could not resolve final URL', fatal=False)
+ if req:
+ video_url = req.geturl()
+
+ formats = [{
+ 'format_id': 'sd',
+ 'url': video_url,
+ }]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'description': description,
+ 'timestamp': timestamp,
+ 'thumbnail': thumbnail,
+ }
diff --git a/youtube_dl/extractor/ynet.py b/youtube_dl/extractor/ynet.py
index 7b621a9e3..894678a23 100644
--- a/youtube_dl/extractor/ynet.py
+++ b/youtube_dl/extractor/ynet.py
@@ -5,7 +5,7 @@ import re
import json
from .common import InfoExtractor
-from ..utils import compat_urllib_parse
+from ..compat import compat_urllib_parse
class YnetIE(InfoExtractor):
diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py
index 8123928be..107c9ac36 100644
--- a/youtube_dl/extractor/youporn.py
+++ b/youtube_dl/extractor/youporn.py
@@ -6,10 +6,11 @@ import re
import sys
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse_urlparse,
compat_urllib_request,
-
+)
+from ..utils import (
ExtractorError,
unescapeHTML,
unified_strdate,
@@ -45,7 +46,9 @@ class YouPornIE(InfoExtractor):
age_limit = self._rta_search(webpage)
# Get JSON parameters
- json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, 'JSON parameters')
+ json_params = self._search_regex(
+ r'var currentVideo = new Video\((.*)\)[,;]',
+ webpage, 'JSON parameters')
try:
params = json.loads(json_params)
except:
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 1cba40387..bc18276d6 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -14,23 +14,24 @@ from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..jsinterp import JSInterpreter
from ..swfinterp import SWFInterpreter
-from ..utils import (
+from ..compat import (
compat_chr,
compat_parse_qs,
compat_urllib_parse,
compat_urllib_request,
compat_urlparse,
compat_str,
-
+)
+from ..utils import (
clean_html,
- get_element_by_id,
- get_element_by_attribute,
ExtractorError,
+ get_element_by_attribute,
+ get_element_by_id,
int_or_none,
OnDemandPagedList,
+ orderedSet,
unescapeHTML,
unified_strdate,
- orderedSet,
uppercase_escape,
)
@@ -44,9 +45,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
_LOGIN_REQUIRED = False
def _set_language(self):
- self._set_cookie('.youtube.com', 'PREF', 'f1=50000000&hl=en',
+ self._set_cookie(
+ '.youtube.com', 'PREF', 'f1=50000000&hl=en',
# YouTube sets the expire time to about two months
- expire_time=time.time() + 60*24*3600)
+ expire_time=time.time() + 2 * 30 * 24 * 3600)
def _login(self):
"""
@@ -254,7 +256,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
- '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
@@ -262,9 +264,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
# Dash mp4 audio
- '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
- '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
- '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
+ '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50},
+ '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50},
+ '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50},
# Dash webm
'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
@@ -285,7 +287,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
'303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
+ '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
'313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
+ '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
# Dash webm audio
'171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
@@ -410,12 +414,57 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'id': 'HtVdAasjOgU',
'ext': 'mp4',
'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
- 'description': 'md5:eca57043abae25130f58f655ad9a7771',
+ 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
'uploader': 'The Witcher',
'uploader_id': 'WitcherGame',
'upload_date': '20140605',
},
},
+ # Age-gate video with encrypted signature
+ {
+ 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
+ 'info_dict': {
+ 'id': '6kLq3WMV1nU',
+ 'ext': 'mp4',
+ 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
+ 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
+ 'uploader': 'LloydVEVO',
+ 'uploader_id': 'LloydVEVO',
+ 'upload_date': '20110629',
+ },
+ },
+ # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
+ {
+ 'url': '__2ABJjxzNo',
+ 'info_dict': {
+ 'id': '__2ABJjxzNo',
+ 'ext': 'mp4',
+ 'upload_date': '20100430',
+ 'uploader_id': 'deadmau5',
+ 'description': 'md5:12c56784b8032162bb936a5f76d55360',
+ 'uploader': 'deadmau5',
+ 'title': 'Deadmau5 - Some Chords (HD)',
+ },
+ 'expected_warnings': [
+ 'DASH manifest missing',
+ ]
+ },
+ # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
+ {
+ 'url': 'lqQg6PlCWgI',
+ 'info_dict': {
+ 'id': 'lqQg6PlCWgI',
+ 'ext': 'mp4',
+ 'upload_date': '20120731',
+ 'uploader_id': 'olympic',
+ 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
+ 'uploader': 'Olympics',
+ 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
+ },
+ 'params': {
+ 'skip_download': 'requires avconv',
+ }
+ },
]
def __init__(self, *args, **kwargs):
@@ -444,7 +493,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
def _extract_signature_function(self, video_id, player_url, example_sig):
id_m = re.match(
- r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
+ r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
player_url)
if not id_m:
raise ExtractorError('Cannot identify player %r' % player_url)
@@ -493,8 +542,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
return 's[%s%s%s]' % (starts, ends, steps)
step = None
- start = '(Never used)' # Quelch pyflakes warnings - start will be
- # set as soon as step is set
+ # Quelch pyflakes warnings - start will be set when step is set
+ start = '(Never used)'
for i, prev in zip(idxs[1:], idxs[:-1]):
if step is not None:
if i - prev == step:
@@ -565,24 +614,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
def _get_available_subtitles(self, video_id, webpage):
try:
- sub_list = self._download_webpage(
+ subs_doc = self._download_xml(
'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
video_id, note=False)
except ExtractorError as err:
self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
return {}
- lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
sub_lang_list = {}
- for l in lang_list:
- lang = l[1]
+ for track in subs_doc.findall('track'):
+ lang = track.attrib['lang_code']
if lang in sub_lang_list:
continue
params = compat_urllib_parse.urlencode({
'lang': lang,
'v': video_id,
'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
- 'name': unescapeHTML(l[0]).encode('utf-8'),
+ 'name': track.attrib['name'].encode('utf-8'),
})
url = 'https://www.youtube.com/api/timedtext?' + params
sub_lang_list[lang] = url
@@ -615,10 +663,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
list_url = caption_url + '&' + list_params
caption_list = self._download_xml(list_url, video_id)
original_lang_node = caption_list.find('track')
- if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
+ if original_lang_node is None:
self._downloader.report_warning('Video doesn\'t have automatic captions')
return {}
original_lang = original_lang_node.attrib['lang_code']
+ caption_kind = original_lang_node.attrib.get('kind', '')
sub_lang_list = {}
for lang_node in caption_list.findall('target'):
@@ -628,7 +677,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'tlang': sub_lang,
'fmt': sub_format,
'ts': timestamp,
- 'kind': 'asr',
+ 'kind': caption_kind,
})
sub_lang_list[sub_lang] = caption_url + '&' + params
return sub_lang_list
@@ -665,6 +714,47 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
+ def _parse_dash_manifest(
+ self, video_id, dash_manifest_url, player_url, age_gate):
+ def decrypt_sig(mobj):
+ s = mobj.group(1)
+ dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
+ return '/signature/%s' % dec_s
+ dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
+ dash_doc = self._download_xml(
+ dash_manifest_url, video_id,
+ note='Downloading DASH manifest',
+ errnote='Could not download DASH manifest')
+
+ formats = []
+ for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
+ url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
+ if url_el is None:
+ continue
+ format_id = r.attrib['id']
+ video_url = url_el.text
+ filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
+ f = {
+ 'format_id': format_id,
+ 'url': video_url,
+ 'width': int_or_none(r.attrib.get('width')),
+ 'height': int_or_none(r.attrib.get('height')),
+ 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
+ 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
+ 'filesize': filesize,
+ 'fps': int_or_none(r.attrib.get('frameRate')),
+ }
+ try:
+ existing_format = next(
+ fo for fo in formats
+ if fo['format_id'] == format_id)
+ except StopIteration:
+ f.update(self._formats.get(format_id, {}).items())
+ formats.append(f)
+ else:
+ existing_format.update(f)
+ return formats
+
def _real_extract(self, url):
proto = (
'http' if self._downloader.params.get('prefer_insecure', False)
@@ -692,11 +782,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
age_gate = True
# We simulate the access to the video from www.youtube.com/v/{video_id}
# this can be viewed without login into Youtube
+ url = proto + '://www.youtube.com/embed/%s' % video_id
+ embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
data = compat_urllib_parse.urlencode({
'video_id': video_id,
'eurl': 'https://youtube.googleapis.com/v/' + video_id,
'sts': self._search_regex(
- r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
+ r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
})
video_info_url = proto + '://www.youtube.com/get_video_info?' + data
video_info_webpage = self._download_webpage(
@@ -722,9 +814,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# We fallback to the get_video_info pages (used by the embed page)
self.report_video_info_webpage_download(video_id)
for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
- video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
- % (video_id, el_type))
- video_info_webpage = self._download_webpage(video_info_url,
+ video_info_url = (
+ '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
+ % (proto, video_id, el_type))
+ video_info_webpage = self._download_webpage(
+ video_info_url,
video_id, note=False,
errnote='unable to download video info webpage')
video_info = compat_parse_qs(video_info_webpage)
@@ -797,7 +891,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
m_cat_container = self._search_regex(
r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
- video_webpage, 'categories', fatal=False)
+ video_webpage, 'categories', default=None)
if m_cat_container:
category = self._html_search_regex(
r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
@@ -875,7 +969,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'url': video_info['conn'][0],
'player_url': player_url,
}]
- elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
+ elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
if 'rtmpe%3Dyes' in encoded_url_map:
raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
@@ -892,11 +986,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
elif 's' in url_data:
encrypted_sig = url_data['s'][0]
- if not age_gate:
- jsplayer_url_json = self._search_regex(
- r'"assets":.+?"js":\s*("[^"]+")',
- video_webpage, 'JS player URL')
- player_url = json.loads(jsplayer_url_json)
+ jsplayer_url_json = self._search_regex(
+ r'"assets":.+?"js":\s*("[^"]+")',
+ embed_webpage if age_gate else video_webpage, 'JS player URL')
+ player_url = json.loads(jsplayer_url_json)
if player_url is None:
player_url_json = self._search_regex(
r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
@@ -940,51 +1033,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# Look for the DASH manifest
if self._downloader.params.get('youtube_include_dash_manifest', True):
- try:
- # The DASH manifest used needs to be the one from the original video_webpage.
- # The one found in get_video_info seems to be using different signatures.
- # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
- # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
- # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
- dash_manifest_url = video_info.get('dashmpd')[0]
-
- def decrypt_sig(mobj):
- s = mobj.group(1)
- dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
- return '/signature/%s' % dec_s
- dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
- dash_doc = self._download_xml(
- dash_manifest_url, video_id,
- note='Downloading DASH manifest',
- errnote='Could not download DASH manifest')
- for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
- url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
- if url_el is None:
- continue
- format_id = r.attrib['id']
- video_url = url_el.text
- filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
- f = {
- 'format_id': format_id,
- 'url': video_url,
- 'width': int_or_none(r.attrib.get('width')),
- 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
- 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
- 'filesize': filesize,
- 'fps': int_or_none(r.attrib.get('frameRate')),
- }
- try:
- existing_format = next(
- fo for fo in formats
- if fo['format_id'] == format_id)
- except StopIteration:
- f.update(self._formats.get(format_id, {}))
- formats.append(f)
- else:
- existing_format.update(f)
-
- except (ExtractorError, KeyError) as e:
- self.report_warning('Skipping DASH manifest: %r' % e, video_id)
+ dash_mpd = video_info.get('dashmpd')
+ if dash_mpd:
+ dash_manifest_url = dash_mpd[0]
+ try:
+ dash_formats = self._parse_dash_manifest(
+ video_id, dash_manifest_url, player_url, age_gate)
+ except (ExtractorError, KeyError) as e:
+ self.report_warning(
+ 'Skipping DASH manifest: %r' % e, video_id)
+ else:
+ # Hide the formats we found through non-DASH
+ dash_keys = set(df['format_id'] for df in dash_formats)
+ for f in formats:
+ if f['format_id'] in dash_keys:
+ f['format_id'] = 'nondash-%s' % f['format_id']
+ f['preference'] = f.get('preference', 0) - 10000
+ formats.extend(dash_formats)
self._sort_formats(formats)
@@ -1030,7 +1095,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
)"""
_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
- _MORE_PAGES_INDICATOR = r'data-link-type="next"'
_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
IE_NAME = 'youtube:playlist'
_TESTS = [{
@@ -1087,6 +1151,13 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
'info_dict': {
'title': 'JODA7',
}
+ }, {
+ 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
+ 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
+ 'info_dict': {
+ 'title': 'Uploads from Interstellar Movie',
+ },
+ 'playlist_mincout': 21,
}]
def _real_initialize(self):
@@ -1137,9 +1208,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
if playlist_id.startswith('RD'):
# Mixes require a custom extraction process
return self._extract_mix(playlist_id)
- if playlist_id.startswith('TL'):
- raise ExtractorError('For downloading YouTube.com top lists, use '
- 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
url = self._TEMPLATE_URL % playlist_id
page = self._download_webpage(url, playlist_id)
@@ -1171,6 +1239,10 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
'Downloading page #%s' % page_num,
transform_source=uppercase_escape)
content_html = more['content_html']
+ if not content_html.strip():
+ # Some webpages show a "Load more" button but they don't
+ # have more videos
+ break
more_widget_html = more['load_more_widget_html']
playlist_title = self._html_search_regex(
@@ -1181,54 +1253,9 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
return self.playlist_result(url_results, playlist_id, playlist_title)
-class YoutubeTopListIE(YoutubePlaylistIE):
- IE_NAME = 'youtube:toplist'
- IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
- ' (Example: "yttoplist:music:Top Tracks")')
- _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
- _TESTS = [{
- 'url': 'yttoplist:music:Trending',
- 'playlist_mincount': 5,
- 'skip': 'Only works for logged-in users',
- }]
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- channel = mobj.group('chann')
- title = mobj.group('title')
- query = compat_urllib_parse.urlencode({'title': title})
- channel_page = self._download_webpage(
- 'https://www.youtube.com/%s' % channel, title)
- link = self._html_search_regex(
- r'''(?x)
- <a\s+href="([^"]+)".*?>\s*
- <span\s+class="branded-page-module-title-text">\s*
- <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
- channel_page, 'list')
- url = compat_urlparse.urljoin('https://www.youtube.com/', link)
-
- video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
- ids = []
- # sometimes the webpage doesn't contain the videos
- # retry until we get them
- for i in itertools.count(0):
- msg = 'Downloading Youtube mix'
- if i > 0:
- msg += ', retry #%d' % i
-
- webpage = self._download_webpage(url, title, msg)
- ids = orderedSet(re.findall(video_re, webpage))
- if ids:
- break
- url_results = self._ids_to_results(ids)
- return self.playlist_result(url_results, playlist_title=title)
-
-
class YoutubeChannelIE(InfoExtractor):
IE_DESC = 'YouTube.com channels'
- _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
- _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
- _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
+ _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
IE_NAME = 'youtube:channel'
_TESTS = [{
'note': 'paginated channel',
@@ -1244,13 +1271,8 @@ class YoutubeChannelIE(InfoExtractor):
return ids_in_page
def _real_extract(self, url):
- # Extract channel id
- mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError('Invalid URL: %s' % url)
+ channel_id = self._match_id(url)
- # Download channel page
- channel_id = mobj.group(1)
video_ids = []
url = 'https://www.youtube.com/channel/%s/videos' % channel_id
channel_page = self._download_webpage(url, channel_id)
@@ -1264,30 +1286,39 @@ class YoutubeChannelIE(InfoExtractor):
# The videos are contained in a single page
# the ajax pages can't be used, they are empty
video_ids = self.extract_videos_from_page(channel_page)
- else:
- # Download all channel pages using the json-based channel_ajax query
+ entries = [
+ self.url_result(video_id, 'Youtube', video_id=video_id)
+ for video_id in video_ids]
+ return self.playlist_result(entries, channel_id)
+
+ def _entries():
+ more_widget_html = content_html = channel_page
for pagenum in itertools.count(1):
- url = self._MORE_PAGES_URL % (pagenum, channel_id)
- page = self._download_json(
- url, channel_id, note='Downloading page #%s' % pagenum,
- transform_source=uppercase_escape)
- ids_in_page = self.extract_videos_from_page(page['content_html'])
- video_ids.extend(ids_in_page)
+ ids_in_page = self.extract_videos_from_page(content_html)
+ for video_id in ids_in_page:
+ yield self.url_result(
+ video_id, 'Youtube', video_id=video_id)
- if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
+ mobj = re.search(
+ r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
+ more_widget_html)
+ if not mobj:
break
- self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
+ more = self._download_json(
+ 'https://youtube.com/%s' % mobj.group('more'), channel_id,
+ 'Downloading page #%s' % (pagenum + 1),
+ transform_source=uppercase_escape)
+ content_html = more['content_html']
+ more_widget_html = more['load_more_widget_html']
- url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
- for video_id in video_ids]
- return self.playlist_result(url_entries, channel_id)
+ return self.playlist_result(_entries(), channel_id)
class YoutubeUserIE(InfoExtractor):
IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
- _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
+ _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
_TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
_GDATA_PAGE_SIZE = 50
_GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
@@ -1315,12 +1346,7 @@ class YoutubeUserIE(InfoExtractor):
return super(YoutubeUserIE, cls).suitable(url)
def _real_extract(self, url):
- # Extract username
- mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError('Invalid URL: %s' % url)
-
- username = mobj.group(1)
+ username = self._match_id(url)
# Download video ids using YouTube Data API. Result size per
# query is limited (currently to 50 videos) so we need to query
@@ -1517,9 +1543,11 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
feed_entries = []
paging = 0
for i in itertools.count(1):
- info = self._download_json(self._FEED_TEMPLATE % paging,
- '%s feed' % self._FEED_NAME,
- 'Downloading page %s' % i)
+ info = self._download_json(
+ self._FEED_TEMPLATE % paging,
+ '%s feed' % self._FEED_NAME,
+ 'Downloading page %s' % i,
+ transform_source=uppercase_escape)
feed_html = info.get('feed_html') or info.get('content_html')
load_more_widget_html = info.get('load_more_widget_html') or feed_html
m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
@@ -1636,3 +1664,20 @@ class YoutubeTruncatedURLIE(InfoExtractor):
'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
' or simply youtube-dl BaW_jenozKc .',
expected=True)
+
+
+class YoutubeTruncatedIDIE(InfoExtractor):
+    """Catch watch URLs whose ``v=`` value is only 1-10 ID characters long.
+
+    Such URLs are never extracted; _real_extract always raises an expected
+    ExtractorError telling the user the URL looks truncated.
+    """
+    IE_NAME = 'youtube:truncated_id'
+    IE_DESC = False  # Do not list
+    # "www." is optional so that bare youtube.com URLs with a cut-off ID
+    # are reported as truncated as well.
+    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'
+
+    _TESTS = [{
+        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Raise an expected ExtractorError naming the short ID and the URL."""
+        video_id = self._match_id(url)
+        raise ExtractorError(
+            'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
+            expected=True)
diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py
index 9ff00e26c..98f15177b 100644
--- a/youtube_dl/extractor/zdf.py
+++ b/youtube_dl/extractor/zdf.py
@@ -1,12 +1,14 @@
# coding: utf-8
from __future__ import unicode_literals
+import functools
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
unified_strdate,
+ OnDemandPagedList,
)
@@ -87,7 +89,7 @@ def extract_from_xml_url(ie, video_id, xml_url):
class ZDFIE(InfoExtractor):
- _VALID_URL = r'^https?://www\.zdf\.de/ZDFmediathek(?P<hash>#)?/(.*beitrag/(?:video/)?)(?P<id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?'
+ _VALID_URL = r'(?:zdf:|zdf:video:|https?://www\.zdf\.de/ZDFmediathek(?:#)?/(.*beitrag/(?:video/)?))(?P<id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?'
_TEST = {
'url': 'http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt',
@@ -106,6 +108,52 @@ class ZDFIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
-
xml_url = 'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
return extract_from_xml_url(self, video_id, xml_url)
+
+
+class ZDFChannelIE(InfoExtractor):
+    """Playlist extractor for ZDF Mediathek channel overviews.
+
+    Matches ``zdf:topic:<id>`` and ``.../kanaluebersicht/<id>`` URLs and
+    hands the channel's teasers to an OnDemandPagedList, _PAGE_SIZE
+    entries per request.
+    """
+    _VALID_URL = r'(?:zdf:topic:|https?://www\.zdf\.de/ZDFmediathek(?:#)?/.*kanaluebersicht/)(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.zdf.de/ZDFmediathek#/kanaluebersicht/1586442/sendung/Titanic',
+        'info_dict': {
+            'id': '1586442',
+        },
+        'playlist_count': 3,
+    }
+    _PAGE_SIZE = 50
+
+    def _fetch_page(self, channel_id, page):
+        """Yield one url-type entry per video/topic teaser on page ``page``."""
+        query_url = (
+            'http://www.zdf.de/ZDFmediathek/xmlservice/web/aktuellste?ak=web&offset=%d&maxLength=%d&id=%s'
+            % (page * self._PAGE_SIZE, self._PAGE_SIZE, channel_id))
+        channel_doc = self._download_xml(
+            query_url, channel_id,
+            note='Downloading channel info',
+            errnote='Failed to download channel info')
+
+        playlist_title = channel_doc.find('.//information/title').text
+        playlist_description = channel_doc.find('.//information/detail').text
+        for teaser in channel_doc.findall('.//teasers/teaser'):
+            teaser_type = teaser.find('./type').text
+            asset_id = teaser.find('./details/assetId').text
+            # Only these two types are turned into zdf:<type>:<id> URLs
+            if teaser_type in ('video', 'topic'):
+                yield {
+                    '_type': 'url',
+                    'playlist_title': playlist_title,
+                    'playlist_description': playlist_description,
+                    'url': 'zdf:%s:%s' % (teaser_type, asset_id),
+                }
+
+    def _real_extract(self, url):
+        """Return a playlist dict whose entries are paged through _fetch_page."""
+        channel_id = self._match_id(url)
+        page_fetcher = functools.partial(self._fetch_page, channel_id)
+
+        return {
+            '_type': 'playlist',
+            'id': channel_id,
+            'entries': OnDemandPagedList(page_fetcher, self._PAGE_SIZE),
+        }
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index 2e8c71508..14006178d 100644
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -109,7 +109,7 @@ def parseOpts(overrideArguments=None):
kw = {
'version': __version__,
'formatter': fmt,
- 'usage': '%prog [options] url [url...]',
+ 'usage': '%prog [OPTIONS] URL [URL...]',
'conflict_handler': 'resolve',
}
@@ -163,7 +163,10 @@ def parseOpts(overrideArguments=None):
general.add_option(
'--ignore-config',
action='store_true',
- help='Do not read configuration files. When given in the global configuration file /etc/youtube-dl.conf: do not read the user configuration in ~/.config/youtube-dl.conf (%APPDATA%/youtube-dl/config.txt on Windows)')
+ help='Do not read configuration files. '
+ 'When given in the global configuration file /etc/youtube-dl.conf: '
+ 'Do not read the user configuration in ~/.config/youtube-dl/config '
+ '(%APPDATA%/youtube-dl/config.txt on Windows)')
general.add_option(
'--flat-playlist',
action='store_const', dest='extract_flat', const='in_playlist',
@@ -264,10 +267,12 @@ def parseOpts(overrideArguments=None):
action='store', dest='format', metavar='FORMAT', default=None,
help=(
'video format code, specify the order of preference using'
- ' slashes: -f 22/17/18 . -f mp4 , -f m4a and -f flv are also'
- ' supported. You can also use the special names "best",'
- ' "bestvideo", "bestaudio", "worst", "worstvideo" and'
- ' "worstaudio". By default, youtube-dl will pick the best quality.'
+ ' slashes, as in -f 22/17/18 . '
+ ' Instead of format codes, you can select by extension for the '
+ 'extensions aac, m4a, mp3, mp4, ogg, wav, webm. '
+ 'You can also use the special names "best",'
+ ' "bestvideo", "bestaudio", "worst". '
+ ' By default, youtube-dl will pick the best quality.'
' Use commas to download multiple audio formats, such as'
' -f 136/137/mp4/bestvideo,140/m4a/bestaudio.'
' You can merge the video and audio of two formats into a single'
@@ -297,6 +302,12 @@ def parseOpts(overrideArguments=None):
'--youtube-skip-dash-manifest',
action='store_false', dest='youtube_include_dash_manifest',
help='Do not download the DASH manifest on YouTube videos')
+ video_format.add_option(
+ '--merge-output-format',
+ action='store', dest='merge_output_format', metavar='FORMAT', default=None,
+ help=(
+ 'If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, webm, flv.'
+ 'Ignored if no merge is required'))
subtitles = optparse.OptionGroup(parser, 'Subtitle Options')
subtitles.add_option(
@@ -346,6 +357,10 @@ def parseOpts(overrideArguments=None):
'--test',
action='store_true', dest='test', default=False,
help=optparse.SUPPRESS_HELP)
+ downloader.add_option(
+ '--playlist-reverse',
+ action='store_true',
+ help='Download playlist videos in reverse order')
workarounds = optparse.OptionGroup(parser, 'Workarounds')
workarounds.add_option(
@@ -437,6 +452,11 @@ def parseOpts(overrideArguments=None):
action='store_true', dest='dump_single_json', default=False,
help='simulate, quiet but print JSON information for each command-line argument. If the URL refers to a playlist, dump the whole playlist information in a single line.')
verbosity.add_option(
+ '--print-json',
+ action='store_true', dest='print_json', default=False,
+ help='Be quiet and print the video information as JSON (video is still being downloaded).',
+ )
+ verbosity.add_option(
'--newline',
action='store_true', dest='progress_with_newline', default=False,
help='output progress bar as new lines')
@@ -478,10 +498,6 @@ def parseOpts(overrideArguments=None):
'--id', default=False,
action='store_true', dest='useid', help='use only video ID in file name')
filesystem.add_option(
- '-A', '--auto-number',
- action='store_true', dest='autonumber', default=False,
- help='number downloaded files starting from 00000')
- filesystem.add_option(
'-o', '--output',
dest='outtmpl', metavar='TEMPLATE',
help=('output filename template. Use %(title)s to get the title, '
@@ -509,6 +525,10 @@ def parseOpts(overrideArguments=None):
action='store_true', dest='restrictfilenames', default=False,
help='Restrict filenames to only ASCII characters, and avoid "&" and spaces in filenames')
filesystem.add_option(
+ '-A', '--auto-number',
+ action='store_true', dest='autonumber', default=False,
+ help='[deprecated; use -o "%(autonumber)s-%(title)s.%(ext)s" ] number downloaded files starting from 00000')
+ filesystem.add_option(
'-t', '--title',
action='store_true', dest='usetitle', default=False,
help='[deprecated] use title in file name (default)')
diff --git a/youtube_dl/postprocessor/__init__.py b/youtube_dl/postprocessor/__init__.py
index fb367ebe4..7f505b58e 100644
--- a/youtube_dl/postprocessor/__init__.py
+++ b/youtube_dl/postprocessor/__init__.py
@@ -8,11 +8,16 @@ from .ffmpeg import (
FFmpegExtractAudioPP,
FFmpegMergerPP,
FFmpegMetadataPP,
- FFmpegVideoConvertor,
+ FFmpegVideoConvertorPP,
)
from .xattrpp import XAttrMetadataPP
from .execafterdownload import ExecAfterDownloadPP
+
+def get_postprocessor(key):
+    """Return the module-level object named ``key + 'PP'``.
+
+    Raises KeyError when this module defines no such name.
+    """
+    pp_name = key + 'PP'
+    return globals()[pp_name]
+
+
__all__ = [
'AtomicParsleyPP',
'ExecAfterDownloadPP',
@@ -22,6 +27,6 @@ __all__ = [
'FFmpegMergerPP',
'FFmpegMetadataPP',
'FFmpegPostProcessor',
- 'FFmpegVideoConvertor',
+ 'FFmpegVideoConvertorPP',
'XAttrMetadataPP',
]
diff --git a/youtube_dl/postprocessor/execafterdownload.py b/youtube_dl/postprocessor/execafterdownload.py
index 09db43611..75c0f7bbe 100644
--- a/youtube_dl/postprocessor/execafterdownload.py
+++ b/youtube_dl/postprocessor/execafterdownload.py
@@ -14,7 +14,7 @@ class ExecAfterDownloadPP(PostProcessor):
def run(self, information):
cmd = self.exec_cmd
- if not '{}' in cmd:
+ if '{}' not in cmd:
cmd += ' {}'
cmd = cmd.replace('{}', shlex_quote(information['filepath']))
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index 9303b8378..d1b342c7a 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -37,11 +37,11 @@ class FFmpegPostProcessor(PostProcessor):
if not self._executable:
raise FFmpegPostProcessorError('ffmpeg or avconv not found. Please install one.')
- REQUIRED_VERSION = '1.0'
+ required_version = '10-0' if self._uses_avconv() else '1.0'
if is_outdated_version(
- self._versions[self._executable], REQUIRED_VERSION):
+ self._versions[self._executable], required_version):
warning = 'Your copy of %s is outdated, update %s to version %s or newer if you encounter any errors.' % (
- self._executable, self._executable, REQUIRED_VERSION)
+ self._executable, self._executable, required_version)
if self._downloader:
self._downloader.report_warning(warning)
@@ -80,8 +80,9 @@ class FFmpegPostProcessor(PostProcessor):
files_cmd = []
for path in input_paths:
- files_cmd.extend(['-i', encodeFilename(path, True)])
- cmd = ([self._executable, '-y'] + files_cmd
+ files_cmd.extend([encodeArgument('-i'), encodeFilename(path, True)])
+ cmd = ([encodeFilename(self._executable, True), encodeArgument('-y')] +
+ files_cmd
+ [encodeArgument(o) for o in opts] +
[encodeFilename(self._ffmpeg_filename_argument(out_path), True)])
@@ -122,8 +123,8 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
raise PostProcessingError('ffprobe or avprobe not found. Please install one.')
try:
cmd = [
- self._probe_executable,
- '-show_streams',
+ encodeFilename(self._probe_executable, True),
+ encodeArgument('-show_streams'),
encodeFilename(self._ffmpeg_filename_argument(path), True)]
handle = subprocess.Popen(cmd, stderr=compat_subprocess_get_DEVNULL(), stdout=subprocess.PIPE)
output = handle.communicate()[0]
@@ -236,9 +237,9 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
return self._nopostoverwrites, information
-class FFmpegVideoConvertor(FFmpegPostProcessor):
+class FFmpegVideoConvertorPP(FFmpegPostProcessor):
def __init__(self, downloader=None, preferedformat=None):
- super(FFmpegVideoConvertor, self).__init__(downloader)
+ super(FFmpegVideoConvertorPP, self).__init__(downloader)
self._preferedformat = preferedformat
def run(self, information):
@@ -520,7 +521,7 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
class FFmpegMergerPP(FFmpegPostProcessor):
def run(self, info):
filename = info['filepath']
- args = ['-c', 'copy', '-map', '0:v:0', '-map', '1:a:0', '-shortest']
+ args = ['-c', 'copy', '-map', '0:v:0', '-map', '1:a:0']
self._downloader.to_screen('[ffmpeg] Merging formats into "%s"' % filename)
self.run_ffmpeg_multiple_files(info['__files_to_merge'], filename, args)
return True, info
diff --git a/youtube_dl/swfinterp.py b/youtube_dl/swfinterp.py
index 2bd264b30..e60505ace 100644
--- a/youtube_dl/swfinterp.py
+++ b/youtube_dl/swfinterp.py
@@ -4,8 +4,8 @@ import collections
import io
import zlib
+from .compat import compat_str
from .utils import (
- compat_str,
ExtractorError,
struct_unpack,
)
diff --git a/youtube_dl/update.py b/youtube_dl/update.py
index 4c07a558e..3f9c5249d 100644
--- a/youtube_dl/update.py
+++ b/youtube_dl/update.py
@@ -13,6 +13,7 @@ from .compat import (
compat_str,
compat_urllib_request,
)
+from .utils import make_HTTPS_handler
from .version import __version__
@@ -58,9 +59,12 @@ def update_self(to_screen, verbose):
to_screen('It looks like you installed youtube-dl with a package manager, pip, setup.py or a tarball. Please use that to update.')
return
+ https_handler = make_HTTPS_handler(False)
+ opener = compat_urllib_request.build_opener(https_handler)
+
# Check if there is a new version
try:
- newversion = compat_urllib_request.urlopen(VERSION_URL).read().decode('utf-8').strip()
+ newversion = opener.open(VERSION_URL).read().decode('utf-8').strip()
except:
if verbose:
to_screen(compat_str(traceback.format_exc()))
@@ -72,14 +76,14 @@ def update_self(to_screen, verbose):
# Download and check versions info
try:
- versions_info = compat_urllib_request.urlopen(JSON_URL).read().decode('utf-8')
+ versions_info = opener.open(JSON_URL).read().decode('utf-8')
versions_info = json.loads(versions_info)
except:
if verbose:
to_screen(compat_str(traceback.format_exc()))
to_screen('ERROR: can\'t obtain versions info. Please try again later.')
return
- if not 'signature' in versions_info:
+ if 'signature' not in versions_info:
to_screen('ERROR: the versions file is not signed or corrupted. Aborting.')
return
signature = versions_info['signature']
@@ -120,7 +124,7 @@ def update_self(to_screen, verbose):
return
try:
- urlh = compat_urllib_request.urlopen(version['exe'][0])
+ urlh = opener.open(version['exe'][0])
newcontent = urlh.read()
urlh.close()
except (IOError, OSError):
@@ -166,7 +170,7 @@ start /b "" cmd /c del "%%~f0"&exit /b"
# Zip unix package
elif isinstance(globals().get('__loader__'), zipimporter):
try:
- urlh = compat_urllib_request.urlopen(version['bin'][0])
+ urlh = opener.open(version['bin'][0])
newcontent = urlh.read()
urlh.close()
except (IOError, OSError):
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 4d3cbac74..079e8d2c3 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -166,7 +166,7 @@ def xpath_text(node, xpath, name=None, fatal=False):
xpath = xpath.encode('ascii')
n = node.find(xpath)
- if n is None:
+ if n is None or n.text is None:
if fatal:
name = xpath if name is None else name
raise ExtractorError('Could not find XML element %s' % name)
@@ -205,6 +205,10 @@ def get_element_by_attribute(attribute, value, html):
def clean_html(html):
"""Clean an HTML snippet into a readable string"""
+
+ if html is None: # Convenience for sanitizing descriptions etc.
+ return html
+
# Newline vs <br />
html = html.replace('\n', ' ')
html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
@@ -363,7 +367,7 @@ def encodeArgument(s):
if not isinstance(s, compat_str):
# Legacy code that uses byte strings
# Uncomment the following line after fixing all post processors
- #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
+ # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
s = s.decode('ascii')
return encodeFilename(s, True)
@@ -388,6 +392,17 @@ def formatSeconds(secs):
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
+ if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
+ context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
+ if opts_no_check_certificate:
+ context.verify_mode = ssl.CERT_NONE
+ try:
+ return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
+ except TypeError:
+ # Python 2.7.8
+ # (create_default_context present but HTTPSHandler has no context=)
+ pass
+
if sys.version_info < (3, 2):
import httplib
@@ -409,22 +424,12 @@ def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
def https_open(self, req):
return self.do_open(HTTPSConnectionV3, req)
return HTTPSHandlerV3(**kwargs)
- elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
- context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
- context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
- if opts_no_check_certificate:
- context.verify_mode = ssl.CERT_NONE
- return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
else: # Python < 3.4
context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
context.verify_mode = (ssl.CERT_NONE
if opts_no_check_certificate
else ssl.CERT_REQUIRED)
context.set_default_verify_paths()
- try:
- context.load_default_certs()
- except AttributeError:
- pass # Python < 3.4
return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
@@ -463,6 +468,13 @@ class ExtractorError(Exception):
return ''.join(traceback.format_tb(self.traceback))
+class UnsupportedError(ExtractorError):
+    """Expected ExtractorError with the message 'Unsupported URL: <url>'.
+
+    The offending URL is kept on ``self.url``.
+    """
+    def __init__(self, url):
+        message = 'Unsupported URL: %s' % url
+        super(UnsupportedError, self).__init__(message, expected=True)
+        self.url = url
+
+
class RegexNotFoundError(ExtractorError):
"""Error when a regex didn't match"""
pass
@@ -644,17 +656,19 @@ def parse_iso8601(date_str, delimiter='T'):
return calendar.timegm(dt.timetuple())
-def unified_strdate(date_str):
+def unified_strdate(date_str, day_first=True):
"""Return a string with the date in the format YYYYMMDD"""
if date_str is None:
return None
-
upload_date = None
# Replace commas
date_str = date_str.replace(',', ' ')
# %z (UTC offset) is only supported in python>=3.2
date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
+ # Remove AM/PM + timezone
+ date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
+
format_expressions = [
'%d %B %Y',
'%d %b %Y',
@@ -669,7 +683,6 @@ def unified_strdate(date_str):
'%d/%m/%Y',
'%d/%m/%y',
'%Y/%m/%d %H:%M:%S',
- '%d/%m/%Y %H:%M:%S',
'%Y-%m-%d %H:%M:%S',
'%Y-%m-%d %H:%M:%S.%f',
'%d.%m.%Y %H:%M',
@@ -681,6 +694,14 @@ def unified_strdate(date_str):
'%Y-%m-%dT%H:%M:%S.%f',
'%Y-%m-%dT%H:%M',
]
+ if day_first:
+ format_expressions.extend([
+ '%d/%m/%Y %H:%M:%S',
+ ])
+ else:
+ format_expressions.extend([
+ '%m/%d/%Y %H:%M:%S',
+ ])
for expression in format_expressions:
try:
upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
@@ -712,8 +733,10 @@ def date_from_str(date_str):
Return a datetime object from a string in the format YYYYMMDD or
(now|today)[+-][0-9](day|week|month|year)(s)?"""
today = datetime.date.today()
- if date_str == 'now'or date_str == 'today':
+ if date_str in ('now', 'today'):
return today
+ if date_str == 'yesterday':
+ return today - datetime.timedelta(days=1)
match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
if match is not None:
sign = match.group('sign')
@@ -808,22 +831,22 @@ def _windows_write_string(s, out):
GetStdHandle = ctypes.WINFUNCTYPE(
ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
- ("GetStdHandle", ctypes.windll.kernel32))
+ (b"GetStdHandle", ctypes.windll.kernel32))
h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
WriteConsoleW = ctypes.WINFUNCTYPE(
ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
- ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
+ ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
written = ctypes.wintypes.DWORD(0)
- GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
+ GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
FILE_TYPE_CHAR = 0x0002
FILE_TYPE_REMOTE = 0x8000
GetConsoleMode = ctypes.WINFUNCTYPE(
ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
ctypes.POINTER(ctypes.wintypes.DWORD))(
- ("GetConsoleMode", ctypes.windll.kernel32))
+ (b"GetConsoleMode", ctypes.windll.kernel32))
INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
def not_a_console(handle):
@@ -1024,7 +1047,7 @@ def smuggle_url(url, data):
def unsmuggle_url(smug_url, default=None):
- if not '#__youtubedl_smuggle' in smug_url:
+ if '#__youtubedl_smuggle' not in smug_url:
return smug_url, default
url, _, sdata = smug_url.rpartition('#')
jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
@@ -1090,11 +1113,14 @@ def parse_filesize(s):
}
units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
- m = re.match(r'(?P<num>[0-9]+(?:\.[0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
+ m = re.match(
+ r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
if not m:
return None
- return int(float(m.group('num')) * _UNIT_TABLE[m.group('unit')])
+ num_str = m.group('num').replace(',', '.')
+ mult = _UNIT_TABLE[m.group('unit')]
+ return int(float(num_str) * mult)
def get_term_width():
@@ -1203,18 +1229,29 @@ def parse_duration(s):
m = re.match(
r'''(?ix)T?
+ (?:
+ (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
+ (?P<only_hours>[0-9.]+)\s*(?:hours?)|
+
(?:
(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
)?
- (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$''', s)
+ (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
+ )$''', s)
if not m:
return None
- res = int(m.group('secs'))
+ res = 0
+ if m.group('only_mins'):
+ return float_or_none(m.group('only_mins'), invscale=60)
+ if m.group('only_hours'):
+ return float_or_none(m.group('only_hours'), invscale=60 * 60)
+ if m.group('secs'):
+ res += int(m.group('secs'))
if m.group('mins'):
res += int(m.group('mins')) * 60
- if m.group('hours'):
- res += int(m.group('hours')) * 60 * 60
+ if m.group('hours'):
+ res += int(m.group('hours')) * 60 * 60
if m.group('ms'):
res += float(m.group('ms'))
return res
@@ -1236,18 +1273,25 @@ def check_executable(exe, args=[]):
def get_exe_version(exe, args=['--version'],
- version_re=r'version\s+([0-9._-a-zA-Z]+)',
- unrecognized='present'):
+ version_re=None, unrecognized='present'):
""" Returns the version of the specified executable,
or False if the executable is not present """
try:
- out, err = subprocess.Popen(
+ out, _ = subprocess.Popen(
[exe] + args,
stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
except OSError:
return False
- firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
- m = re.search(version_re, firstline)
+ if isinstance(out, bytes): # Python 2.x
+ out = out.decode('ascii', 'ignore')
+ return detect_exe_version(out, version_re, unrecognized)
+
+
+def detect_exe_version(output, version_re=None, unrecognized='present'):
+ assert isinstance(output, compat_str)
+ if version_re is None:
+ version_re = r'version\s+([-0-9._a-zA-Z]+)'
+ m = re.search(version_re, output)
if m:
return m.group(1)
else:
@@ -1488,7 +1532,7 @@ def limit_length(s, length):
def version_tuple(v):
- return [int(e) for e in v.split('.')]
+ return tuple(int(e) for e in re.split(r'[-.]', v))
def is_outdated_version(version, limit, assume_new=True):
@@ -1510,3 +1554,23 @@ def ytdl_is_updateable():
def args_to_str(args):
# Get a short string representation for a subprocess command
return ' '.join(shlex_quote(a) for a in args)
+
+
+def urlhandle_detect_ext(url_handle):
+ try:
+ url_handle.headers
+ getheader = lambda h: url_handle.headers[h]
+ except AttributeError: # Python < 3
+ getheader = url_handle.info().getheader
+
+ return getheader('Content-Type').split("/")[1]
+
+
+def age_restricted(content_limit, age_limit):
+ """ Returns True iff the content should be blocked """
+
+ if age_limit is None: # No limit set
+ return False
+ if content_limit is None:
+ return False # Content available for everyone
+ return age_limit < content_limit
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 61902a8cc..8c57c7413 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
from __future__ import unicode_literals
-__version__ = '2014.12.03'
+__version__ = '2015.01.09.2'