aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl
diff options
context:
space:
mode:
authorfnord <fnord@fnord.mobi>2015-06-25 00:34:46 -0500
committerfnord <fnord@fnord.mobi>2015-06-25 00:34:46 -0500
commitaa5740fb61d388754e9278a3e38de12203c1b89d (patch)
tree8195c113270d167f57225bdaa6f89df807341968 /youtube_dl
parentda92eeae42f556926cb676b3c14e270603b7e38e (diff)
parent18b5e1e5348ba3a6d1b6a98e97217eebb3d32a1e (diff)
Merge remote-tracking branch 'origin/master' into pr-bbcnews
Diffstat (limited to 'youtube_dl')
-rw-r--r--youtube_dl/extractor/__init__.py12
-rw-r--r--youtube_dl/extractor/adobetv.py60
-rw-r--r--youtube_dl/extractor/bbc.py15
-rw-r--r--youtube_dl/extractor/brightcove.py3
-rw-r--r--youtube_dl/extractor/dailymotion.py42
-rw-r--r--youtube_dl/extractor/dramafever.py43
-rw-r--r--youtube_dl/extractor/drbonanza.py12
-rw-r--r--youtube_dl/extractor/faz.py21
-rw-r--r--youtube_dl/extractor/francetv.py15
-rw-r--r--youtube_dl/extractor/generic.py69
-rw-r--r--youtube_dl/extractor/imdb.py2
-rw-r--r--youtube_dl/extractor/pinkbike.py96
-rw-r--r--youtube_dl/extractor/sohu.py54
-rw-r--r--youtube_dl/extractor/tumblr.py16
-rw-r--r--youtube_dl/extractor/viki.py10
-rw-r--r--youtube_dl/extractor/vimeo.py16
-rw-r--r--youtube_dl/extractor/xhamster.py34
-rw-r--r--youtube_dl/extractor/xvideos.py27
-rw-r--r--youtube_dl/extractor/youtube.py2
-rw-r--r--youtube_dl/postprocessor/ffmpeg.py196
-rw-r--r--youtube_dl/utils.py213
21 files changed, 701 insertions, 257 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index a48346e60..1a9585c92 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -4,7 +4,10 @@ from .abc import ABCIE
from .abc7news import Abc7NewsIE
from .academicearth import AcademicEarthCourseIE
from .addanime import AddAnimeIE
-from .adobetv import AdobeTVIE
+from .adobetv import (
+ AdobeTVIE,
+ AdobeTVVideoIE,
+)
from .adultswim import AdultSwimIE
from .aftenposten import AftenpostenIE
from .aftonbladet import AftonbladetIE
@@ -103,6 +106,7 @@ from .dailymotion import (
DailymotionIE,
DailymotionPlaylistIE,
DailymotionUserIE,
+ DailymotionCloudIE,
)
from .daum import DaumIE
from .dbtv import DBTVIE
@@ -401,6 +405,7 @@ from .pbs import PBSIE
from .philharmoniedeparis import PhilharmonieDeParisIE
from .phoenix import PhoenixIE
from .photobucket import PhotobucketIE
+from .pinkbike import PinkbikeIE
from .planetaplay import PlanetaPlayIE
from .pladform import PladformIE
from .played import PlayedIE
@@ -696,7 +701,10 @@ from .wrzuta import WrzutaIE
from .wsj import WSJIE
from .xbef import XBefIE
from .xboxclips import XboxClipsIE
-from .xhamster import XHamsterIE
+from .xhamster import (
+ XHamsterIE,
+ XHamsterEmbedIE,
+)
from .xminus import XMinusIE
from .xnxx import XNXXIE
from .xstream import XstreamIE
diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py
index 97d128560..5e43adc51 100644
--- a/youtube_dl/extractor/adobetv.py
+++ b/youtube_dl/extractor/adobetv.py
@@ -5,6 +5,8 @@ from ..utils import (
parse_duration,
unified_strdate,
str_to_int,
+ float_or_none,
+ ISO639Utils,
)
@@ -69,3 +71,61 @@ class AdobeTVIE(InfoExtractor):
'view_count': view_count,
'formats': formats,
}
+
+
+class AdobeTVVideoIE(InfoExtractor):
+ _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)'
+
+ _TEST = {
+ # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners
+ 'url': 'https://video.tv.adobe.com/v/2456/',
+ 'md5': '43662b577c018ad707a63766462b1e87',
+ 'info_dict': {
+ 'id': '2456',
+ 'ext': 'mp4',
+ 'title': 'New experience with Acrobat DC',
+ 'description': 'New experience with Acrobat DC',
+ 'duration': 248.667,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ player_params = self._parse_json(self._search_regex(
+ r'var\s+bridge\s*=\s*([^;]+);', webpage, 'player parameters'),
+ video_id)
+
+ formats = [{
+ 'url': source['src'],
+ 'width': source.get('width'),
+ 'height': source.get('height'),
+ 'tbr': source.get('bitrate'),
+ } for source in player_params['sources']]
+
+ # For both metadata and downloaded files the duration varies among
+ # formats. I just pick the max one
+ duration = max(filter(None, [
+ float_or_none(source.get('duration'), scale=1000)
+ for source in player_params['sources']]))
+
+ subtitles = {}
+ for translation in player_params.get('translations', []):
+ lang_id = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium'])
+ if lang_id not in subtitles:
+ subtitles[lang_id] = []
+ subtitles[lang_id].append({
+ 'url': translation['vttPath'],
+ 'ext': 'vtt',
+ })
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': player_params['title'],
+ 'description': self._og_search_description(webpage),
+ 'duration': duration,
+ 'subtitles': subtitles,
+ }
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index bb671d473..471d865d2 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -255,26 +255,11 @@ class BBCCoUkIE(InfoExtractor):
for connection in self._extract_connections(media):
captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
- ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
- srt = ''
-
- def _extract_text(p):
- if p.text is not None:
- stripped_text = p.text.strip()
- if stripped_text:
- return stripped_text
- return ' '.join(span.text.strip() for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span'))
- for pos, p in enumerate(ps):
- srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p))
subtitles[lang] = [
{
'url': connection.get('href'),
'ext': 'ttml',
},
- {
- 'data': srt,
- 'ext': 'srt',
- },
]
return subtitles
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index d768f99e6..4721c2293 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -13,6 +13,7 @@ from ..compat import (
compat_urllib_parse_urlparse,
compat_urllib_request,
compat_urlparse,
+ compat_xml_parse_error,
)
from ..utils import (
determine_ext,
@@ -119,7 +120,7 @@ class BrightcoveIE(InfoExtractor):
try:
object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8'))
- except xml.etree.ElementTree.ParseError:
+ except compat_xml_parse_error:
return
fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 70aa4333c..96f0ed9ad 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -251,3 +251,45 @@ class DailymotionUserIE(DailymotionPlaylistIE):
'title': full_user,
'entries': self._extract_entries(user),
}
+
+
+class DailymotionCloudIE(DailymotionBaseInfoExtractor):
+ _VALID_URL = r'http://api\.dmcloud\.net/embed/[^/]+/(?P<id>[^/?]+)'
+
+ _TEST = {
+ # From http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html
+ # Tested at FranceTvInfo_2
+ 'url': 'http://api.dmcloud.net/embed/4e7343f894a6f677b10006b4/556e03339473995ee145930c?auth=1464865870-0-jyhsm84b-ead4c701fb750cf9367bf4447167a3db&autoplay=1',
+ 'only_matching': True,
+ }
+
+ @classmethod
+ def _extract_dmcloud_url(self, webpage):
+ mobj = re.search(r'<iframe[^>]+src=[\'"](http://api\.dmcloud\.net/embed/[^/]+/[^\'"]+)[\'"]', webpage)
+ if mobj:
+ return mobj.group(1)
+
+ mobj = re.search(r'<input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=[\'"](http://api\.dmcloud\.net/embed/[^/]+/[^\'"]+)[\'"]', webpage)
+ if mobj:
+ return mobj.group(1)
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ request = self._build_request(url)
+ webpage = self._download_webpage(request, video_id)
+
+ title = self._html_search_regex(r'<title>([^>]+)</title>', webpage, 'title')
+
+ video_info = self._parse_json(self._search_regex(
+ r'var\s+info\s*=\s*([^;]+);', webpage, 'video info'), video_id)
+
+ # TODO: parse ios_url, which is in fact a manifest
+ video_url = video_info['mp4_url']
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'thumbnail': video_info.get('thumbnail_url'),
+ }
diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py
index a34aad486..ca41a3abf 100644
--- a/youtube_dl/extractor/dramafever.py
+++ b/youtube_dl/extractor/dramafever.py
@@ -6,6 +6,8 @@ import itertools
from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
+ compat_urllib_parse,
+ compat_urllib_request,
compat_urlparse,
)
from ..utils import (
@@ -17,7 +19,39 @@ from ..utils import (
)
-class DramaFeverIE(InfoExtractor):
+class DramaFeverBaseIE(InfoExtractor):
+ _LOGIN_URL = 'https://www.dramafever.com/accounts/login/'
+ _NETRC_MACHINE = 'dramafever'
+
+ def _real_initialize(self):
+ self._login()
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ return
+
+ login_form = {
+ 'username': username,
+ 'password': password,
+ }
+
+ request = compat_urllib_request.Request(
+ self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+ response = self._download_webpage(
+ request, None, 'Logging in as %s' % username)
+
+ if all(logout_pattern not in response
+ for logout_pattern in ['href="/accounts/logout/"', '>Log out<']):
+ error = self._html_search_regex(
+ r'(?s)class="hidden-xs prompt"[^>]*>(.+?)<',
+ response, 'error message', default=None)
+ if error:
+ raise ExtractorError('Unable to login: %s' % error, expected=True)
+ raise ExtractorError('Unable to log in')
+
+
+class DramaFeverIE(DramaFeverBaseIE):
IE_NAME = 'dramafever'
_VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+/[0-9]+)(?:/|$)'
_TEST = {
@@ -97,7 +131,7 @@ class DramaFeverIE(InfoExtractor):
}
-class DramaFeverSeriesIE(InfoExtractor):
+class DramaFeverSeriesIE(DramaFeverBaseIE):
IE_NAME = 'dramafever:series'
_VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+)(?:/(?:(?!\d+(?:/|$)).+)?)?$'
_TESTS = [{
@@ -151,8 +185,11 @@ class DramaFeverSeriesIE(InfoExtractor):
% (consumer_secret, series_id, self._PAGE_SIZE, page_num),
series_id, 'Downloading episodes JSON page #%d' % page_num)
for episode in episodes.get('value', []):
+ episode_url = episode.get('episode_url')
+ if not episode_url:
+ continue
entries.append(self.url_result(
- compat_urlparse.urljoin(url, episode['episode_url']),
+ compat_urlparse.urljoin(url, episode_url),
'DramaFever', episode.get('guid')))
if page_num == episodes['num_pages']:
break
diff --git a/youtube_dl/extractor/drbonanza.py b/youtube_dl/extractor/drbonanza.py
index 7626219ba..8b98b013a 100644
--- a/youtube_dl/extractor/drbonanza.py
+++ b/youtube_dl/extractor/drbonanza.py
@@ -15,7 +15,6 @@ class DRBonanzaIE(InfoExtractor):
_TESTS = [{
'url': 'http://www.dr.dk/bonanza/serie/portraetter/Talkshowet.htm?assetId=65517',
- 'md5': 'fe330252ddea607635cf2eb2c99a0af3',
'info_dict': {
'id': '65517',
'ext': 'mp4',
@@ -26,6 +25,9 @@ class DRBonanzaIE(InfoExtractor):
'upload_date': '20110120',
'duration': 3664,
},
+ 'params': {
+ 'skip_download': True, # requires rtmp
+ },
}, {
'url': 'http://www.dr.dk/bonanza/radio/serie/sport/fodbold.htm?assetId=59410',
'md5': '6dfe039417e76795fb783c52da3de11d',
@@ -93,6 +95,11 @@ class DRBonanzaIE(InfoExtractor):
'format_id': file['Type'].replace('Video', ''),
'preference': preferencemap.get(file['Type'], -10),
})
+ if format['url'].startswith('rtmp'):
+ rtmp_url = format['url']
+ format['rtmp_live'] = True # --resume does not work
+ if '/bonanza/' in rtmp_url:
+ format['play_path'] = rtmp_url.split('/bonanza/')[1]
formats.append(format)
elif file['Type'] == "Thumb":
thumbnail = file['Location']
@@ -111,9 +118,6 @@ class DRBonanzaIE(InfoExtractor):
description = '%s\n%s\n%s\n' % (
info['Description'], info['Actors'], info['Colophon'])
- for f in formats:
- f['url'] = f['url'].replace('rtmp://vod-bonanza.gss.dr.dk/bonanza/', 'http://vodfiles.dr.dk/')
- f['url'] = f['url'].replace('mp4:bonanza', 'bonanza')
self._sort_formats(formats)
display_id = re.sub(r'[^\w\d-]', '', re.sub(r' ', '-', title.lower())) + '-' + asset_id
diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py
index 3c39ca451..cebdd0193 100644
--- a/youtube_dl/extractor/faz.py
+++ b/youtube_dl/extractor/faz.py
@@ -6,9 +6,9 @@ from .common import InfoExtractor
class FazIE(InfoExtractor):
IE_NAME = 'faz.net'
- _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+)\.html'
+ _VALID_URL = r'https?://(?:www\.)?faz\.net/(?:[^/]+/)*.*?-(?P<id>\d+)\.html'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html',
'info_dict': {
'id': '12610585',
@@ -16,7 +16,22 @@ class FazIE(InfoExtractor):
'title': 'Stockholm: Chemie-Nobelpreis für drei amerikanische Forscher',
'description': 'md5:1453fbf9a0d041d985a47306192ea253',
},
- }
+ }, {
+ 'url': 'http://www.faz.net/aktuell/politik/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.faz.net/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.faz.net/-13659345.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.faz.net/aktuell/politik/-13659345.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.faz.net/foobarblafasel-13659345.html',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py
index db0bbec1e..b2c984bf2 100644
--- a/youtube_dl/extractor/francetv.py
+++ b/youtube_dl/extractor/francetv.py
@@ -18,6 +18,7 @@ from ..utils import (
parse_duration,
determine_ext,
)
+from .dailymotion import DailymotionCloudIE
class FranceTVBaseInfoExtractor(InfoExtractor):
@@ -131,12 +132,26 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
'skip_download': 'HLS (reqires ffmpeg)'
},
'skip': 'Ce direct est terminé et sera disponible en rattrapage dans quelques minutes.',
+ }, {
+ 'url': 'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html',
+ 'md5': 'f485bda6e185e7d15dbc69b72bae993e',
+ 'info_dict': {
+ 'id': '556e03339473995ee145930c',
+ 'ext': 'mp4',
+ 'title': 'Les entreprises familiales : le secret de la réussite',
+ 'thumbnail': 're:^https?://.*\.jpe?g$',
+ }
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
page_title = mobj.group('title')
webpage = self._download_webpage(url, page_title)
+
+ dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
+ if dmcloud_url:
+ return self.url_result(dmcloud_url, 'DailymotionCloud')
+
video_id, catalogue = self._search_regex(
r'id-video=([^@]+@[^"]+)', webpage, 'video id').split('@')
return self._extract_video(video_id, catalogue)
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index f6b984300..5c03fddc6 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -43,6 +43,9 @@ from .senateisvp import SenateISVPIE
from .bliptv import BlipTVIE
from .svt import SVTIE
from .pornhub import PornHubIE
+from .xhamster import XHamsterEmbedIE
+from .vimeo import VimeoIE
+from .dailymotion import DailymotionCloudIE
class GenericIE(InfoExtractor):
@@ -333,6 +336,15 @@ class GenericIE(InfoExtractor):
'skip_download': True,
},
},
+ # XHamster embed
+ {
+ 'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8',
+ 'info_dict': {
+ 'id': 'showthread',
+ 'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )',
+ },
+ 'playlist_mincount': 7,
+ },
# Embedded TED video
{
'url': 'http://en.support.wordpress.com/videos/ted-talks/',
@@ -812,6 +824,29 @@ class GenericIE(InfoExtractor):
'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
'uploader': 'Rogers Sportsnet',
},
+ },
+ # Dailymotion Cloud video
+ {
+ 'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910',
+ 'md5': '49444254273501a64675a7e68c502681',
+ 'info_dict': {
+ 'id': '5585de919473990de4bee11b',
+ 'ext': 'mp4',
+ 'title': 'Le débat',
+ 'thumbnail': 're:^https?://.*\.jpe?g$',
+ }
+ },
+ # AdobeTVVideo embed
+ {
+ 'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners',
+ 'md5': '43662b577c018ad707a63766462b1e87',
+ 'info_dict': {
+ 'id': '2456',
+ 'ext': 'mp4',
+ 'title': 'New experience with Acrobat DC',
+ 'description': 'New experience with Acrobat DC',
+ 'duration': 248.667,
+ },
}
]
@@ -1089,18 +1124,9 @@ class GenericIE(InfoExtractor):
if matches:
return _playlist_from_matches(matches, ie='RtlNl')
- # Look for embedded (iframe) Vimeo player
- mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
- if mobj:
- player_url = unescapeHTML(mobj.group('url'))
- surl = smuggle_url(player_url, {'Referer': url})
- return self.url_result(surl)
- # Look for embedded (swf embed) Vimeo player
- mobj = re.search(
- r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
- if mobj:
- return self.url_result(mobj.group(1))
+ vimeo_url = VimeoIE._extract_vimeo_url(url, webpage)
+ if vimeo_url is not None:
+ return self.url_result(vimeo_url)
# Look for embedded YouTube player
matches = re.findall(r'''(?x)
@@ -1327,6 +1353,11 @@ class GenericIE(InfoExtractor):
if pornhub_url:
return self.url_result(pornhub_url, 'PornHub')
+ # Look for embedded XHamster player
+ xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
+ if xhamster_urls:
+ return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
+
# Look for embedded Tvigle player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
@@ -1494,6 +1525,20 @@ class GenericIE(InfoExtractor):
if senate_isvp_url:
return self.url_result(senate_isvp_url, 'SenateISVP')
+ # Look for Dailymotion Cloud videos
+ dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
+ if dmcloud_url:
+ return self.url_result(dmcloud_url, 'DailymotionCloud')
+
+ # Look for AdobeTVVideo embeds
+ mobj = re.search(
+ r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]',
+ webpage)
+ if mobj is not None:
+ return self.url_result(
+ self._proto_relative_url(unescapeHTML(mobj.group(1))),
+ 'AdobeTVVideo')
+
def check_video(vurl):
if YoutubeIE.suitable(vurl):
return True
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
index f29df36b5..4bb574cf3 100644
--- a/youtube_dl/extractor/imdb.py
+++ b/youtube_dl/extractor/imdb.py
@@ -46,7 +46,7 @@ class ImdbIE(InfoExtractor):
format_info = info['videoPlayerObject']['video']
formats.append({
'format_id': f_id,
- 'url': format_info['url'],
+ 'url': format_info['videoInfoList'][0]['videoUrl'],
})
return {
diff --git a/youtube_dl/extractor/pinkbike.py b/youtube_dl/extractor/pinkbike.py
new file mode 100644
index 000000000..a52210fab
--- /dev/null
+++ b/youtube_dl/extractor/pinkbike.py
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ remove_end,
+ remove_start,
+ str_to_int,
+ unified_strdate,
+)
+
+
+class PinkbikeIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:(?:www\.)?pinkbike\.com/video/|es\.pinkbike\.org/i/kvid/kvid-y5\.swf\?id=)(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'http://www.pinkbike.com/video/402811/',
+ 'md5': '4814b8ca7651034cd87e3361d5c2155a',
+ 'info_dict': {
+ 'id': '402811',
+ 'ext': 'mp4',
+ 'title': 'Brandon Semenuk - RAW 100',
+ 'description': 'Official release: www.redbull.ca/rupertwalker',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 100,
+ 'upload_date': '20150406',
+ 'uploader': 'revelco',
+ 'location': 'Victoria, British Columbia, Canada',
+ 'view_count': int,
+ 'comment_count': int,
+ }
+ }, {
+ 'url': 'http://es.pinkbike.org/i/kvid/kvid-y5.swf?id=406629',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'http://www.pinkbike.com/video/%s' % video_id, video_id)
+
+ formats = []
+ for _, format_id, src in re.findall(
+ r'data-quality=((?:\\)?["\'])(.+?)\1[^>]+src=\1(.+?)\1', webpage):
+ height = int_or_none(self._search_regex(
+ r'^(\d+)[pP]$', format_id, 'height', default=None))
+ formats.append({
+ 'url': src,
+ 'format_id': format_id,
+ 'height': height,
+ })
+ self._sort_formats(formats)
+
+ title = remove_end(self._og_search_title(webpage), ' Video - Pinkbike')
+ description = self._html_search_regex(
+ r'(?s)id="media-description"[^>]*>(.+?)<',
+ webpage, 'description', default=None) or remove_start(
+ self._og_search_description(webpage), title + '. ')
+ thumbnail = self._og_search_thumbnail(webpage)
+ duration = int_or_none(self._html_search_meta(
+ 'video:duration', webpage, 'duration'))
+
+ uploader = self._search_regex(
+ r'un:\s*"([^"]+)"', webpage, 'uploader', fatal=False)
+ upload_date = unified_strdate(self._search_regex(
+ r'class="fullTime"[^>]+title="([^"]+)"',
+ webpage, 'upload date', fatal=False))
+
+ location = self._html_search_regex(
+ r'(?s)<dt>Location</dt>\s*<dd>(.+?)<',
+ webpage, 'location', fatal=False)
+
+ def extract_count(webpage, label):
+ return str_to_int(self._search_regex(
+ r'<span[^>]+class="stat-num"[^>]*>([\d,.]+)</span>\s*<span[^>]+class="stat-label"[^>]*>%s' % label,
+ webpage, label, fatal=False))
+
+ view_count = extract_count(webpage, 'Views')
+ comment_count = extract_count(webpage, 'Comments')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'upload_date': upload_date,
+ 'uploader': uploader,
+ 'location': location,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
+ 'formats': formats
+ }
diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py
index 29bd9ce6f..ba2d5e19b 100644
--- a/youtube_dl/extractor/sohu.py
+++ b/youtube_dl/extractor/sohu.py
@@ -6,9 +6,12 @@ import re
from .common import InfoExtractor
from ..compat import (
compat_str,
- compat_urllib_request
+ compat_urllib_request,
+ compat_urllib_parse,
+)
+from ..utils import (
+ ExtractorError,
)
-from ..utils import ExtractorError
class SohuIE(InfoExtractor):
@@ -26,7 +29,7 @@ class SohuIE(InfoExtractor):
'skip': 'On available in China',
}, {
'url': 'http://tv.sohu.com/20150305/n409385080.shtml',
- 'md5': 'ac9a5d322b4bf9ae184d53e4711e4f1a',
+ 'md5': '699060e75cf58858dd47fb9c03c42cfb',
'info_dict': {
'id': '409385080',
'ext': 'mp4',
@@ -34,7 +37,7 @@ class SohuIE(InfoExtractor):
}
}, {
'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml',
- 'md5': '49308ff6dafde5ece51137d04aec311e',
+ 'md5': '9bf34be48f2f4dadcb226c74127e203c',
'info_dict': {
'id': '78693464',
'ext': 'mp4',
@@ -48,7 +51,7 @@ class SohuIE(InfoExtractor):
'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
},
'playlist': [{
- 'md5': '492923eac023ba2f13ff69617c32754a',
+ 'md5': 'bdbfb8f39924725e6589c146bc1883ad',
'info_dict': {
'id': '78910339_part1',
'ext': 'mp4',
@@ -56,7 +59,7 @@ class SohuIE(InfoExtractor):
'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
}
}, {
- 'md5': 'de604848c0e8e9c4a4dde7e1347c0637',
+ 'md5': '3e1f46aaeb95354fd10e7fca9fc1804e',
'info_dict': {
'id': '78910339_part2',
'ext': 'mp4',
@@ -64,7 +67,7 @@ class SohuIE(InfoExtractor):
'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
}
}, {
- 'md5': '93584716ee0657c0b205b8aa3d27aa13',
+ 'md5': '8407e634175fdac706766481b9443450',
'info_dict': {
'id': '78910339_part3',
'ext': 'mp4',
@@ -139,21 +142,42 @@ class SohuIE(InfoExtractor):
for i in range(part_count):
formats = []
for format_id, format_data in formats_json.items():
+ allot = format_data['allot']
+
data = format_data['data']
+ clips_url = data['clipsURL']
+ su = data['su']
- # URLs starts with http://newflv.sohu.ccgslb.net/ is not usable
- # so retry until got a working URL
video_url = 'newflv.sohu.ccgslb.net'
+ cdnId = None
retries = 0
- while 'newflv.sohu.ccgslb.net' in video_url and retries < 5:
- download_note = 'Download information from CDN gateway for format ' + format_id
+
+ while 'newflv.sohu.ccgslb.net' in video_url:
+ params = {
+ 'prot': 9,
+ 'file': clips_url[i],
+ 'new': su[i],
+ 'prod': 'flash',
+ }
+
+ if cdnId is not None:
+ params['idc'] = cdnId
+
+ download_note = 'Downloading %s video URL part %d of %d' % (
+ format_id, i + 1, part_count)
+
if retries > 0:
download_note += ' (retry #%d)' % retries
+ part_info = self._parse_json(self._download_webpage(
+ 'http://%s/?%s' % (allot, compat_urllib_parse.urlencode(params)),
+ video_id, download_note), video_id)
+
+ video_url = part_info['url']
+ cdnId = part_info.get('nid')
+
retries += 1
- cdn_info = self._download_json(
- 'http://data.vod.itc.cn/cdnList?new=' + data['su'][i],
- video_id, download_note)
- video_url = cdn_info['url']
+ if retries > 5:
+ raise ExtractorError('Failed to get video URL')
formats.append({
'url': video_url,
diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py
index 63c20310d..9ead13a91 100644
--- a/youtube_dl/extractor/tumblr.py
+++ b/youtube_dl/extractor/tumblr.py
@@ -5,6 +5,7 @@ import re
from .common import InfoExtractor
from .pornhub import PornHubIE
+from .vimeo import VimeoIE
class TumblrIE(InfoExtractor):
@@ -40,6 +41,17 @@ class TumblrIE(InfoExtractor):
'timestamp': 1430931613,
},
'add_ie': ['Vidme'],
+ }, {
+ 'url': 'http://camdamage.tumblr.com/post/98846056295/',
+ 'md5': 'a9e0c8371ea1ca306d6554e3fecf50b6',
+ 'info_dict': {
+ 'id': '105463834',
+ 'ext': 'mp4',
+ 'title': 'Cam Damage-HD 720p',
+ 'uploader': 'John Moyer',
+ 'uploader_id': 'user32021558',
+ },
+ 'add_ie': ['Vimeo'],
}]
def _real_extract(self, url):
@@ -60,6 +72,10 @@ class TumblrIE(InfoExtractor):
if pornhub_url:
return self.url_result(pornhub_url, 'PornHub')
+ vimeo_url = VimeoIE._extract_vimeo_url(url, webpage)
+ if vimeo_url:
+ return self.url_result(vimeo_url, 'Vimeo')
+
iframe_url = self._search_regex(
r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'',
webpage, 'iframe url')
diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py
index 52d10d242..51cdc6b65 100644
--- a/youtube_dl/extractor/viki.py
+++ b/youtube_dl/extractor/viki.py
@@ -28,11 +28,15 @@ class VikiBaseIE(InfoExtractor):
_NETRC_MACHINE = 'viki'
+ _token = None
+
def _prepare_call(self, path, timestamp=None, post_data=None):
path += '?' if '?' not in path else '&'
if not timestamp:
timestamp = int(time.time())
query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp)
+ if self._token:
+ query += '&token=%s' % self._token
sig = hmac.new(
self._APP_SECRET.encode('ascii'),
query.encode('ascii'),
@@ -76,10 +80,14 @@ class VikiBaseIE(InfoExtractor):
'password': password,
}
- self._call_api(
+ login = self._call_api(
'sessions.json', None,
'Logging in as %s' % username, post_data=login_form)
+ self._token = login.get('token')
+ if not self._token:
+ self.report_warning('Unable to get session token, login has probably failed')
+
class VikiIE(VikiBaseIE):
IE_NAME = 'viki'
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index f300c7ca4..cae90205d 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -22,6 +22,7 @@ from ..utils import (
unified_strdate,
unsmuggle_url,
urlencode_postdata,
+ unescapeHTML,
)
@@ -173,6 +174,21 @@ class VimeoIE(VimeoBaseInfoExtractor):
},
]
+ @staticmethod
+ def _extract_vimeo_url(url, webpage):
+ # Look for embedded (iframe) Vimeo player
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
+ if mobj:
+ player_url = unescapeHTML(mobj.group('url'))
+ surl = smuggle_url(player_url, {'Referer': url})
+ return surl
+ # Look for embedded (swf embed) Vimeo player
+ mobj = re.search(
+ r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
+ if mobj:
+ return mobj.group(1)
+
def _verify_video_password(self, url, video_id, webpage):
password = self._downloader.params.get('videopassword', None)
if password is None:
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py
index 4527567f8..b4ad513a0 100644
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -13,7 +13,6 @@ from ..utils import (
class XHamsterIE(InfoExtractor):
- """Information Extractor for xHamster"""
_VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
_TESTS = [
{
@@ -133,3 +132,36 @@ class XHamsterIE(InfoExtractor):
'age_limit': age_limit,
'formats': formats,
}
+
+
+class XHamsterEmbedIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?xhamster\.com/xembed\.php\?video=(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://xhamster.com/xembed.php?video=3328539',
+ 'info_dict': {
+ 'id': '3328539',
+ 'ext': 'mp4',
+ 'title': 'Pen Masturbation',
+ 'upload_date': '20140728',
+ 'uploader_id': 'anonymous',
+ 'duration': 5,
+ 'age_limit': 18,
+ }
+ }
+
+ @staticmethod
+ def _extract_urls(webpage):
+ return [url for _, url in re.findall(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1',
+ webpage)]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_url = self._search_regex(
+ r'href="(https?://xhamster\.com/movies/%s/[^"]+\.html[^"]*)"' % video_id,
+ webpage, 'xhamster url')
+
+ return self.url_result(video_url, 'XHamster')
diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py
index 2a45dc574..d8415bed4 100644
--- a/youtube_dl/extractor/xvideos.py
+++ b/youtube_dl/extractor/xvideos.py
@@ -5,10 +5,12 @@ import re
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse,
+ compat_urllib_request,
)
from ..utils import (
clean_html,
ExtractorError,
+ determine_ext,
)
@@ -25,6 +27,8 @@ class XVideosIE(InfoExtractor):
}
}
+ _ANDROID_USER_AGENT = 'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19'
+
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
@@ -40,9 +44,30 @@ class XVideosIE(InfoExtractor):
video_thumbnail = self._search_regex(
r'url_bigthumb=(.+?)&amp', webpage, 'thumbnail', fatal=False)
+ formats = [{
+ 'url': video_url,
+ }]
+
+ android_req = compat_urllib_request.Request(url)
+ android_req.add_header('User-Agent', self._ANDROID_USER_AGENT)
+ android_webpage = self._download_webpage(android_req, video_id, fatal=False)
+
+ if android_webpage is not None:
+ player_params_str = self._search_regex(
+ 'mobileReplacePlayerDivTwoQual\(([^)]+)\)',
+ android_webpage, 'player parameters', default='')
+ player_params = list(map(lambda s: s.strip(' \''), player_params_str.split(',')))
+ if player_params:
+ formats.extend([{
+ 'url': param,
+ 'preference': -10,
+ } for param in player_params if determine_ext(param) == 'mp4'])
+
+ self._sort_formats(formats)
+
return {
'id': video_id,
- 'url': video_url,
+ 'formats': formats,
'title': video_title,
'ext': 'flv',
'thumbnail': video_thumbnail,
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 9e2671192..a3da56c14 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -234,6 +234,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'44': {'ext': 'webm', 'width': 854, 'height': 480},
'45': {'ext': 'webm', 'width': 1280, 'height': 720},
'46': {'ext': 'webm', 'width': 1920, 'height': 1080},
+ '59': {'ext': 'mp4', 'width': 854, 'height': 480},
+ '78': {'ext': 'mp4', 'width': 854, 'height': 480},
# 3d videos
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index cc65b34e7..fe7e0a8ee 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -21,6 +21,7 @@ from ..utils import (
shell_quote,
subtitles_filename,
dfxp2srt,
+ ISO639Utils,
)
@@ -307,199 +308,6 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor):
class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
- # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
- _lang_map = {
- 'aa': 'aar',
- 'ab': 'abk',
- 'ae': 'ave',
- 'af': 'afr',
- 'ak': 'aka',
- 'am': 'amh',
- 'an': 'arg',
- 'ar': 'ara',
- 'as': 'asm',
- 'av': 'ava',
- 'ay': 'aym',
- 'az': 'aze',
- 'ba': 'bak',
- 'be': 'bel',
- 'bg': 'bul',
- 'bh': 'bih',
- 'bi': 'bis',
- 'bm': 'bam',
- 'bn': 'ben',
- 'bo': 'bod',
- 'br': 'bre',
- 'bs': 'bos',
- 'ca': 'cat',
- 'ce': 'che',
- 'ch': 'cha',
- 'co': 'cos',
- 'cr': 'cre',
- 'cs': 'ces',
- 'cu': 'chu',
- 'cv': 'chv',
- 'cy': 'cym',
- 'da': 'dan',
- 'de': 'deu',
- 'dv': 'div',
- 'dz': 'dzo',
- 'ee': 'ewe',
- 'el': 'ell',
- 'en': 'eng',
- 'eo': 'epo',
- 'es': 'spa',
- 'et': 'est',
- 'eu': 'eus',
- 'fa': 'fas',
- 'ff': 'ful',
- 'fi': 'fin',
- 'fj': 'fij',
- 'fo': 'fao',
- 'fr': 'fra',
- 'fy': 'fry',
- 'ga': 'gle',
- 'gd': 'gla',
- 'gl': 'glg',
- 'gn': 'grn',
- 'gu': 'guj',
- 'gv': 'glv',
- 'ha': 'hau',
- 'he': 'heb',
- 'hi': 'hin',
- 'ho': 'hmo',
- 'hr': 'hrv',
- 'ht': 'hat',
- 'hu': 'hun',
- 'hy': 'hye',
- 'hz': 'her',
- 'ia': 'ina',
- 'id': 'ind',
- 'ie': 'ile',
- 'ig': 'ibo',
- 'ii': 'iii',
- 'ik': 'ipk',
- 'io': 'ido',
- 'is': 'isl',
- 'it': 'ita',
- 'iu': 'iku',
- 'ja': 'jpn',
- 'jv': 'jav',
- 'ka': 'kat',
- 'kg': 'kon',
- 'ki': 'kik',
- 'kj': 'kua',
- 'kk': 'kaz',
- 'kl': 'kal',
- 'km': 'khm',
- 'kn': 'kan',
- 'ko': 'kor',
- 'kr': 'kau',
- 'ks': 'kas',
- 'ku': 'kur',
- 'kv': 'kom',
- 'kw': 'cor',
- 'ky': 'kir',
- 'la': 'lat',
- 'lb': 'ltz',
- 'lg': 'lug',
- 'li': 'lim',
- 'ln': 'lin',
- 'lo': 'lao',
- 'lt': 'lit',
- 'lu': 'lub',
- 'lv': 'lav',
- 'mg': 'mlg',
- 'mh': 'mah',
- 'mi': 'mri',
- 'mk': 'mkd',
- 'ml': 'mal',
- 'mn': 'mon',
- 'mr': 'mar',
- 'ms': 'msa',
- 'mt': 'mlt',
- 'my': 'mya',
- 'na': 'nau',
- 'nb': 'nob',
- 'nd': 'nde',
- 'ne': 'nep',
- 'ng': 'ndo',
- 'nl': 'nld',
- 'nn': 'nno',
- 'no': 'nor',
- 'nr': 'nbl',
- 'nv': 'nav',
- 'ny': 'nya',
- 'oc': 'oci',
- 'oj': 'oji',
- 'om': 'orm',
- 'or': 'ori',
- 'os': 'oss',
- 'pa': 'pan',
- 'pi': 'pli',
- 'pl': 'pol',
- 'ps': 'pus',
- 'pt': 'por',
- 'qu': 'que',
- 'rm': 'roh',
- 'rn': 'run',
- 'ro': 'ron',
- 'ru': 'rus',
- 'rw': 'kin',
- 'sa': 'san',
- 'sc': 'srd',
- 'sd': 'snd',
- 'se': 'sme',
- 'sg': 'sag',
- 'si': 'sin',
- 'sk': 'slk',
- 'sl': 'slv',
- 'sm': 'smo',
- 'sn': 'sna',
- 'so': 'som',
- 'sq': 'sqi',
- 'sr': 'srp',
- 'ss': 'ssw',
- 'st': 'sot',
- 'su': 'sun',
- 'sv': 'swe',
- 'sw': 'swa',
- 'ta': 'tam',
- 'te': 'tel',
- 'tg': 'tgk',
- 'th': 'tha',
- 'ti': 'tir',
- 'tk': 'tuk',
- 'tl': 'tgl',
- 'tn': 'tsn',
- 'to': 'ton',
- 'tr': 'tur',
- 'ts': 'tso',
- 'tt': 'tat',
- 'tw': 'twi',
- 'ty': 'tah',
- 'ug': 'uig',
- 'uk': 'ukr',
- 'ur': 'urd',
- 'uz': 'uzb',
- 've': 'ven',
- 'vi': 'vie',
- 'vo': 'vol',
- 'wa': 'wln',
- 'wo': 'wol',
- 'xh': 'xho',
- 'yi': 'yid',
- 'yo': 'yor',
- 'za': 'zha',
- 'zh': 'zho',
- 'zu': 'zul',
- }
-
- @classmethod
- def _conver_lang_code(cls, code):
- """Convert language code from ISO 639-1 to ISO 639-2/T"""
- return cls._lang_map.get(code[:2])
-
def run(self, information):
if information['ext'] not in ['mp4', 'mkv']:
self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 or mkv files')
@@ -525,7 +333,7 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
opts += ['-c:s', 'mov_text']
for (i, lang) in enumerate(sub_langs):
opts.extend(['-map', '%d:0' % (i + 1)])
- lang_code = self._conver_lang_code(lang)
+ lang_code = ISO639Utils.short2long(lang)
if lang_code is not None:
opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code])
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 52d198fa3..a2746b2d1 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1841,7 +1841,10 @@ def srt_subtitles_timecode(seconds):
def dfxp2srt(dfxp_data):
- _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'})
+ _x = functools.partial(xpath_with_ns, ns_map={
+ 'ttml': 'http://www.w3.org/ns/ttml',
+ 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
+ })
def parse_node(node):
str_or_empty = functools.partial(str_or_none, default='')
@@ -1849,9 +1852,9 @@ def dfxp2srt(dfxp_data):
out = str_or_empty(node.text)
for child in node:
- if child.tag in (_x('ttml:br'), 'br'):
+ if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
out += '\n' + str_or_empty(child.tail)
- elif child.tag in (_x('ttml:span'), 'span'):
+ elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
out += str_or_empty(parse_node(child))
else:
out += str_or_empty(xml.etree.ElementTree.tostring(child))
@@ -1860,7 +1863,7 @@ def dfxp2srt(dfxp_data):
dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
out = []
- paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
+ paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
if not paras:
raise ValueError('Invalid dfxp/TTML subtitle')
@@ -1879,6 +1882,208 @@ def dfxp2srt(dfxp_data):
return ''.join(out)
+class ISO639Utils(object):
+ # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
+ _lang_map = {
+ 'aa': 'aar',
+ 'ab': 'abk',
+ 'ae': 'ave',
+ 'af': 'afr',
+ 'ak': 'aka',
+ 'am': 'amh',
+ 'an': 'arg',
+ 'ar': 'ara',
+ 'as': 'asm',
+ 'av': 'ava',
+ 'ay': 'aym',
+ 'az': 'aze',
+ 'ba': 'bak',
+ 'be': 'bel',
+ 'bg': 'bul',
+ 'bh': 'bih',
+ 'bi': 'bis',
+ 'bm': 'bam',
+ 'bn': 'ben',
+ 'bo': 'bod',
+ 'br': 'bre',
+ 'bs': 'bos',
+ 'ca': 'cat',
+ 'ce': 'che',
+ 'ch': 'cha',
+ 'co': 'cos',
+ 'cr': 'cre',
+ 'cs': 'ces',
+ 'cu': 'chu',
+ 'cv': 'chv',
+ 'cy': 'cym',
+ 'da': 'dan',
+ 'de': 'deu',
+ 'dv': 'div',
+ 'dz': 'dzo',
+ 'ee': 'ewe',
+ 'el': 'ell',
+ 'en': 'eng',
+ 'eo': 'epo',
+ 'es': 'spa',
+ 'et': 'est',
+ 'eu': 'eus',
+ 'fa': 'fas',
+ 'ff': 'ful',
+ 'fi': 'fin',
+ 'fj': 'fij',
+ 'fo': 'fao',
+ 'fr': 'fra',
+ 'fy': 'fry',
+ 'ga': 'gle',
+ 'gd': 'gla',
+ 'gl': 'glg',
+ 'gn': 'grn',
+ 'gu': 'guj',
+ 'gv': 'glv',
+ 'ha': 'hau',
+ 'he': 'heb',
+ 'hi': 'hin',
+ 'ho': 'hmo',
+ 'hr': 'hrv',
+ 'ht': 'hat',
+ 'hu': 'hun',
+ 'hy': 'hye',
+ 'hz': 'her',
+ 'ia': 'ina',
+ 'id': 'ind',
+ 'ie': 'ile',
+ 'ig': 'ibo',
+ 'ii': 'iii',
+ 'ik': 'ipk',
+ 'io': 'ido',
+ 'is': 'isl',
+ 'it': 'ita',
+ 'iu': 'iku',
+ 'ja': 'jpn',
+ 'jv': 'jav',
+ 'ka': 'kat',
+ 'kg': 'kon',
+ 'ki': 'kik',
+ 'kj': 'kua',
+ 'kk': 'kaz',
+ 'kl': 'kal',
+ 'km': 'khm',
+ 'kn': 'kan',
+ 'ko': 'kor',
+ 'kr': 'kau',
+ 'ks': 'kas',
+ 'ku': 'kur',
+ 'kv': 'kom',
+ 'kw': 'cor',
+ 'ky': 'kir',
+ 'la': 'lat',
+ 'lb': 'ltz',
+ 'lg': 'lug',
+ 'li': 'lim',
+ 'ln': 'lin',
+ 'lo': 'lao',
+ 'lt': 'lit',
+ 'lu': 'lub',
+ 'lv': 'lav',
+ 'mg': 'mlg',
+ 'mh': 'mah',
+ 'mi': 'mri',
+ 'mk': 'mkd',
+ 'ml': 'mal',
+ 'mn': 'mon',
+ 'mr': 'mar',
+ 'ms': 'msa',
+ 'mt': 'mlt',
+ 'my': 'mya',
+ 'na': 'nau',
+ 'nb': 'nob',
+ 'nd': 'nde',
+ 'ne': 'nep',
+ 'ng': 'ndo',
+ 'nl': 'nld',
+ 'nn': 'nno',
+ 'no': 'nor',
+ 'nr': 'nbl',
+ 'nv': 'nav',
+ 'ny': 'nya',
+ 'oc': 'oci',
+ 'oj': 'oji',
+ 'om': 'orm',
+ 'or': 'ori',
+ 'os': 'oss',
+ 'pa': 'pan',
+ 'pi': 'pli',
+ 'pl': 'pol',
+ 'ps': 'pus',
+ 'pt': 'por',
+ 'qu': 'que',
+ 'rm': 'roh',
+ 'rn': 'run',
+ 'ro': 'ron',
+ 'ru': 'rus',
+ 'rw': 'kin',
+ 'sa': 'san',
+ 'sc': 'srd',
+ 'sd': 'snd',
+ 'se': 'sme',
+ 'sg': 'sag',
+ 'si': 'sin',
+ 'sk': 'slk',
+ 'sl': 'slv',
+ 'sm': 'smo',
+ 'sn': 'sna',
+ 'so': 'som',
+ 'sq': 'sqi',
+ 'sr': 'srp',
+ 'ss': 'ssw',
+ 'st': 'sot',
+ 'su': 'sun',
+ 'sv': 'swe',
+ 'sw': 'swa',
+ 'ta': 'tam',
+ 'te': 'tel',
+ 'tg': 'tgk',
+ 'th': 'tha',
+ 'ti': 'tir',
+ 'tk': 'tuk',
+ 'tl': 'tgl',
+ 'tn': 'tsn',
+ 'to': 'ton',
+ 'tr': 'tur',
+ 'ts': 'tso',
+ 'tt': 'tat',
+ 'tw': 'twi',
+ 'ty': 'tah',
+ 'ug': 'uig',
+ 'uk': 'ukr',
+ 'ur': 'urd',
+ 'uz': 'uzb',
+ 've': 'ven',
+ 'vi': 'vie',
+ 'vo': 'vol',
+ 'wa': 'wln',
+ 'wo': 'wol',
+ 'xh': 'xho',
+ 'yi': 'yid',
+ 'yo': 'yor',
+ 'za': 'zha',
+ 'zh': 'zho',
+ 'zu': 'zul',
+ }
+
+ @classmethod
+ def short2long(cls, code):
+ """Convert language code from ISO 639-1 to ISO 639-2/T"""
+ return cls._lang_map.get(code[:2])
+
+ @classmethod
+ def long2short(cls, code):
+ """Convert language code from ISO 639-2/T to ISO 639-1"""
+ for short_name, long_name in cls._lang_map.items():
+ if long_name == code:
+ return short_name
+
+
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
def __init__(self, proxies=None):
# Set default handlers