aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor/youtube.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor/youtube.py')
-rw-r--r--youtube_dl/extractor/youtube.py196
1 files changed, 61 insertions, 135 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index e402ef17f..571c73889 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -7,6 +7,7 @@ import socket
import itertools
from .common import InfoExtractor, SearchInfoExtractor
+from .subtitles import SubtitlesIE
from ..utils import (
compat_http_client,
compat_parse_qs,
@@ -23,114 +24,67 @@ from ..utils import (
orderedSet,
)
-class YoutubeBaseInfoExtractor(InfoExtractor):
- """Provide base functions for Youtube extractors"""
- _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
- _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
- _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
- _NETRC_MACHINE = 'youtube'
- # If True it will raise an error if no login info is provided
- _LOGIN_REQUIRED = False
-
- def report_lang(self):
- """Report attempt to set language."""
- self.to_screen(u'Setting language')
-
- def _set_language(self):
- request = compat_urllib_request.Request(self._LANG_URL)
- try:
- self.report_lang()
- compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
- return False
- return True
-
- def _login(self):
- (username, password) = self._get_login_info()
- # No authentication to be performed
- if username is None:
- if self._LOGIN_REQUIRED:
- raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
- return False
-
- request = compat_urllib_request.Request(self._LOGIN_URL)
- try:
- login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
- return False
-
- galx = None
- dsh = None
- match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
- if match:
- galx = match.group(1)
- match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
- if match:
- dsh = match.group(1)
-
- # Log in
- login_form_strs = {
- u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
- u'Email': username,
- u'GALX': galx,
- u'Passwd': password,
- u'PersistentCookie': u'yes',
- u'_utf8': u'霱',
- u'bgresponse': u'js_disabled',
- u'checkConnection': u'',
- u'checkedDomains': u'youtube',
- u'dnConn': u'',
- u'dsh': dsh,
- u'pstMsg': u'0',
- u'rmShown': u'1',
- u'secTok': u'',
- u'signIn': u'Sign in',
- u'timeStmp': u'',
- u'service': u'youtube',
- u'uilel': u'3',
- u'hl': u'en_US',
- }
- # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
- # chokes on unicode
- login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
- login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
- request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
+
+class YoutubeSubtitlesIE(SubtitlesIE):
+
+ def _get_available_subtitles(self, video_id):
+ request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
try:
- self.report_login()
- login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
- if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
- self._downloader.report_warning(u'unable to log in: bad username or password')
- return False
+ sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
- return False
- return True
-
- def _confirm_age(self):
- age_form = {
- 'next_url': '/',
- 'action_confirm': 'Confirm',
- }
- request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
+ self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
+ return {}
+ lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
+
+ sub_lang_list = {}
+ for l in lang_list:
+ lang = l[1]
+ params = compat_urllib_parse.urlencode({
+ 'lang': lang,
+ 'v': video_id,
+ 'fmt': self._downloader.params.get('subtitlesformat'),
+ })
+ url = u'http://www.youtube.com/api/timedtext?' + params
+ sub_lang_list[lang] = url
+ if not sub_lang_list:
+ self._downloader.report_warning(u'video doesn\'t have subtitles')
+ return {}
+ return sub_lang_list
+
+ def _request_automatic_caption(self, video_id, webpage):
+ """We need the webpage for getting the captions url, pass it as an
+ argument to speed up the process."""
+ sub_lang = self._downloader.params.get('subtitleslang') or 'en'
+ sub_format = self._downloader.params.get('subtitlesformat')
+ self.to_screen(u'%s: Looking for automatic captions' % video_id)
+ mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
+ err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
+ if mobj is None:
+ self._downloader.report_warning(err_msg)
+ return {}
+ player_config = json.loads(mobj.group(1))
try:
- self.report_age_confirmation()
- compat_urllib_request.urlopen(request).read().decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
- return True
+ args = player_config[u'args']
+ caption_url = args[u'ttsurl']
+ timestamp = args[u'timestamp']
+ params = compat_urllib_parse.urlencode({
+ 'lang': 'en',
+ 'tlang': sub_lang,
+ 'fmt': sub_format,
+ 'ts': timestamp,
+ 'kind': 'asr',
+ })
+ subtitles_url = caption_url + '&' + params
+ sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
+ return {sub_lang: sub}
+ # An extractor error can be raise by the download process if there are
+ # no automatic captions but there are subtitles
+ except (KeyError, ExtractorError):
+ self._downloader.report_warning(err_msg)
+ return {}
- def _real_initialize(self):
- if self._downloader is None:
- return
- if not self._set_language():
- return
- if not self._login():
- return
- self._confirm_age()
-class YoutubeIE(YoutubeBaseInfoExtractor):
+class YoutubeIE(YoutubeSubtitlesIE):
IE_DESC = u'YouTube.com'
_VALID_URL = r"""^
(
@@ -141,7 +95,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
(?: # the various things that can precede the ID:
(?:(?:v|embed|e)/) # v/ or embed/ or e/
|(?: # or the v= param in all its forms
- (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
+ (?:watch|movie(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
(?:\?|\#!?) # the params delimiter ? or # or #!
(?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
v=
@@ -384,19 +338,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
"""Report attempt to download video info webpage."""
self.to_screen(u'%s: Downloading video info webpage' % video_id)
- def report_video_subtitles_download(self, video_id):
- """Report attempt to download video info webpage."""
- self.to_screen(u'%s: Checking available subtitles' % video_id)
-
- def report_video_subtitles_request(self, video_id, sub_lang, format):
- """Report attempt to download video info webpage."""
- self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
-
- def report_video_subtitles_available(self, video_id, sub_lang_list):
- """Report available subtitles."""
- sub_lang = ",".join(list(sub_lang_list.keys()))
- self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
-
def report_information_extraction(self, video_id):
"""Report attempt to extract video information."""
self.to_screen(u'%s: Extracting video information' % video_id)
@@ -736,25 +677,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# subtitles
video_subtitles = None
- if self._downloader.params.get('writesubtitles', False):
- video_subtitles = self._extract_subtitle(video_id)
- if video_subtitles:
- (sub_error, sub_lang, sub) = video_subtitles[0]
- if sub_error:
- self._downloader.report_warning(sub_error)
-
- if self._downloader.params.get('writeautomaticsub', False):
+ if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False):
+ video_subtitles = self._extract_subtitles(video_id)
+ elif self._downloader.params.get('writeautomaticsub', False):
video_subtitles = self._request_automatic_caption(video_id, video_webpage)
- (sub_error, sub_lang, sub) = video_subtitles[0]
- if sub_error:
- self._downloader.report_warning(sub_error)
-
- if self._downloader.params.get('allsubtitles', False):
- video_subtitles = self._extract_all_subtitles(video_id)
- for video_subtitle in video_subtitles:
- (sub_error, sub_lang, sub) = video_subtitle
- if sub_error:
- self._downloader.report_warning(sub_error)
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(video_id)