1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
|
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
determine_ext,
mimetype2ext,
qualities,
remove_end,
)
class ImdbIE(InfoExtractor):
IE_NAME = 'imdb'
IE_DESC = 'Internet Movie Database trailers'
_VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title).+?[/-]vi(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.imdb.com/video/imdb/vi2524815897',
'info_dict': {
'id': '2524815897',
'ext': 'mp4',
'title': 'Ice Age: Continental Drift Trailer (No. 2)',
'description': 'md5:9061c2219254e5d14e03c25c98e96a81',
}
}, {
'url': 'http://www.imdb.com/video/_/vi2524815897',
'only_matching': True,
}, {
'url': 'http://www.imdb.com/title/tt1667889/?ref_=ext_shr_eml_vi#lb-vi2524815897',
'only_matching': True,
}, {
'url': 'http://www.imdb.com/title/tt1667889/#lb-vi2524815897',
'only_matching': True,
}, {
'url': 'http://www.imdb.com/videoplayer/vi1562949145',
'only_matching': True,
}, {
'url': 'http://www.imdb.com/title/tt4218696/videoplayer/vi2608641561',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage('http://www.imdb.com/video/imdb/vi%s' % video_id, video_id)
descr = self._html_search_regex(
r'(?s)<span itemprop="description">(.*?)</span>',
webpage, 'description', fatal=False)
player_url = 'http://www.imdb.com/video/imdb/vi%s/imdb/single' % video_id
player_page = self._download_webpage(
player_url, video_id, 'Downloading player page')
# the player page contains the info for the default format, we have to
# fetch other pages for the rest of the formats
extra_formats = re.findall(r'href="(?P<url>%s.*?)".*?>(?P<name>.*?)<' % re.escape(player_url), player_page)
format_pages = [
self._download_webpage(
f_url, video_id, 'Downloading info for %s format' % f_name)
for f_url, f_name in extra_formats]
format_pages.append(player_page)
quality = qualities(('SD', '480p', '720p', '1080p'))
formats = []
for format_page in format_pages:
json_data = self._search_regex(
r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>',
format_page, 'json data', flags=re.DOTALL)
info = self._parse_json(json_data, video_id, fatal=False)
if not info:
continue
format_info = info.get('videoPlayerObject', {}).get('video', {})
if not format_info:
continue
video_info_list = format_info.get('videoInfoList')
if not video_info_list or not isinstance(video_info_list, list):
continue
for video_info in video_info_list:
if not video_info or not isinstance(video_info, dict):
continue
video_url = video_info.get('videoUrl')
if not video_url or not isinstance(video_url, compat_str):
continue
if (video_info.get('videoMimeType') == 'application/x-mpegURL' or
determine_ext(video_url) == 'm3u8'):
formats.extend(self._extract_m3u8_formats(
video_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False))
continue
format_id = format_info.get('ffname')
formats.append({
'format_id': format_id,
'url': video_url,
'ext': mimetype2ext(video_info.get('videoMimeType')),
'quality': quality(format_id),
})
self._sort_formats(formats)
return {
'id': video_id,
'title': remove_end(self._og_search_title(webpage), ' - IMDb'),
'formats': formats,
'description': descr,
'thumbnail': format_info.get('slate'),
}
class ImdbListIE(InfoExtractor):
IE_NAME = 'imdb:list'
IE_DESC = 'Internet Movie Database lists'
_VALID_URL = r'https?://(?:www\.)?imdb\.com/list/(?P<id>[\da-zA-Z_-]{11})'
_TEST = {
'url': 'http://www.imdb.com/list/JFs9NWw6XI0',
'info_dict': {
'id': 'JFs9NWw6XI0',
'title': 'March 23, 2012 Releases',
},
'playlist_count': 7,
}
def _real_extract(self, url):
list_id = self._match_id(url)
webpage = self._download_webpage(url, list_id)
entries = [
self.url_result('http://www.imdb.com' + m, 'Imdb')
for m in re.findall(r'href="(/video/imdb/vi[^"]+)"\s+data-type="playlist"', webpage)]
list_title = self._html_search_regex(
r'<h1 class="header">(.*?)</h1>', webpage, 'list title')
return self.playlist_result(entries, list_id, list_title)
|