# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
xpath_text,
str_to_int
)
from ..compat import compat_str
class MovieFapIE(InfoExtractor):
    """
    Information extractor for video pages on moviefap.com.

    The video page embeds the URL of an XML configuration document; that
    document lists the download URL(s), the container type, the start
    thumbnail and (optionally) the timeline preview images.  The remaining
    metadata is scraped from the HTML of the video page itself.
    """

    # the slug may contain digits as well as letters, '-' and '_'
    _VALID_URL = r'https?://(?:www\.)?moviefap\.com/videos/(?P<id>[0-9a-f]+)/(?P<name>[a-z0-9_-]+)'
    _TESTS = [{
        # normal, multi-format video
        'url': 'http://www.moviefap.com/videos/be9867c9416c19f54a4a/experienced-milf-amazing-handjob.html',
        'md5': '26624b4e2523051b550067d547615906',
        'info_dict': {
            'id': 'be9867c9416c19f54a4a',
            'ext': 'mp4',
            'title': 'Experienced MILF Amazing Handjob',
            'description': 'Experienced MILF giving an Amazing Handjob',
            'thumbnail': 'http://img.moviefap.com/a16:9w990r/thumbs/be/322032-20l.jpg',
            'uploader_id': 'darvinfred06',
            'display_id': 'experienced-milf-amazing-handjob',
            'categories': ['Amateur', 'Masturbation', 'Mature', 'Flashing']
        }
    }, {
        # quirky single-format case where the extension is given as fid, but the video is really an flv
        'url': 'http://www.moviefap.com/videos/e5da0d3edce5404418f5/jeune-couple-russe.html',
        'md5': 'fa56683e291fc80635907168a743c9ad',
        'info_dict': {
            'id': 'e5da0d3edce5404418f5',
            'ext': 'flv',
            'title': 'Jeune Couple Russe',
            'description': 'Amateur',
            'thumbnail': 'http://pic.moviefap.com/thumbs/e5/949-18l.jpg',
            'uploader_id': 'whiskeyjar',
            'display_id': 'jeune-couple-russe',
            'categories': ['Amateur', 'Teen']
        }
    }]

    @staticmethod
    def __get_thumbnail_data(xml):
        """
        Constructs a list of video thumbnails from timeline preview images.

        :param xml: the information XML document to parse
        :return: a list of dicts with 'url', 'width' and 'height' keys, one
                 per image index from imageFirst to imageLast inclusive;
                 an empty list when the timeline data is absent or incomplete
        """
        timeline = xml.find('timeline')
        if timeline is None:
            # not all videos have the data - ah well
            return []

        # get the required information from the XML; xpath_text is used
        # rather than find().text so that a missing child element does not
        # blow up with an AttributeError
        width = str_to_int(xpath_text(timeline, 'imageWidth'))
        height = str_to_int(xpath_text(timeline, 'imageHeight'))
        first = str_to_int(xpath_text(timeline, 'imageFirst'))
        last = str_to_int(xpath_text(timeline, 'imageLast'))
        pattern = xpath_text(timeline, 'imagePattern')

        # without the pattern and the index range no URL can be generated
        if pattern is None or first is None or last is None:
            return []

        # generate the list of thumbnail information dicts; '#' in the
        # pattern is the placeholder for the image index
        return [{
            'url': pattern.replace('#', compat_str(i)),
            'width': width,
            'height': height,
        } for i in range(first, last + 1)]

    def _real_extract(self, url):
        """
        Extracts the formats and metadata for a single video page.

        :param url: the video page URL (must match _VALID_URL)
        :return: the info dict describing the video
        """
        video_id = self._match_id(url)
        display_id = re.match(self._VALID_URL, url).group('name')

        webpage = self._download_webpage(url, video_id)

        # find and retrieve the XML document detailing video download URLs
        info_url = self._html_search_regex(
            r'flashvars\.config = escape\("(.+?)"', webpage, 'player parameters')
        xml = self._download_xml(info_url, video_id)

        # find the video container; guess flv when it is not declared
        ext = xpath_text(xml, 'videoConfig/type') or 'flv'

        # work out the video URL(s)
        formats = []
        if xml.find('videoLink') is not None:
            # single format available
            formats.append({
                'url': xpath_text(xml, 'videoLink', 'url', True),
                'ext': ext,
            })
        else:
            # multiple formats available; a missing <quality> element simply
            # yields no items and is reported by _sort_formats below
            for item in xml.findall('quality/item'):
                resolution = xpath_text(item, 'res', 'resolution', True)  # 480p etc.
                height_match = re.search(r'\d+', resolution)
                formats.append({
                    'url': xpath_text(item, 'videoLink', 'url', True),
                    'ext': ext,
                    'resolution': resolution,
                    'height': int(height_match.group()) if height_match else None,
                })
        self._sort_formats(formats)

        # optional fields: each search below is non-fatal, so every result
        # may be None and has to be converted defensively
        rating = self._html_search_regex(
            r'Current Rating<br> <strong>(.*?)</strong>', webpage, 'average_rating', fatal=False)
        try:
            average_rating = float(rating)
        except (TypeError, ValueError):
            # rating missing from the page or not a number
            average_rating = None

        categories = self._html_search_regex(
            r'</div>\s*(.*?)\s*<br>', webpage, 'categories', fatal=False)

        # fall back to the requested URL so that the field is never None
        webpage_url = self._html_search_regex(
            r'name="link" value="(.*?)"', webpage, 'webpage_url', fatal=False) or url

        return {
            'id': video_id,
            'formats': formats,
            'title': self._html_search_regex(
                r'<div id="view_title"><h1>(.*?)</h1>', webpage, 'title'),
            'display_id': display_id,
            'thumbnails': self.__get_thumbnail_data(xml),
            'thumbnail': xpath_text(xml, 'startThumb', 'thumbnail'),
            'description': self._html_search_regex(
                r'name="description" value="(.*?)"', webpage, 'description', fatal=False),
            'uploader_id': self._html_search_regex(
                r'name="username" value="(.*?)"', webpage, 'uploader_id', fatal=False),
            'view_count': str_to_int(self._html_search_regex(
                r'<br>Views <strong>([0-9]+)</strong>', webpage, 'view_count', fatal=False)),
            'average_rating': average_rating,
            'comment_count': str_to_int(self._html_search_regex(
                r'<span id="comCount">([0-9]+)</span>', webpage, 'comment_count', fatal=False)),
            'age_limit': 18,
            'webpage_url': webpage_url,
            'categories': categories.split(', ') if categories else None,
        }