# youtube_dl/extractor/moviefap.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    xpath_text,
    str_to_int
)
from ..compat import compat_str


class MovieFapIE(InfoExtractor):
    """
    Information extractor for video pages on moviefap.com.

    The video page embeds the URL of an XML configuration document; that
    document lists either a single video link or a set of per-quality
    links, plus optional timeline preview images.
    """

    _VALID_URL = r'https?://(?:www\.)?moviefap\.com/videos/(?P<id>[0-9a-f]+)/(?P<name>[a-z-_]+)'
    _TESTS = [{
        # normal, multi-format video
        'url': 'http://www.moviefap.com/videos/be9867c9416c19f54a4a/experienced-milf-amazing-handjob.html',
        'md5': '26624b4e2523051b550067d547615906',
        'info_dict': {
            'id': 'be9867c9416c19f54a4a',
            'ext': 'mp4',
            'title': 'Experienced MILF Amazing Handjob',
            'description': 'Experienced MILF giving an Amazing Handjob',
            'thumbnail': 'http://img.moviefap.com/a16:9w990r/thumbs/be/322032-20l.jpg',
            'uploader_id': 'darvinfred06',
            'display_id': 'experienced-milf-amazing-handjob',
            'categories': ['Amateur', 'Masturbation', 'Mature', 'Flashing']
        }
    }, {
        # quirky single-format case where the extension is given as fid, but the video is really an flv
        'url': 'http://www.moviefap.com/videos/e5da0d3edce5404418f5/jeune-couple-russe.html',
        'md5': 'fa56683e291fc80635907168a743c9ad',
        'info_dict': {
            'id': 'e5da0d3edce5404418f5',
            'ext': 'flv',
            'title': 'Jeune Couple Russe',
            'description': 'Amateur',
            'thumbnail': 'http://pic.moviefap.com/thumbs/e5/949-18l.jpg',
            'uploader_id': 'whiskeyjar',
            'display_id': 'jeune-couple-russe',
            'categories': ['Amateur', 'Teen']
        }
    }]

    @staticmethod
    def __get_thumbnail_data(xml):
        """
        Constructs a list of video thumbnails from timeline preview images.

        :param xml: the information XML document to parse
        :return: a list of dicts with 'url', 'width' and 'height' keys, one
                 per preview image; empty when the document has no usable
                 timeline data
        """

        timeline = xml.find('timeline')
        if timeline is None:
            # not all videos have the data - ah well
            return []

        def int_field(tag):
            # A missing child element yields None rather than an AttributeError.
            node = timeline.find(tag)
            return str_to_int(node.text) if node is not None else None

        width = int_field('imageWidth')
        height = int_field('imageHeight')
        first = int_field('imageFirst')
        last = int_field('imageLast')
        pattern_node = timeline.find('imagePattern')
        pattern = pattern_node.text if pattern_node is not None else None

        # Thumbnails are optional metadata: without the index range or the
        # URL pattern there is nothing to generate, so give up quietly.
        if first is None or last is None or not pattern:
            return []

        # '#' in the pattern is the placeholder for the image index
        return [{
            'url': pattern.replace('#', compat_str(index)),
            'width': width,
            'height': height
        } for index in range(first, last + 1)]

    def _real_extract(self, url):
        """
        Builds the info dict for the video page at ``url``.

        Downloads the web page, follows the player configuration URL found
        in it to the XML document, and combines metadata scraped from the
        page with the video URL(s) listed in the XML.

        :param url: a URL matching _VALID_URL
        :return: the info dict, carrying either a single 'url' or a sorted
                 'formats' list
        """

        video_id = self._match_id(url)
        display_id = re.match(self._VALID_URL, url).group('name')
        webpage = self._download_webpage(url, video_id)

        # find and retrieve the XML document detailing video download URLs
        info_url = self._html_search_regex(
            r'flashvars\.config = escape\("(.+?)"', webpage, 'player parameters')
        xml = self._download_xml(info_url, video_id)

        # The rating is optional: the search may return None, and the page
        # text may not be a number, so convert defensively.
        rating = self._html_search_regex(
            r'Current Rating<br> <strong>(.*?)</strong>', webpage, 'average_rating', fatal=False)
        try:
            average_rating = float(rating)
        except (TypeError, ValueError):
            average_rating = None

        # Categories are optional as well; only split when something was found.
        categories = self._html_search_regex(
            r'</div>\s*(.*?)\s*<br>', webpage, 'categories', fatal=False)

        info = {
            'id': video_id,
            'title': self._html_search_regex(
                r'<div id="view_title"><h1>(.*?)</h1>', webpage, 'title'),
            'display_id': display_id,
            'thumbnails': self.__get_thumbnail_data(xml),
            'thumbnail': xpath_text(xml, 'startThumb', 'thumbnail'),
            'description': self._html_search_regex(
                r'name="description" value="(.*?)"', webpage, 'description', fatal=False),
            'uploader_id': self._html_search_regex(
                r'name="username" value="(.*?)"', webpage, 'uploader_id', fatal=False),
            'view_count': str_to_int(self._html_search_regex(
                r'<br>Views <strong>([0-9]+)</strong>', webpage, 'view_count', fatal=False)),
            'average_rating': average_rating,
            'comment_count': str_to_int(self._html_search_regex(
                r'<span id="comCount">([0-9]+)</span>', webpage, 'comment_count', fatal=False)),
            'age_limit': 18,
            # fall back to the requested URL when the page omits the link field
            'webpage_url': self._html_search_regex(
                r'name="link" value="(.*?)"', webpage, 'webpage_url', fatal=False) or url,
            'categories': categories.split(', ') if categories else None
        }

        # find and add the format; findtext gives None/'' when the node is
        # missing or empty, in which case we guess flv
        info['ext'] = xml.findtext('videoConfig/type') or 'flv'

        # work out the video URL(s)
        if xml.find('videoLink') is not None:
            # single format available
            info['url'] = xpath_text(xml, 'videoLink', 'url', True)
        else:
            # multiple formats available
            info['formats'] = []

            # findall on the path yields an empty list when <quality> is absent
            for item in xml.findall('quality/item'):
                resolution = xpath_text(item, 'res', 'resolution', True)  # 480p etc.
                height_match = re.search(r'\d+', resolution)
                info['formats'].append({
                    'url': xpath_text(item, 'videoLink', 'url', True),
                    'resolution': resolution,
                    'height': int(height_match.group()) if height_match else None
                })

            self._sort_formats(info['formats'])

        return info