diff options
| author | Remita Amine <remitamine@gmail.com> | 2020-12-24 12:59:46 +0100 | 
|---|---|---|
| committer | Remita Amine <remitamine@gmail.com> | 2020-12-24 13:10:20 +0100 | 
| commit | ecaa535cf42f1e055056b590a4f7c870ded6c339 (patch) | |
| tree | d6c5fb455104ba4df4cf136d1d8204e61bc30ae6 | |
| parent | 79dd92b1fe94270a713af9d23718ad48e70133b9 (diff) | |
[facebook] add support for watchparty pages(closes #27507)
| -rw-r--r-- | youtube_dl/extractor/facebook.py | 221 | 
1 files changed, 139 insertions, 82 deletions
| diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 147deccc9..cb34c59f5 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -1,6 +1,7 @@  # coding: utf-8  from __future__ import unicode_literals +import json  import re  import socket @@ -8,6 +9,7 @@ from .common import InfoExtractor  from ..compat import (      compat_etree_fromstring,      compat_http_client, +    compat_str,      compat_urllib_error,      compat_urllib_parse_unquote,      compat_urllib_parse_unquote_plus, @@ -47,7 +49,8 @@ class FacebookIE(InfoExtractor):                              )\?(?:.*?)(?:v|video_id|story_fbid)=|                              [^/]+/videos/(?:[^/]+/)?|                              [^/]+/posts/| -                            groups/[^/]+/permalink/ +                            groups/[^/]+/permalink/| +                            watchparty/                          )|                      facebook:                  ) @@ -280,8 +283,18 @@ class FacebookIE(InfoExtractor):          # data.video.creation_story.attachments[].media          'url': 'https://www.facebook.com/watch/live/?v=1823658634322275',          'only_matching': True, +    }, { +        'url': 'https://www.facebook.com/watchparty/211641140192478', +        'info_dict': { +            'id': '211641140192478', +        }, +        'playlist_count': 1, +        'skip': 'Requires logging in',      }]      _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)' +    _api_config = { +        'graphURI': '/api/graphql/' +    }      @staticmethod      def _extract_urls(webpage): @@ -405,6 +418,17 @@ class FacebookIE(InfoExtractor):              self._sort_formats(formats) +        def extract_relay_data(_filter): +            return self._parse_json(self._search_regex( +                r'handleWithCustomApplyEach\([^,]+,\s*({.*?%s.*?})\);' % _filter, +                webpage, 'replay data', default='{}'), video_id, fatal=False) or {} + +        def extract_relay_prefetched_data(_filter): +            replay_data = extract_relay_data(_filter) +            for require in (replay_data.get('require') or []): +                if require[0] == 'RelayPrefetchedStreamCache': +                    return try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {} +          if not video_data:              server_js_data = self._parse_json(self._search_regex([                  r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX, @@ -413,87 +437,83 @@ class FacebookIE(InfoExtractor):              video_data = extract_from_jsmods_instances(server_js_data)          if not video_data: -            graphql_data = self._parse_json(self._search_regex( -                r'handleWithCustomApplyEach\([^,]+,\s*({.*?"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+".*?})\);', -                webpage, 'graphql data', default='{}'), video_id, fatal=False) or {} -            for require in (graphql_data.get('require') or []): -                if require[0] == 'RelayPrefetchedStreamCache': -                    entries = [] - -                    def parse_graphql_video(video): -                        formats = [] -                        q = qualities(['sd', 'hd']) -                        for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]: -                            playable_url = video.get('playable_url' + suffix) -                            if not playable_url: -                                continue -                            formats.append({ -                                'format_id': format_id, -                                'quality': q(format_id), -                                'url': playable_url, -                            }) -                        extract_dash_manifest(video, formats) -                        process_formats(formats) -                        v_id = video.get('videoId') or video.get('id') or video_id -                        info = { -                            'id': v_id, -                            'formats': formats, -                            'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']), -                            'uploader_id': try_get(video, lambda x: x['owner']['id']), -                            'timestamp': int_or_none(video.get('publish_time')), -                            'duration': float_or_none(video.get('playable_duration_in_ms'), 1000), -                        } -                        description = try_get(video, lambda x: x['savable_description']['text']) -                        title = video.get('name') -                        if title: -                            info.update({ -                                'title': title, -                                'description': description, -                            }) -                        else: -                            info['title'] = description or 'Facebook video #%s' % v_id -                        entries.append(info) - -                    def parse_attachment(attachment, key='media'): -                        media = attachment.get(key) or {} -                        if media.get('__typename') == 'Video': -                            return parse_graphql_video(media) - -                    data = try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {} - -                    nodes = data.get('nodes') or [] -                    node = data.get('node') or {} -                    if not nodes and node: -                        nodes.append(node) -                    for node in nodes: -                        story = try_get(node, lambda x: x['comet_sections']['content']['story'], dict) or {} -                        attachments = try_get(story, [ -                            lambda x: x['attached_story']['attachments'], -                            lambda x: x['attachments'] -                        ], list) or [] -                        for attachment in attachments: -                            attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict) -                            ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or [] -                            for n in ns: -                                parse_attachment(n) -                            parse_attachment(attachment) - -                    edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or [] -                    for edge in edges: -                        parse_attachment(edge, key='node') - -                    video = data.get('video') or {} -                    if video: -                        attachments = try_get(video, [ -                            lambda x: x['story']['attachments'], -                            lambda x: x['creation_story']['attachments'] -                        ], list) or [] -                        for attachment in attachments: -                            parse_attachment(attachment) -                        if not entries: -                            parse_graphql_video(video) - -                    return self.playlist_result(entries, video_id) +            data = extract_relay_prefetched_data( +                r'"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+"') +            if data: +                entries = [] + +                def parse_graphql_video(video): +                    formats = [] +                    q = qualities(['sd', 'hd']) +                    for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]: +                        playable_url = video.get('playable_url' + suffix) +                        if not playable_url: +                            continue +                        formats.append({ +                            'format_id': format_id, +                            'quality': q(format_id), +                            'url': playable_url, +                        }) +                    extract_dash_manifest(video, formats) +                    process_formats(formats) +                    v_id = video.get('videoId') or video.get('id') or video_id +                    info = { +                        'id': v_id, +                        'formats': formats, +                        'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']), +                        'uploader_id': try_get(video, lambda x: x['owner']['id']), +                        'timestamp': int_or_none(video.get('publish_time')), +                        'duration': float_or_none(video.get('playable_duration_in_ms'), 1000), +                    } +                    description = try_get(video, lambda x: x['savable_description']['text']) +                    title = video.get('name') +                    if title: +                        info.update({ +                            'title': title, +                            'description': description, +                        }) +                    else: +                        info['title'] = description or 'Facebook video #%s' % v_id +                    entries.append(info) + +                def parse_attachment(attachment, key='media'): +                    media = attachment.get(key) or {} +                    if media.get('__typename') == 'Video': +                        return parse_graphql_video(media) + +                nodes = data.get('nodes') or [] +                node = data.get('node') or {} +                if not nodes and node: +                    nodes.append(node) +                for node in nodes: +                    story = try_get(node, lambda x: x['comet_sections']['content']['story'], dict) or {} +                    attachments = try_get(story, [ +                        lambda x: x['attached_story']['attachments'], +                        lambda x: x['attachments'] +                    ], list) or [] +                    for attachment in attachments: +                        attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict) +                        ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or [] +                        for n in ns: +                            parse_attachment(n) +                        parse_attachment(attachment) + +                edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or [] +                for edge in edges: +                    parse_attachment(edge, key='node') + +                video = data.get('video') or {} +                if video: +                    attachments = try_get(video, [ +                        lambda x: x['story']['attachments'], +                        lambda x: x['creation_story']['attachments'] +                    ], list) or [] +                    for attachment in attachments: +                        parse_attachment(attachment) +                    if not entries: +                        parse_graphql_video(video) + +                return self.playlist_result(entries, video_id)          if not video_data:              m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage) @@ -504,6 +524,43 @@ class FacebookIE(InfoExtractor):              elif '>You must log in to continue' in webpage:                  self.raise_login_required() +        if not video_data and '/watchparty/' in url: +            post_data = { +                'doc_id': 3731964053542869, +                'variables': json.dumps({ +                    'livingRoomID': video_id, +                }), +            } + +            prefetched_data = extract_relay_prefetched_data(r'"login_data"\s*:\s*{') +            if prefetched_data: +                lsd = try_get(prefetched_data, lambda x: x['login_data']['lsd'], dict) +                if lsd: +                    post_data[lsd['name']] = lsd['value'] + +            relay_data = extract_relay_data(r'\[\s*"RelayAPIConfigDefaults"\s*,') +            for define in (relay_data.get('define') or []): +                if define[0] == 'RelayAPIConfigDefaults': +                    self._api_config = define[2] + +            living_room = self._download_json( +                urljoin(url, self._api_config['graphURI']), video_id, +                data=urlencode_postdata(post_data))['data']['living_room'] + +            entries = [] +            for edge in (try_get(living_room, lambda x: x['recap']['watched_content']['edges']) or []): +                video = try_get(edge, lambda x: x['node']['video']) or {} +                v_id = video.get('id') +                if not v_id: +                    continue +                v_id = compat_str(v_id) +                entries.append(self.url_result( +                    self._VIDEO_PAGE_TEMPLATE % v_id, +                    self.ie_key(), v_id, video.get('name'))) + +            return self.playlist_result(entries, video_id) + +        if not video_data:              # Video info not in first request, do a secondary request using              # tahoe player specific URL              tahoe_data = self._download_webpage( | 
