diff options
| -rw-r--r-- | youtube_dl/__init__.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 3 | ||||
| -rw-r--r-- | youtube_dl/extractor/cracked.py | 65 | ||||
| -rw-r--r-- | youtube_dl/extractor/francetv.py | 4 | ||||
| -rw-r--r-- | youtube_dl/extractor/mlb.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/sapo.py | 119 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 2 | 
7 files changed, 193 insertions, 3 deletions
| diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 6e2359b28..f223b75f4 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -65,6 +65,7 @@ __authors__  = (      'Tobias Bell',      'Naglis Jonaitis',      'Charles Chen', +    'Hassaan Ali',  )  __license__ = 'Public Domain' diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 03d2e446d..a17a80a5f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -53,6 +53,7 @@ from .cnn import (  from .collegehumor import CollegeHumorIE  from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE  from .condenast import CondeNastIE +from .cracked import CrackedIE  from .criterion import CriterionIE  from .crunchyroll import CrunchyrollIE  from .cspan import CSpanIE @@ -252,6 +253,7 @@ from .rutube import (      RutubePersonIE,  )  from .rutv import RUTVIE +from .sapo import SapoIE  from .savefrom import SaveFromIE  from .scivee import SciVeeIE  from .screencast import ScreencastIE @@ -399,6 +401,7 @@ from .youtube import (      YoutubeUserIE,      YoutubeWatchLaterIE,  ) +  from .zdf import ZDFIE diff --git a/youtube_dl/extractor/cracked.py b/youtube_dl/extractor/cracked.py new file mode 100644 index 000000000..74b880ffc --- /dev/null +++ b/youtube_dl/extractor/cracked.py @@ -0,0 +1,65 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    parse_iso8601, +    str_to_int, +) + + +class CrackedIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?cracked\.com/video_(?P<id>\d+)_[\da-z-]+\.html' +    _TEST = { +        'url': 'http://www.cracked.com/video_19006_4-plot-holes-you-didnt-notice-in-your-favorite-movies.html', +        'md5': '4b29a5eeec292cd5eca6388c7558db9e', +        'info_dict': { +            'id': '19006', +            'ext': 'mp4', +            'title': '4 Plot Holes You Didn\'t Notice in Your Favorite Movies', +            'description': 'md5:3b909e752661db86007d10e5ec2df769', +            'timestamp': 1405659600, +            'upload_date': '20140718', +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) + +        video_url = self._html_search_regex( +            [r'var\s+CK_vidSrc\s*=\s*"([^"]+)"', r'<video\s+src="([^"]+)"'], webpage, 'video URL') + +        title = self._og_search_title(webpage) +        description = self._og_search_description(webpage) + +        timestamp = self._html_search_regex(r'<time datetime="([^"]+)"', webpage, 'upload date', fatal=False) +        if timestamp: +            timestamp = parse_iso8601(timestamp[:-6]) + +        view_count = str_to_int(self._html_search_regex( +            r'<span class="views" id="viewCounts">([\d,\.]+) Views</span>', webpage, 'view count', fatal=False)) +        comment_count = str_to_int(self._html_search_regex( +            r'<span id="commentCounts">([\d,\.]+)</span>', webpage, 'comment count', fatal=False)) + +        m = re.search(r'_(?P<width>\d+)X(?P<height>\d+)\.mp4$', video_url) +        if m: +            width = int(m.group('width')) +            height = int(m.group('height')) +        else: +            width = height = None + +        return { +            'id': video_id, +            'url':video_url, +            'title': title, +            'description': description, +            'timestamp': timestamp, +            'view_count': view_count, +            'comment_count': comment_count, +            'height': height, +            'width': width, +        }
\ No newline at end of file diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index f3e0f38b7..1fbe6d175 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -48,7 +48,7 @@ class PluzzIE(FranceTVBaseInfoExtractor):  class FranceTvInfoIE(FranceTVBaseInfoExtractor):      IE_NAME = 'francetvinfo.fr' -    _VALID_URL = r'https?://www\.francetvinfo\.fr/.*/(?P<title>.+)\.html' +    _VALID_URL = r'https?://(?:www|mobile)\.francetvinfo\.fr/.*/(?P<title>.+)\.html'      _TESTS = [{          'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', @@ -211,7 +211,7 @@ class GenerationQuoiIE(InfoExtractor):  class CultureboxIE(FranceTVBaseInfoExtractor):      IE_NAME = 'culturebox.francetvinfo.fr' -    _VALID_URL = r'https?://culturebox\.francetvinfo\.fr/(?P<name>.*?)(\?|$)' +    _VALID_URL = r'https?://(?:m\.)?culturebox\.francetvinfo\.fr/(?P<name>.*?)(\?|$)'      _TEST = {          'url': 'http://culturebox.francetvinfo.fr/einstein-on-the-beach-au-theatre-du-chatelet-146813', diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py index 18ab2c135..c28be3a7d 100644 --- a/youtube_dl/extractor/mlb.py +++ b/youtube_dl/extractor/mlb.py @@ -11,7 +11,7 @@ from ..utils import (  class MLBIE(InfoExtractor): -    _VALID_URL = r'http?://m\.mlb\.com/video/(?:topic/[\da-z_-]+/)?v(?P<id>n?\d+)' +    _VALID_URL = r'https?://m\.mlb\.com/video/(?:topic/[\da-z_-]+/)?v(?P<id>n?\d+)'      _TESTS = [          {              'url': 'http://m.mlb.com/video/topic/81536970/v34496663/mianym-stanton-practices-for-the-home-run-derby', diff --git a/youtube_dl/extractor/sapo.py b/youtube_dl/extractor/sapo.py new file mode 100644 index 000000000..172cc1275 --- /dev/null +++ b/youtube_dl/extractor/sapo.py @@ -0,0 +1,119 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    parse_duration, +    unified_strdate, +) + + +class SapoIE(InfoExtractor): +    IE_DESC = 'SAPO Vídeos' +    _VALID_URL = r'https?://(?:(?:v2|www)\.)?videos\.sapo\.(?:pt|cv|ao|mz|tl)/(?P<id>[\da-zA-Z]{20})' + +    _TESTS = [ +        { +            'url': 'http://videos.sapo.pt/UBz95kOtiWYUMTA5Ghfi', +            'md5': '79ee523f6ecb9233ac25075dee0eda83', +            'note': 'SD video', +            'info_dict': { +                'id': 'UBz95kOtiWYUMTA5Ghfi', +                'ext': 'mp4', +                'title': 'Benfica - Marcas na Hitória', +                'description': 'md5:c9082000a128c3fd57bf0299e1367f22', +                'duration': 264, +                'uploader': 'tiago_1988', +                'upload_date': '20080229', +                'categories': ['benfica', 'cabral', 'desporto', 'futebol', 'geovanni', 'hooijdonk', 'joao', 'karel', 'lisboa', 'miccoli'], +            }, +        }, +        { +            'url': 'http://videos.sapo.pt/IyusNAZ791ZdoCY5H5IF', +            'md5': '90a2f283cfb49193fe06e861613a72aa', +            'note': 'HD video', +            'info_dict': { +                'id': 'IyusNAZ791ZdoCY5H5IF', +                'ext': 'mp4', +                'title': 'Codebits VII - Report', +                'description': 'md5:6448d6fd81ce86feac05321f354dbdc8', +                'duration': 144, +                'uploader': 'codebits', +                'upload_date': '20140427', +                'categories': ['codebits', 'codebits2014'], +            }, +        }, +        { +            'url': 'http://v2.videos.sapo.pt/yLqjzPtbTimsn2wWBKHz', +            'md5': 'e5aa7cc0bdc6db9b33df1a48e49a15ac', +            'note': 'v2 video', +            'info_dict': { +                'id': 'yLqjzPtbTimsn2wWBKHz', +                'ext': 'mp4', +                'title': 'Hipnose Condicionativa 4', +                'description': 'md5:ef0481abf8fb4ae6f525088a6dadbc40', +                'duration': 692, +                'uploader': 'sapozen', +                'upload_date': '20090609', +                'categories': ['condicionativa', 'heloisa', 'hipnose', 'miranda', 'sapo', 'zen'], +            }, +        }, +    ] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        item = self._download_xml( +            'http://rd3.videos.sapo.pt/%s/rss2' % video_id, video_id).find('./channel/item') + +        title = item.find('./title').text +        description = item.find('./{http://videos.sapo.pt/mrss/}synopse').text +        thumbnail = item.find('./{http://search.yahoo.com/mrss/}content').get('url') +        duration = parse_duration(item.find('./{http://videos.sapo.pt/mrss/}time').text) +        uploader = item.find('./{http://videos.sapo.pt/mrss/}author').text +        upload_date = unified_strdate(item.find('./pubDate').text) +        view_count = int(item.find('./{http://videos.sapo.pt/mrss/}views').text) +        comment_count = int(item.find('./{http://videos.sapo.pt/mrss/}comment_count').text) +        tags = item.find('./{http://videos.sapo.pt/mrss/}tags').text +        categories = tags.split() if tags else [] +        age_limit = 18 if item.find('./{http://videos.sapo.pt/mrss/}m18').text == 'true' else 0 + +        video_url = item.find('./{http://videos.sapo.pt/mrss/}videoFile').text +        video_size = item.find('./{http://videos.sapo.pt/mrss/}videoSize').text.split('x') + +        formats = [{ +            'url': video_url, +            'ext': 'mp4', +            'format_id': 'sd', +            'width': int(video_size[0]), +            'height': int(video_size[1]), +        }] + +        if item.find('./{http://videos.sapo.pt/mrss/}HD').text == 'true': +            formats.append({ +                'url': re.sub(r'/mov/1$', '/mov/39', video_url), +                'ext': 'mp4', +                'format_id': 'hd', +                'width': 1280, +                'height': 720, +            }) + +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'duration': duration, +            'uploader': uploader, +            'upload_date': upload_date, +            'view_count': view_count, +            'comment_count': comment_count, +            'categories': categories, +            'age_limit': age_limit, +            'formats': formats, +        } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 64a9618ca..919603c62 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1194,6 +1194,8 @@ def format_bytes(bytes):  def str_to_int(int_str): +    if int_str is None: +        return None      int_str = re.sub(r'[,\.]', u'', int_str)      return int(int_str) | 
