about | summary | refs | log | tree | commit | diff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- youtube_dl/extractor/__init__.py | 6
-rw-r--r-- youtube_dl/extractor/dailymotion.py | 27
-rw-r--r-- youtube_dl/extractor/generic.py | 34
-rw-r--r-- youtube_dl/extractor/globo.py | 398
-rw-r--r-- youtube_dl/extractor/googleplus.py | 75
-rw-r--r-- youtube_dl/extractor/gorillavid.py | 25
-rw-r--r-- youtube_dl/extractor/howstuffworks.py | 10
-rw-r--r-- youtube_dl/extractor/mixcloud.py | 2
-rw-r--r-- youtube_dl/extractor/mlb.py | 8
-rw-r--r-- youtube_dl/extractor/ndr.py | 14
-rw-r--r-- youtube_dl/extractor/niconico.py | 17
-rw-r--r-- youtube_dl/extractor/pornhd.py | 52
-rw-r--r-- youtube_dl/extractor/sportbox.py | 81
-rw-r--r-- youtube_dl/extractor/theonion.py | 70
-rw-r--r-- youtube_dl/extractor/thesixtyone.py | 100
-rw-r--r-- youtube_dl/extractor/vimeo.py | 2
-rw-r--r-- youtube_dl/extractor/walla.py | 89
-rw-r--r-- youtube_dl/extractor/yahoo.py | 145
18 files changed, 979 insertions, 176 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 348f5767a..5e38d2663 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -134,6 +134,7 @@ from .gamestar import GameStarIE
from .gametrailers import GametrailersIE
from .gdcvault import GDCVaultIE
from .generic import GenericIE
+from .globo import GloboIE
from .godtube import GodTubeIE
from .golem import GolemIE
from .googleplus import GooglePlusIE
@@ -346,6 +347,7 @@ from .spiegel import SpiegelIE, SpiegelArticleIE
from .spiegeltv import SpiegeltvIE
from .spike import SpikeIE
from .sport5 import Sport5IE
+from .sportbox import SportBoxIE
from .sportdeutschland import SportDeutschlandIE
from .stanfordoc import StanfordOpenClassroomIE
from .steam import SteamIE
@@ -369,7 +371,9 @@ from .telemb import TeleMBIE
from .tenplay import TenPlayIE
from .testurl import TestURLIE
from .tf1 import TF1IE
+from .theonion import TheOnionIE
from .theplatform import ThePlatformIE
+from .thesixtyone import TheSixtyOneIE
from .thisav import ThisAVIE
from .tinypic import TinyPicIE
from .tlc import TlcIE, TlcDeIE
@@ -438,6 +442,7 @@ from .vporn import VpornIE
from .vube import VubeIE
from .vuclip import VuClipIE
from .vulture import VultureIE
+from .walla import WallaIE
from .washingtonpost import WashingtonPostIE
from .wat import WatIE
from .wayofthemaster import WayOfTheMasterIE
@@ -459,7 +464,6 @@ from .xvideos import XVideosIE
from .xtube import XTubeUserIE, XTubeIE
from .yahoo import (
YahooIE,
- YahooNewsIE,
YahooSearchIE,
)
from .ynet import YnetIE
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 66a8f16d9..dbcf5d6a7 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -82,11 +82,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
]
def _real_extract(self, url):
- # Extract id and simplified title from URL
- mobj = re.match(self._VALID_URL, url)
-
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
url = 'http://www.dailymotion.com/video/%s' % video_id
# Retrieve video webpage to extract further information
@@ -147,18 +143,23 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
self._list_available_subtitles(video_id, webpage)
return
- view_count = self._search_regex(
- r'video_views_count[^>]+>\s+([\d\.,]+)', webpage, 'view count', fatal=False)
- if view_count is not None:
- view_count = str_to_int(view_count)
+ view_count = str_to_int(self._search_regex(
+ r'video_views_count[^>]+>\s+([\d\.,]+)',
+ webpage, 'view count', fatal=False))
+
+ title = self._og_search_title(webpage, default=None)
+ if title is None:
+ title = self._html_search_regex(
+ r'(?s)<span\s+id="video_title"[^>]*>(.*?)</span>', webpage,
+ 'title')
return {
- 'id': video_id,
+ 'id': video_id,
'formats': formats,
'uploader': info['owner.screenname'],
- 'upload_date': video_upload_date,
- 'title': self._og_search_title(webpage),
- 'subtitles': video_subtitles,
+ 'upload_date': video_upload_date,
+ 'title': title,
+ 'subtitles': video_subtitles,
'thumbnail': info['thumbnail_url'],
'age_limit': age_limit,
'view_count': view_count,
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index c16da70f1..dfc2ef4e7 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -847,47 +847,51 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'), 'MLB')
+ def check_video(vurl):
+ vpath = compat_urlparse.urlparse(vurl).path
+ vext = determine_ext(vpath)
+ return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
+
+ def filter_video(urls):
+ return list(filter(check_video, urls))
+
# Start with something easy: JW Player in SWFObject
- found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
+ found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
if not found:
# Look for gorilla-vid style embedding
- found = re.findall(r'''(?sx)
+ found = filter_video(re.findall(r'''(?sx)
(?:
jw_plugins|
JWPlayerOptions|
jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
)
- .*?file\s*:\s*["\'](.*?)["\']''', webpage)
+ .*?file\s*:\s*["\'](.*?)["\']''', webpage))
if not found:
# Broaden the search a little bit
- found = re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
+ found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
if not found:
# Broaden the findall a little bit: JWPlayer JS loader
- found = re.findall(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)
+ found = filter_video(re.findall(
+ r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
if not found:
# Flow player
- found = re.findall(r'''(?xs)
+ found = filter_video(re.findall(r'''(?xs)
flowplayer\("[^"]+",\s*
\{[^}]+?\}\s*,
\s*{[^}]+? ["']?clip["']?\s*:\s*\{\s*
["']?url["']?\s*:\s*["']([^"']+)["']
- ''', webpage)
+ ''', webpage))
if not found:
# Try to find twitter cards info
- found = re.findall(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
+ found = filter_video(re.findall(
+ r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
if not found:
# We look for Open Graph info:
# We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
# We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
if m_video_type is not None:
- def check_video(vurl):
- vpath = compat_urlparse.urlparse(vurl).path
- vext = determine_ext(vpath)
- return '.' in vpath and vext not in ('swf', 'png', 'jpg')
- found = list(filter(
- check_video,
- re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)))
+ found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
if not found:
# HTML5 video
found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]+)? src="([^"]+)"', webpage)
diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py
new file mode 100644
index 000000000..77c3ad4fc
--- /dev/null
+++ b/youtube_dl/extractor/globo.py
@@ -0,0 +1,398 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import random
+import math
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ float_or_none,
+ compat_str,
+ compat_chr,
+ compat_ord,
+)
+
+
+class GloboIE(InfoExtractor):
+ _VALID_URL = 'https?://.+?\.globo\.com/(?P<id>.+)'
+
+ _API_URL_TEMPLATE = 'http://api.globovideos.com/videos/%s/playlist'
+ _SECURITY_URL_TEMPLATE = 'http://security.video.globo.com/videos/%s/hash?player=flash&version=2.9.9.50&resource_id=%s'
+
+ _VIDEOID_REGEXES = [
+ r'\bdata-video-id="(\d+)"',
+ r'\bdata-player-videosids="(\d+)"',
+ r'<div[^>]+\bid="(\d+)"',
+ ]
+
+ _RESIGN_EXPIRATION = 86400
+
+ _TESTS = [
+ {
+ 'url': 'http://globotv.globo.com/sportv/futebol-nacional/v/os-gols-de-atletico-mg-3-x-2-santos-pela-24a-rodada-do-brasileirao/3654973/',
+ 'md5': '03ebf41cb7ade43581608b7d9b71fab0',
+ 'info_dict': {
+ 'id': '3654973',
+ 'ext': 'mp4',
+ 'title': 'Os gols de Atlético-MG 3 x 2 Santos pela 24ª rodada do Brasileirão',
+ 'duration': 251.585,
+ 'uploader': 'SporTV',
+ 'uploader_id': 698,
+ 'like_count': int,
+ }
+ },
+ {
+ 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/',
+ 'md5': 'b3ccc801f75cd04a914d51dadb83a78d',
+ 'info_dict': {
+ 'id': '3607726',
+ 'ext': 'mp4',
+ 'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa',
+ 'duration': 103.204,
+ 'uploader': 'Globo.com',
+ 'uploader_id': 265,
+ 'like_count': int,
+ }
+ },
+ {
+ 'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html',
+ 'md5': '307fdeae4390ccfe6ba1aa198cf6e72b',
+ 'info_dict': {
+ 'id': '3652183',
+ 'ext': 'mp4',
+ 'title': 'Receita Federal explica como vai fiscalizar bagagens de quem retorna ao Brasil de avião',
+ 'duration': 110.711,
+ 'uploader': 'Rede Globo',
+ 'uploader_id': 196,
+ 'like_count': int,
+ }
+ },
+ ]
+
+ class MD5():
+ HEX_FORMAT_LOWERCASE = 0
+ HEX_FORMAT_UPPERCASE = 1
+ BASE64_PAD_CHARACTER_DEFAULT_COMPLIANCE = ''
+ BASE64_PAD_CHARACTER_RFC_COMPLIANCE = '='
+ PADDING = '=0xFF01DD'
+ hexcase = 0
+ b64pad = ''
+
+ def __init__(self):
+ pass
+
+ class JSArray(list):
+ def __getitem__(self, y):
+ try:
+ return list.__getitem__(self, y)
+ except IndexError:
+ return 0
+
+ def __setitem__(self, i, y):
+ try:
+ return list.__setitem__(self, i, y)
+ except IndexError:
+ self.extend([0] * (i - len(self) + 1))
+ self[-1] = y
+
+ @classmethod
+ def hex_md5(cls, param1):
+ return cls.rstr2hex(cls.rstr_md5(cls.str2rstr_utf8(param1)))
+
+ @classmethod
+ def b64_md5(cls, param1, param2=None):
+ return cls.rstr2b64(cls.rstr_md5(cls.str2rstr_utf8(param1, param2)))
+
+ @classmethod
+ def any_md5(cls, param1, param2):
+ return cls.rstr2any(cls.rstr_md5(cls.str2rstr_utf8(param1)), param2)
+
+ @classmethod
+ def rstr_md5(cls, param1):
+ return cls.binl2rstr(cls.binl_md5(cls.rstr2binl(param1), len(param1) * 8))
+
+ @classmethod
+ def rstr2hex(cls, param1):
+ _loc_2 = '0123456789ABCDEF' if cls.hexcase else '0123456789abcdef'
+ _loc_3 = ''
+ for _loc_5 in range(0, len(param1)):
+ _loc_4 = compat_ord(param1[_loc_5])
+ _loc_3 += _loc_2[_loc_4 >> 4 & 15] + _loc_2[_loc_4 & 15]
+ return _loc_3
+
+ @classmethod
+ def rstr2b64(cls, param1):
+ _loc_2 = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'
+ _loc_3 = ''
+ _loc_4 = len(param1)
+ for _loc_5 in range(0, _loc_4, 3):
+ _loc_6_1 = compat_ord(param1[_loc_5]) << 16
+ _loc_6_2 = compat_ord(param1[_loc_5 + 1]) << 8 if _loc_5 + 1 < _loc_4 else 0
+ _loc_6_3 = compat_ord(param1[_loc_5 + 2]) if _loc_5 + 2 < _loc_4 else 0
+ _loc_6 = _loc_6_1 | _loc_6_2 | _loc_6_3
+ for _loc_7 in range(0, 4):
+ if _loc_5 * 8 + _loc_7 * 6 > len(param1) * 8:
+ _loc_3 += cls.b64pad
+ else:
+ _loc_3 += _loc_2[_loc_6 >> 6 * (3 - _loc_7) & 63]
+ return _loc_3
+
+ @staticmethod
+ def rstr2any(param1, param2):
+ _loc_3 = len(param2)
+ _loc_4 = []
+ _loc_9 = [0] * ((len(param1) >> 2) + 1)
+ for _loc_5 in range(0, len(_loc_9)):
+ _loc_9[_loc_5] = compat_ord(param1[_loc_5 * 2]) << 8 | compat_ord(param1[_loc_5 * 2 + 1])
+
+ while len(_loc_9) > 0:
+ _loc_8 = []
+ _loc_7 = 0
+ for _loc_5 in range(0, len(_loc_9)):
+ _loc_7 = (_loc_7 << 16) + _loc_9[_loc_5]
+ _loc_6 = math.floor(_loc_7 / _loc_3)
+ _loc_7 -= _loc_6 * _loc_3
+ if len(_loc_8) > 0 or _loc_6 > 0:
+ _loc_8[len(_loc_8)] = _loc_6
+
+ _loc_4[len(_loc_4)] = _loc_7
+ _loc_9 = _loc_8
+
+ _loc_10 = ''
+ _loc_5 = len(_loc_4) - 1
+ while _loc_5 >= 0:
+ _loc_10 += param2[_loc_4[_loc_5]]
+ _loc_5 -= 1
+
+ return _loc_10
+
+ @classmethod
+ def str2rstr_utf8(cls, param1, param2=None):
+ _loc_3 = ''
+ _loc_4 = -1
+ if not param2:
+ param2 = cls.PADDING
+ param1 = param1 + param2[1:9]
+ while True:
+ _loc_4 += 1
+ if _loc_4 >= len(param1):
+ break
+ _loc_5 = compat_ord(param1[_loc_4])
+ _loc_6 = compat_ord(param1[_loc_4 + 1]) if _loc_4 + 1 < len(param1) else 0
+ if 55296 <= _loc_5 <= 56319 and 56320 <= _loc_6 <= 57343:
+ _loc_5 = 65536 + ((_loc_5 & 1023) << 10) + (_loc_6 & 1023)
+ _loc_4 += 1
+ if _loc_5 <= 127:
+ _loc_3 += compat_chr(_loc_5)
+ continue
+ if _loc_5 <= 2047:
+ _loc_3 += compat_chr(192 | _loc_5 >> 6 & 31) + compat_chr(128 | _loc_5 & 63)
+ continue
+ if _loc_5 <= 65535:
+ _loc_3 += compat_chr(224 | _loc_5 >> 12 & 15) + compat_chr(128 | _loc_5 >> 6 & 63) + compat_chr(
+ 128 | _loc_5 & 63)
+ continue
+ if _loc_5 <= 2097151:
+ _loc_3 += compat_chr(240 | _loc_5 >> 18 & 7) + compat_chr(128 | _loc_5 >> 12 & 63) + compat_chr(
+ 128 | _loc_5 >> 6 & 63) + compat_chr(128 | _loc_5 & 63)
+ return _loc_3
+
+ @staticmethod
+ def rstr2binl(param1):
+ _loc_2 = [0] * ((len(param1) >> 2) + 1)
+ for _loc_3 in range(0, len(_loc_2)):
+ _loc_2[_loc_3] = 0
+ for _loc_3 in range(0, len(param1) * 8, 8):
+ _loc_2[_loc_3 >> 5] |= (compat_ord(param1[_loc_3 // 8]) & 255) << _loc_3 % 32
+ return _loc_2
+
+ @staticmethod
+ def binl2rstr(param1):
+ _loc_2 = ''
+ for _loc_3 in range(0, len(param1) * 32, 8):
+ _loc_2 += compat_chr(param1[_loc_3 >> 5] >> _loc_3 % 32 & 255)
+ return _loc_2
+
+ @classmethod
+ def binl_md5(cls, param1, param2):
+ param1 = cls.JSArray(param1)
+ param1[param2 >> 5] |= 128 << param2 % 32
+ param1[(param2 + 64 >> 9 << 4) + 14] = param2
+ _loc_3 = 1732584193
+ _loc_4 = -271733879
+ _loc_5 = -1732584194
+ _loc_6 = 271733878
+ for _loc_7 in range(0, len(param1), 16):
+ _loc_8 = _loc_3
+ _loc_9 = _loc_4
+ _loc_10 = _loc_5
+ _loc_11 = _loc_6
+ _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 0], 7, -680876936)
+ _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 1], 12, -389564586)
+ _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 2], 17, 606105819)
+ _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 3], 22, -1044525330)
+ _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 4], 7, -176418897)
+ _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 5], 12, 1200080426)
+ _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 6], 17, -1473231341)
+ _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 7], 22, -45705983)
+ _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 8], 7, 1770035416)
+ _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 9], 12, -1958414417)
+ _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 10], 17, -42063)
+ _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 11], 22, -1990404162)
+ _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 12], 7, 1804603682)
+ _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 13], 12, -40341101)
+ _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 14], 17, -1502002290)
+ _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 15], 22, 1236535329)
+ _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 1], 5, -165796510)
+ _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 6], 9, -1069501632)
+ _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 11], 14, 643717713)
+ _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 0], 20, -373897302)
+ _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 5], 5, -701558691)
+ _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 10], 9, 38016083)
+ _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 15], 14, -660478335)
+ _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 4], 20, -405537848)
+ _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 9], 5, 568446438)
+ _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 14], 9, -1019803690)
+ _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 3], 14, -187363961)
+ _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 8], 20, 1163531501)
+ _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 13], 5, -1444681467)
+ _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 2], 9, -51403784)
+ _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 7], 14, 1735328473)
+ _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 12], 20, -1926607734)
+ _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 5], 4, -378558)
+ _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 8], 11, -2022574463)
+ _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 11], 16, 1839030562)
+ _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 14], 23, -35309556)
+ _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 1], 4, -1530992060)
+ _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 4], 11, 1272893353)
+ _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 7], 16, -155497632)
+ _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 10], 23, -1094730640)
+ _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 13], 4, 681279174)
+ _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 0], 11, -358537222)
+ _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 3], 16, -722521979)
+ _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 6], 23, 76029189)
+ _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 9], 4, -640364487)
+ _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 12], 11, -421815835)
+ _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 15], 16, 530742520)
+ _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 2], 23, -995338651)
+ _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 0], 6, -198630844)
+ _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 7], 10, 1126891415)
+ _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 14], 15, -1416354905)
+ _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 5], 21, -57434055)
+ _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 12], 6, 1700485571)
+ _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 3], 10, -1894986606)
+ _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 10], 15, -1051523)
+ _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 1], 21, -2054922799)
+ _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 8], 6, 1873313359)
+ _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 15], 10, -30611744)
+ _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 6], 15, -1560198380)
+ _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 13], 21, 1309151649)
+ _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 4], 6, -145523070)
+ _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 11], 10, -1120210379)
+ _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 2], 15, 718787259)
+ _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 9], 21, -343485551)
+ _loc_3 = cls.safe_add(_loc_3, _loc_8)
+ _loc_4 = cls.safe_add(_loc_4, _loc_9)
+ _loc_5 = cls.safe_add(_loc_5, _loc_10)
+ _loc_6 = cls.safe_add(_loc_6, _loc_11)
+ return [_loc_3, _loc_4, _loc_5, _loc_6]
+
+ @classmethod
+ def md5_cmn(cls, param1, param2, param3, param4, param5, param6):
+ return cls.safe_add(
+ cls.bit_rol(cls.safe_add(cls.safe_add(param2, param1), cls.safe_add(param4, param6)), param5), param3)
+
+ @classmethod
+ def md5_ff(cls, param1, param2, param3, param4, param5, param6, param7):
+ return cls.md5_cmn(param2 & param3 | ~param2 & param4, param1, param2, param5, param6, param7)
+
+ @classmethod
+ def md5_gg(cls, param1, param2, param3, param4, param5, param6, param7):
+ return cls.md5_cmn(param2 & param4 | param3 & ~param4, param1, param2, param5, param6, param7)
+
+ @classmethod
+ def md5_hh(cls, param1, param2, param3, param4, param5, param6, param7):
+ return cls.md5_cmn(param2 ^ param3 ^ param4, param1, param2, param5, param6, param7)
+
+ @classmethod
+ def md5_ii(cls, param1, param2, param3, param4, param5, param6, param7):
+ return cls.md5_cmn(param3 ^ (param2 | ~param4), param1, param2, param5, param6, param7)
+
+ @classmethod
+ def safe_add(cls, param1, param2):
+ _loc_3 = (param1 & 65535) + (param2 & 65535)
+ _loc_4 = (param1 >> 16) + (param2 >> 16) + (_loc_3 >> 16)
+ return cls.lshift(_loc_4, 16) | _loc_3 & 65535
+
+ @classmethod
+ def bit_rol(cls, param1, param2):
+ return cls.lshift(param1, param2) | (param1 & 0xFFFFFFFF) >> (32 - param2)
+
+ @staticmethod
+ def lshift(value, count):
+ r = (0xFFFFFFFF & value) << count
+ return -(~(r - 1) & 0xFFFFFFFF) if r > 0x7FFFFFFF else r
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+ video_id = self._search_regex(self._VIDEOID_REGEXES, webpage, 'video id')
+
+ video = self._download_json(
+ self._API_URL_TEMPLATE % video_id, video_id)['videos'][0]
+
+ title = video['title']
+ duration = float_or_none(video['duration'], 1000)
+ like_count = video['likes']
+ uploader = video['channel']
+ uploader_id = video['channel_id']
+
+ formats = []
+
+ for resource in video['resources']:
+ resource_id = resource.get('_id')
+ if not resource_id:
+ continue
+
+ security = self._download_json(
+ self._SECURITY_URL_TEMPLATE % (video_id, resource_id),
+ video_id, 'Downloading security hash for %s' % resource_id)
+
+ security_hash = security.get('hash')
+ if not security_hash:
+ message = security.get('message')
+ if message:
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, message), expected=True)
+ continue
+
+ hash_code = security_hash[:2]
+ received_time = int(security_hash[2:12])
+ received_random = security_hash[12:22]
+ received_md5 = security_hash[22:]
+
+ sign_time = received_time + self._RESIGN_EXPIRATION
+ padding = '%010d' % random.randint(1, 10000000000)
+
+ signed_md5 = self.MD5.b64_md5(received_md5 + compat_str(sign_time) + padding)
+ signed_hash = hash_code + compat_str(received_time) + received_random + compat_str(sign_time) + padding + signed_md5
+
+ formats.append({
+ 'url': '%s?h=%s&k=%s' % (resource['url'], signed_hash, 'flash'),
+ 'format_id': resource_id,
+ 'height': resource['height']
+ })
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'duration': duration,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'like_count': like_count,
+ 'formats': formats
+ }
\ No newline at end of file
diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py
index 07d994b44..fcefe54cd 100644
--- a/youtube_dl/extractor/googleplus.py
+++ b/youtube_dl/extractor/googleplus.py
@@ -1,13 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
-import datetime
import re
+import codecs
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
-)
+from ..utils import unified_strdate
class GooglePlusIE(InfoExtractor):
@@ -19,74 +17,57 @@ class GooglePlusIE(InfoExtractor):
'info_dict': {
'id': 'ZButuJc6CtH',
'ext': 'flv',
+ 'title': '嘆きの天使 降臨',
'upload_date': '20120613',
'uploader': '井上ヨシマサ',
- 'title': '嘆きの天使 降臨',
}
}
def _real_extract(self, url):
- # Extract id from URL
- mobj = re.match(self._VALID_URL, url)
-
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
# Step 1, Retrieve post webpage to extract further information
webpage = self._download_webpage(url, video_id, 'Downloading entry webpage')
- self.report_extraction(video_id)
-
- # Extract update date
- upload_date = self._html_search_regex(
+ title = self._og_search_description(webpage).splitlines()[0]
+ upload_date = unified_strdate(self._html_search_regex(
r'''(?x)<a.+?class="o-U-s\s[^"]+"\s+style="display:\s*none"\s*>
([0-9]{4}-[0-9]{2}-[0-9]{2})</a>''',
- webpage, 'upload date', fatal=False, flags=re.VERBOSE)
- if upload_date:
- # Convert timestring to a format suitable for filename
- upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
- upload_date = upload_date.strftime('%Y%m%d')
-
- # Extract uploader
- uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
- webpage, 'uploader', fatal=False)
-
- # Extract title
- # Get the first line for title
- video_title = self._og_search_description(webpage).splitlines()[0]
+ webpage, 'upload date', fatal=False, flags=re.VERBOSE))
+ uploader = self._html_search_regex(
+ r'rel="author".*?>(.*?)</a>', webpage, 'uploader', fatal=False)
# Step 2, Simulate clicking the image box to launch video
DOMAIN = 'https://plus.google.com/'
- video_page = self._search_regex(r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN),
+ video_page = self._search_regex(
+ r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN),
webpage, 'video page URL')
if not video_page.startswith(DOMAIN):
video_page = DOMAIN + video_page
webpage = self._download_webpage(video_page, video_id, 'Downloading video page')
- # Extract video links all sizes
- pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
- mobj = re.findall(pattern, webpage)
- if len(mobj) == 0:
- raise ExtractorError('Unable to extract video links')
-
- # Sort in resolution
- links = sorted(mobj)
+ def unicode_escape(s):
+ decoder = codecs.getdecoder('unicode_escape')
+ return re.sub(
+ r'\\u[0-9a-fA-F]{4,}',
+ lambda m: decoder(m.group(0))[0],
+ s)
- # Choose the lowest of the sort, i.e. highest resolution
- video_url = links[-1]
- # Only get the url. The resolution part in the tuple has no use anymore
- video_url = video_url[-1]
- # Treat escaped \u0026 style hex
- try:
- video_url = video_url.decode("unicode_escape")
- except AttributeError: # Python 3
- video_url = bytes(video_url, 'ascii').decode('unicode-escape')
+ # Extract video links all sizes
+ formats = [{
+ 'url': unicode_escape(video_url),
+ 'ext': 'flv',
+ 'width': int(width),
+ 'height': int(height),
+ } for width, height, video_url in re.findall(
+ r'\d+,(\d+),(\d+),"(https?://redirector\.googlevideo\.com.*?)"', webpage)]
+ self._sort_formats(formats)
return {
'id': video_id,
- 'url': video_url,
+ 'title': title,
'uploader': uploader,
'upload_date': upload_date,
- 'title': video_title,
- 'ext': 'flv',
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/gorillavid.py b/youtube_dl/extractor/gorillavid.py
index ca5f7c417..45cca1d24 100644
--- a/youtube_dl/extractor/gorillavid.py
+++ b/youtube_dl/extractor/gorillavid.py
@@ -5,6 +5,7 @@ import re
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
determine_ext,
compat_urllib_parse,
compat_urllib_request,
@@ -12,20 +13,22 @@ from ..utils import (
class GorillaVidIE(InfoExtractor):
- IE_DESC = 'GorillaVid.in and daclips.in'
+ IE_DESC = 'GorillaVid.in, daclips.in and movpod.in'
_VALID_URL = r'''(?x)
https?://(?P<host>(?:www\.)?
- (?:daclips\.in|gorillavid\.in))/
+ (?:daclips\.in|gorillavid\.in|movpod\.in))/
(?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)?
'''
+ _FILE_NOT_FOUND_REGEX = r'>(?:404 - )?File Not Found<'
+
_TESTS = [{
'url': 'http://gorillavid.in/06y9juieqpmi',
'md5': '5ae4a3580620380619678ee4875893ba',
'info_dict': {
'id': '06y9juieqpmi',
'ext': 'flv',
- 'title': 'Rebecca Black My Moment Official Music Video Reaction',
+ 'title': 'Rebecca Black My Moment Official Music Video Reaction-6GK87Rc8bzQ',
'thumbnail': 're:http://.*\.jpg',
},
}, {
@@ -46,6 +49,9 @@ class GorillaVidIE(InfoExtractor):
'title': 'Micro Pig piglets ready on 16th July 2009',
'thumbnail': 're:http://.*\.jpg',
},
+ }, {
+ 'url': 'http://movpod.in/0wguyyxi1yca',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -54,6 +60,9 @@ class GorillaVidIE(InfoExtractor):
webpage = self._download_webpage('http://%s/%s' % (mobj.group('host'), video_id), video_id)
+ if re.search(self._FILE_NOT_FOUND_REGEX, webpage) is not None:
+ raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+
fields = dict(re.findall(r'''(?x)<input\s+
type="hidden"\s+
name="([^"]+)"\s+
@@ -69,14 +78,14 @@ class GorillaVidIE(InfoExtractor):
webpage = self._download_webpage(req, video_id, 'Downloading video page')
- title = self._search_regex(r'style="z-index: [0-9]+;">([0-9a-zA-Z ]+)(?:-.+)?</span>', webpage, 'title')
- thumbnail = self._search_regex(r'image:\'(http[^\']+)\',', webpage, 'thumbnail')
- url = self._search_regex(r'file: \'(http[^\']+)\',', webpage, 'file url')
+ title = self._search_regex(r'style="z-index: [0-9]+;">([^<]+)</span>', webpage, 'title')
+ video_url = self._search_regex(r'file\s*:\s*\'(http[^\']+)\',', webpage, 'file url')
+ thumbnail = self._search_regex(r'image\s*:\s*\'(http[^\']+)\',', webpage, 'thumbnail', fatal=False)
formats = [{
'format_id': 'sd',
- 'url': url,
- 'ext': determine_ext(url),
+ 'url': video_url,
+ 'ext': determine_ext(video_url),
'quality': 1,
}]
diff --git a/youtube_dl/extractor/howstuffworks.py b/youtube_dl/extractor/howstuffworks.py
index 68684b997..fccc23884 100644
--- a/youtube_dl/extractor/howstuffworks.py
+++ b/youtube_dl/extractor/howstuffworks.py
@@ -28,13 +28,13 @@ class HowStuffWorksIE(InfoExtractor):
}
},
{
- 'url': 'http://adventure.howstuffworks.com/39516-deadliest-catch-jakes-farewell-pots-video.htm',
+ 'url': 'http://adventure.howstuffworks.com/7199-survival-zone-food-and-water-in-the-savanna-video.htm',
'info_dict': {
- 'id': '553470',
- 'display_id': 'deadliest-catch-jakes-farewell-pots',
+ 'id': '453464',
+ 'display_id': 'survival-zone-food-and-water-in-the-savanna',
'ext': 'mp4',
- 'title': 'Deadliest Catch: Jake\'s Farewell Pots',
- 'description': 'md5:9632c346d5e43ee238028c9cefd8dbbc',
+ 'title': 'Survival Zone: Food and Water In the Savanna',
+ 'description': 'md5:7e1c89f6411434970c15fa094170c371',
'thumbnail': 're:^https?://.*\.jpg$',
},
'params': {
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index 520f27fca..a4564d3de 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -70,7 +70,7 @@ class MixcloudIE(InfoExtractor):
raise ExtractorError('Unable to extract track url')
PREFIX = (
- r'<div class="cloudcast-play-button-container"'
+ r'<div class="cloudcast-play-button-container[^"]*?"'
r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+')
title = self._html_search_regex(
PREFIX + r'm-title="([^"]+)"', webpage, 'title')
diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py
index bfdb462eb..42aa2e227 100644
--- a/youtube_dl/extractor/mlb.py
+++ b/youtube_dl/extractor/mlb.py
@@ -6,7 +6,6 @@ from .common import InfoExtractor
from ..utils import (
parse_duration,
parse_iso8601,
- find_xpath_attr,
)
@@ -88,8 +87,9 @@ class MLBIE(InfoExtractor):
duration = parse_duration(detail.find('./duration').text)
timestamp = parse_iso8601(detail.attrib['date'][:-5])
- thumbnail = find_xpath_attr(
- detail, './thumbnailScenarios/thumbnailScenario', 'type', '45').text
+ thumbnails = [{
+ 'url': thumbnail.text,
+ } for thumbnail in detail.findall('./thumbnailScenarios/thumbnailScenario')]
formats = []
for media_url in detail.findall('./url'):
@@ -116,5 +116,5 @@ class MLBIE(InfoExtractor):
'duration': duration,
'timestamp': timestamp,
'formats': formats,
- 'thumbnail': thumbnail,
+ 'thumbnails': thumbnails,
}
diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py
index 94d5ba982..add4b3e5d 100644
--- a/youtube_dl/extractor/ndr.py
+++ b/youtube_dl/extractor/ndr.py
@@ -18,16 +18,16 @@ class NDRIE(InfoExtractor):
_TESTS = [
{
- 'url': 'http://www.ndr.de/fernsehen/media/dienordreportage325.html',
- 'md5': '4a4eeafd17c3058b65f0c8f091355855',
+ 'url': 'http://www.ndr.de/fernsehen/sendungen/nordmagazin/Kartoffeltage-in-der-Lewitz,nordmagazin25866.html',
+ 'md5': '5bc5f5b92c82c0f8b26cddca34f8bb2c',
'note': 'Video file',
'info_dict': {
- 'id': '325',
+ 'id': '25866',
'ext': 'mp4',
- 'title': 'Blaue Bohnen aus Blocken',
- 'description': 'md5:190d71ba2ccddc805ed01547718963bc',
- 'duration': 1715,
- },
+ 'title': 'Kartoffeltage in der Lewitz',
+ 'description': 'md5:48c4c04dde604c8a9971b3d4e3b9eaa8',
+ 'duration': 166,
+ }
},
{
'url': 'http://www.ndr.de/info/audio51535.html',
diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py
index c0c139b5d..7b85589b7 100644
--- a/youtube_dl/extractor/niconico.py
+++ b/youtube_dl/extractor/niconico.py
@@ -39,18 +39,17 @@ class NiconicoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/((?:[a-z]{2})?[0-9]+)'
_NETRC_MACHINE = 'niconico'
- # Determine whether the downloader uses authentication to download video
- _AUTHENTICATE = False
+ # Determine whether the downloader used authentication to download video
+ _AUTHENTICATED = False
def _real_initialize(self):
- if self._downloader.params.get('username', None) is not None:
- self._AUTHENTICATE = True
-
- if self._AUTHENTICATE:
- self._login()
+ self._login()
def _login(self):
(username, password) = self._get_login_info()
+ # No authentication to be performed
+ if not username:
+ return True
# Log in
login_form_strs = {
@@ -68,6 +67,8 @@ class NiconicoIE(InfoExtractor):
if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None:
self._downloader.report_warning('unable to log in: bad username or password')
return False
+ # Successful login
+ self._AUTHENTICATED = True
return True
def _real_extract(self, url):
@@ -82,7 +83,7 @@ class NiconicoIE(InfoExtractor):
'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,
note='Downloading video info page')
- if self._AUTHENTICATE:
+ if self._AUTHENTICATED:
# Get flv info
flv_info_webpage = self._download_webpage(
'http://flapi.nicovideo.jp/api/getflv?v=' + video_id,
diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py
index 48ce6e730..bac484c67 100644
--- a/youtube_dl/extractor/pornhd.py
+++ b/youtube_dl/extractor/pornhd.py
@@ -4,19 +4,27 @@ import re
import json
from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+ int_or_none,
+ js_to_json,
+ qualities,
+ determine_ext,
+)
class PornHdIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)'
+ _VALID_URL = r'http://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?'
_TEST = {
'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
'md5': '956b8ca569f7f4d8ec563e2c41598441',
'info_dict': {
'id': '1962',
+ 'display_id': 'sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
'ext': 'mp4',
'title': 'Sierra loves doing laundry',
'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'view_count': int,
'age_limit': 18,
}
}
@@ -24,8 +32,9 @@ class PornHdIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
- webpage = self._download_webpage(url, video_id)
+ webpage = self._download_webpage(url, display_id or video_id)
title = self._html_search_regex(
r'<title>(.+) porn HD.+?</title>', webpage, 'title')
@@ -33,38 +42,21 @@ class PornHdIE(InfoExtractor):
r'<div class="description">([^<]+)</div>', webpage, 'description', fatal=False)
view_count = int_or_none(self._html_search_regex(
r'(\d+) views\s*</span>', webpage, 'view count', fatal=False))
+ thumbnail = self._search_regex(
+ r"'poster'\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False)
- videos = re.findall(
- r'var __video([\da-zA-Z]+?)(Low|High)StreamUrl = \'(http://.+?)\?noProxy=1\'', webpage)
-
- mobj = re.search(r'flashVars = (?P<flashvars>{.+?});', webpage)
- if mobj:
- flashvars = json.loads(mobj.group('flashvars'))
- for key, quality in [('hashlink', 'low'), ('hd', 'high')]:
- redirect_url = flashvars.get(key)
- if redirect_url:
- videos.append(('flv', quality, redirect_url))
- thumbnail = flashvars['urlWallpaper']
- else:
- thumbnail = self._og_search_thumbnail(webpage)
-
- formats = []
- for format_, quality, redirect_url in videos:
- format_id = '%s-%s' % (format_.lower(), quality.lower())
- video_url = self._download_webpage(
- redirect_url, video_id, 'Downloading %s video link' % format_id, fatal=False)
- if not video_url:
- continue
- formats.append({
- 'url': video_url,
- 'ext': format_.lower(),
- 'format_id': format_id,
- 'quality': 1 if quality.lower() == 'high' else 0,
- })
+ quality = qualities(['SD', 'HD'])
+ formats = [{
+ 'url': source['file'],
+ 'format_id': '%s-%s' % (source['label'], determine_ext(source['file'])),
+ 'quality': quality(source['label']),
+ } for source in json.loads(js_to_json(self._search_regex(
+ r"(?s)'sources'\s*:\s*(\[.+?\])", webpage, 'sources')))]
self._sort_formats(formats)
return {
'id': video_id,
+ 'display_id': display_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py
new file mode 100644
index 000000000..19cc976e3
--- /dev/null
+++ b/youtube_dl/extractor/sportbox.py
@@ -0,0 +1,81 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ parse_iso8601,
+ int_or_none,
+)
+
+
+class SportBoxIE(InfoExtractor):
+ _VALID_URL = r'https?://news\.sportbox\.ru/Vidy_sporta/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)'
+ _TESTS = [
+ {
+ 'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S',
+ 'md5': 'ff56a598c2cf411a9a38a69709e97079',
+ 'info_dict': {
+ 'id': '80822',
+ 'ext': 'mp4',
+ 'title': 'Гонка 2 заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн',
+ 'description': 'md5:81715fa9c4ea3d9e7915dc8180c778ed',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1411896237,
+ 'upload_date': '20140928',
+ 'duration': 4846,
+ 'view_count': int,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4',
+ 'only_matching': True,
+ }
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('display_id')
+
+ webpage = self._download_webpage(url, display_id)
+
+ video_id = self._search_regex(
+ r'src="/vdl/player/media/(\d+)"', webpage, 'video id')
+
+ player = self._download_webpage(
+ 'http://news.sportbox.ru/vdl/player/media/%s' % video_id,
+ display_id, 'Downloading player webpage')
+
+ hls = self._search_regex(
+ r"var\s+original_hls_file\s*=\s*'([^']+)'", player, 'hls file')
+
+ formats = self._extract_m3u8_formats(hls, display_id, 'mp4')
+
+ title = self._html_search_regex(
+ r'<h1 itemprop="name">([^<]+)</h1>', webpage, 'title')
+ description = self._html_search_regex(
+ r'(?s)<div itemprop="description">(.+?)</div>', webpage, 'description', fatal=False)
+ thumbnail = self._og_search_thumbnail(webpage)
+ timestamp = parse_iso8601(self._search_regex(
+ r'<span itemprop="uploadDate">([^<]+)</span>', webpage, 'timestamp', fatal=False))
+ duration = parse_duration(self._html_search_regex(
+ r'<meta itemprop="duration" content="PT([^"]+)">', webpage, 'duration', fatal=False))
+ view_count = int_or_none(self._html_search_regex(
+ r'<span>Просмотров: (\d+)</span>', player, 'view count', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/theonion.py b/youtube_dl/extractor/theonion.py
new file mode 100644
index 000000000..b65d8e03f
--- /dev/null
+++ b/youtube_dl/extractor/theonion.py
@@ -0,0 +1,70 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class TheOnionIE(InfoExtractor):
+ _VALID_URL = r'(?x)https?://(?:www\.)?theonion\.com/video/[^,]+,(?P<article_id>[0-9]+)/?'
+ _TEST = {
+ 'url': 'http://www.theonion.com/video/man-wearing-mm-jacket-gods-image,36918/',
+ 'md5': '19eaa9a39cf9b9804d982e654dc791ee',
+ 'info_dict': {
+ 'id': '2133',
+ 'ext': 'mp4',
+ 'title': 'Man Wearing M&M Jacket Apparently Made In God\'s Image',
+ 'description': 'md5:cc12448686b5600baae9261d3e180910',
+ 'thumbnail': 're:^https?://.*\.jpg\?\d+$',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ article_id = mobj.group('article_id')
+
+ webpage = self._download_webpage(url, article_id)
+
+ video_id = self._search_regex(
+ r'"videoId":\s(\d+),', webpage, 'video ID')
+ title = self._og_search_title(webpage)
+ description = self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ sources = re.findall(r'<source src="([^"]+)" type="([^"]+)"', webpage)
+ if not sources:
+ raise ExtractorError(
+ 'No sources found for video %s' % video_id, expected=True)
+
+ formats = []
+ for src, type_ in sources:
+ if type_ == 'video/mp4':
+ formats.append({
+ 'format_id': 'mp4_sd',
+ 'preference': 1,
+ 'url': src,
+ })
+ elif type_ == 'video/webm':
+ formats.append({
+ 'format_id': 'webm_sd',
+ 'preference': 0,
+ 'url': src,
+ })
+ elif type_ == 'application/x-mpegURL':
+ formats.extend(
+ self._extract_m3u8_formats(src, video_id, preference=-1))
+ else:
+ self.report_warning(
+ 'Encountered unexpected format: %s' % type_)
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ 'description': description,
+ }
diff --git a/youtube_dl/extractor/thesixtyone.py b/youtube_dl/extractor/thesixtyone.py
new file mode 100644
index 000000000..a77c6a2fc
--- /dev/null
+++ b/youtube_dl/extractor/thesixtyone.py
@@ -0,0 +1,100 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import unified_strdate
+
+
+class TheSixtyOneIE(InfoExtractor):
+ _VALID_URL = r'''(?x)https?://(?:www\.)?thesixtyone\.com/
+ (?:.*?/)*
+ (?:
+ s|
+ song/comments/list|
+ song
+ )/(?P<id>[A-Za-z0-9]+)/?$'''
+ _SONG_URL_TEMPLATE = 'http://thesixtyone.com/s/{0:}'
+ _SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}.thesixtyone.com/thesixtyone_production/audio/{0:}_stream'
+ _THUMBNAIL_URL_TEMPLATE = '{photo_base_url:}_desktop'
+ _TESTS = [
+ {
+ 'url': 'http://www.thesixtyone.com/s/SrE3zD7s1jt/',
+ 'md5': '821cc43b0530d3222e3e2b70bb4622ea',
+ 'info_dict': {
+ 'id': 'SrE3zD7s1jt',
+ 'ext': 'mp3',
+ 'title': 'CASIO - Unicorn War Mixtape',
+ 'thumbnail': 're:^https?://.*_desktop$',
+ 'upload_date': '20071217',
+ 'duration': 3208,
+ }
+ },
+ {
+ 'url': 'http://www.thesixtyone.com/song/comments/list/SrE3zD7s1jt',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'http://www.thesixtyone.com/s/ULoiyjuJWli#/s/SrE3zD7s1jt/',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'http://www.thesixtyone.com/#/s/SrE3zD7s1jt/',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'http://www.thesixtyone.com/song/SrE3zD7s1jt/',
+ 'only_matching': True,
+ },
+ ]
+
+ _DECODE_MAP = {
+ "x": "a",
+ "m": "b",
+ "w": "c",
+ "q": "d",
+ "n": "e",
+ "p": "f",
+ "a": "0",
+ "h": "1",
+ "e": "2",
+ "u": "3",
+ "s": "4",
+ "i": "5",
+ "o": "6",
+ "y": "7",
+ "r": "8",
+ "c": "9"
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ song_id = mobj.group('id')
+
+ webpage = self._download_webpage(
+ self._SONG_URL_TEMPLATE.format(song_id), song_id)
+
+ song_data = json.loads(self._search_regex(
+ r'"%s":\s(\{.*?\})' % song_id, webpage, 'song_data'))
+ keys = [self._DECODE_MAP.get(s, s) for s in song_data['key']]
+ url = self._SONG_FILE_URL_TEMPLATE.format(
+ "".join(reversed(keys)), **song_data)
+
+ formats = [{
+ 'format_id': 'sd',
+ 'url': url,
+ 'ext': 'mp3',
+ }]
+
+ return {
+ 'id': song_id,
+ 'title': '{artist:} - {name:}'.format(**song_data),
+ 'formats': formats,
+ 'comment_count': song_data.get('comments_count'),
+ 'duration': song_data.get('play_time'),
+ 'like_count': song_data.get('score'),
+ 'thumbnail': self._THUMBNAIL_URL_TEMPLATE.format(**song_data),
+ 'upload_date': unified_strdate(song_data.get('publish_date')),
+ }
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index d2c36b58a..e6a86f18e 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -56,7 +56,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
# _VALID_URL matches Vimeo URLs
_VALID_URL = r'''(?x)
- (?P<proto>(?:https?:)?//)?
+ https?://
(?:(?:www|(?P<player>player))\.)?
vimeo(?P<pro>pro)?\.com/
(?!channels/[^/?#]+/?(?:$|[?#])|album/)
diff --git a/youtube_dl/extractor/walla.py b/youtube_dl/extractor/walla.py
new file mode 100644
index 000000000..672bda7a7
--- /dev/null
+++ b/youtube_dl/extractor/walla.py
@@ -0,0 +1,89 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .subtitles import SubtitlesInfoExtractor
+from ..utils import (
+ xpath_text,
+ int_or_none,
+)
+
+
+class WallaIE(SubtitlesInfoExtractor):
+ _VALID_URL = r'http://vod\.walla\.co\.il/[^/]+/(?P<id>\d+)/(?P<display_id>.+)'
+ _TEST = {
+ 'url': 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one',
+ 'info_dict': {
+ 'id': '2642630',
+ 'display_id': 'one-direction-all-for-one',
+ 'ext': 'flv',
+ 'title': 'וואן דיירקשן: ההיסטריה',
+ 'description': 'md5:de9e2512a92442574cdb0913c49bc4d8',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 3600,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }
+
+ _SUBTITLE_LANGS = {
+ 'עברית': 'heb',
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
+
+ video = self._download_xml(
+ 'http://video2.walla.co.il/?w=null/null/%s/@@/video/flv_pl' % video_id,
+ display_id)
+
+ item = video.find('./items/item')
+
+ title = xpath_text(item, './title', 'title')
+ description = xpath_text(item, './synopsis', 'description')
+ thumbnail = xpath_text(item, './preview_pic', 'thumbnail')
+ duration = int_or_none(xpath_text(item, './duration', 'duration'))
+
+ subtitles = {}
+ for subtitle in item.findall('./subtitles/subtitle'):
+ lang = xpath_text(subtitle, './title')
+ subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = xpath_text(subtitle, './src')
+
+ if self._downloader.params.get('listsubtitles', False):
+ self._list_available_subtitles(video_id, subtitles)
+ return
+
+ subtitles = self.extract_subtitles(video_id, subtitles)
+
+ formats = []
+ for quality in item.findall('./qualities/quality'):
+ format_id = xpath_text(quality, './title')
+ fmt = {
+ 'url': 'rtmp://wafla.walla.co.il/vod',
+ 'play_path': xpath_text(quality, './src'),
+ 'player_url': 'http://isc.walla.co.il/w9/swf/video_swf/vod/WallaMediaPlayerAvod.swf',
+ 'page_url': url,
+ 'ext': 'flv',
+ 'format_id': xpath_text(quality, './title'),
+ }
+ m = re.search(r'^(?P<height>\d+)[Pp]', format_id)
+ if m:
+ fmt['height'] = int(m.group('height'))
+ formats.append(fmt)
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index 221341c13..117f0856a 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -1,3 +1,4 @@
+# coding: utf-8
from __future__ import unicode_literals
import itertools
@@ -6,6 +7,7 @@ import re
from .common import InfoExtractor, SearchInfoExtractor
from ..utils import (
+ ExtractorError,
compat_urllib_parse,
compat_urlparse,
clean_html,
@@ -15,7 +17,7 @@ from ..utils import (
class YahooIE(InfoExtractor):
IE_DESC = 'Yahoo screen and movies'
- _VALID_URL = r'(?P<url>https?://(?:screen|movies)\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)'
+ _VALID_URL = r'(?P<url>(?P<host>https?://(?:[a-zA-Z]{2}\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?P<display_id>.+?)-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)'
_TESTS = [
{
'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
@@ -25,6 +27,7 @@ class YahooIE(InfoExtractor):
'ext': 'mp4',
'title': 'Julian Smith & Travis Legg Watch Julian Smith',
'description': 'Julian and Travis watch Julian Smith',
+ 'duration': 6863,
},
},
{
@@ -34,7 +37,8 @@ class YahooIE(InfoExtractor):
'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9',
'ext': 'mp4',
'title': 'Codefellas - The Cougar Lies with Spanish Moss',
- 'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
+ 'description': 'md5:66b627ab0a282b26352136ca96ce73c1',
+ 'duration': 151,
},
},
{
@@ -45,15 +49,95 @@ class YahooIE(InfoExtractor):
'ext': 'mp4',
'title': "Yahoo Saves 'Community'",
'description': 'md5:4d4145af2fd3de00cbb6c1d664105053',
+ 'duration': 170,
}
},
+ {
+ 'url': 'https://tw.screen.yahoo.com/taipei-opinion-poll/選情站報-街頭民調-台北市篇-102823042.html',
+ 'md5': '92a7fdd8a08783c68a174d7aa067dde8',
+ 'info_dict': {
+ 'id': '7a23b569-7bea-36cb-85b9-bd5301a0a1fb',
+ 'ext': 'mp4',
+ 'title': '選情站報 街頭民調 台北市篇',
+ 'description': '選情站報 街頭民調 台北市篇',
+ 'duration': 429,
+ }
+ },
+ {
+ 'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html',
+ 'md5': '0b51660361f0e27c9789e7037ef76f4b',
+ 'info_dict': {
+ 'id': 'b3affa53-2e14-3590-852b-0e0db6cd1a58',
+ 'ext': 'mp4',
+ 'title': 'Cute Raccoon Freed From Drain\u00a0Using Angle Grinder',
+ 'description': 'md5:f66c890e1490f4910a9953c941dee944',
+ 'duration': 97,
+ }
+ },
+ {
+ 'url': 'https://ca.sports.yahoo.com/video/program-makes-hockey-more-affordable-013127711.html',
+ 'md5': '57e06440778b1828a6079d2f744212c4',
+ 'info_dict': {
+ 'id': 'c9fa2a36-0d4d-3937-b8f6-cc0fb1881e73',
+ 'ext': 'mp4',
+ 'title': 'Program that makes hockey more affordable not offered in Manitoba',
+ 'description': 'md5:c54a609f4c078d92b74ffb9bf1f496f4',
+ 'duration': 121,
+ }
+ }, {
+ 'url': 'https://ca.finance.yahoo.com/news/20-most-valuable-brands-world-112600775.html',
+ 'md5': '3e401e4eed6325aa29d9b96125fd5b4f',
+ 'info_dict': {
+ 'id': 'c1b4c09c-8ed8-3b65-8b05-169c55358a83',
+ 'ext': 'mp4',
+ 'title': "Apple Is The World's Most Valuable Brand",
+ 'description': 'md5:73eabc1a11c6f59752593b2ceefa1262',
+ 'duration': 21,
+ }
+ }, {
+ 'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
+ 'md5': '67010fdf3a08d290e060a4dd96baa07b',
+ 'info_dict': {
+ 'id': 'f885cf7f-43d4-3450-9fac-46ac30ece521',
+ 'ext': 'mp4',
+ 'title': 'China Moses Is Crazy About the Blues',
+ 'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0',
+ 'duration': 128,
+ }
+ }, {
+ 'url': 'https://in.lifestyle.yahoo.com/video/connect-dots-dark-side-virgo-090247395.html',
+ 'md5': 'd9a083ccf1379127bf25699d67e4791b',
+ 'info_dict': {
+ 'id': '52aeeaa3-b3d1-30d8-9ef8-5d0cf05efb7c',
+ 'ext': 'mp4',
+ 'title': 'Connect the Dots: Dark Side of Virgo',
+ 'description': 'md5:1428185051cfd1949807ad4ff6d3686a',
+ 'duration': 201,
+ }
+ }, {
+ 'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html',
+ 'only_matching': True,
+ }
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
url = mobj.group('url')
- webpage = self._download_webpage(url, video_id)
+ host = mobj.group('host')
+ webpage = self._download_webpage(url, display_id)
+
+ # Look for iframed media first
+ iframe_m = re.search(r'<iframe[^>]+src="(/video/.+?-\d+\.html\?format=embed.*?)"', webpage)
+ if iframe_m:
+ iframepage = self._download_webpage(
+ host + iframe_m.group(1), display_id, 'Downloading iframe webpage')
+ items_json = self._search_regex(
+ r'mediaItems: (\[.+?\])$', iframepage, 'items', flags=re.MULTILINE, default=None)
+ if items_json:
+ items = json.loads(items_json)
+ video_id = items[0]['id']
+ return self._get_info(video_id, display_id, webpage)
items_json = self._search_regex(
r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE,
@@ -64,20 +148,22 @@ class YahooIE(InfoExtractor):
r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"',
r'"first_videoid"\s*:\s*"([^"]+)"',
]
- long_id = self._search_regex(CONTENT_ID_REGEXES, webpage, 'content ID')
- video_id = long_id
+ video_id = self._search_regex(CONTENT_ID_REGEXES, webpage, 'content ID')
else:
items = json.loads(items_json)
info = items['mediaItems']['query']['results']['mediaObj'][0]
# The 'meta' field is not always in the video webpage, we request it
# from another page
- long_id = info['id']
- return self._get_info(long_id, video_id, webpage)
+ video_id = info['id']
+ return self._get_info(video_id, display_id, webpage)
- def _get_info(self, long_id, video_id, webpage):
+ def _get_info(self, video_id, display_id, webpage):
+ region = self._search_regex(
+ r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"',
+ webpage, 'region', fatal=False, default='US')
query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"'
- ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"'
- ' AND protocol="http"' % long_id)
+ ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="%s"'
+ ' AND protocol="http"' % (video_id, region))
data = compat_urllib_parse.urlencode({
'q': query,
'env': 'prod',
@@ -85,9 +171,17 @@ class YahooIE(InfoExtractor):
})
query_result = self._download_json(
'http://video.query.yahoo.com/v1/public/yql?' + data,
- video_id, 'Downloading video info')
+ display_id, 'Downloading video info')
+
info = query_result['query']['results']['mediaObj'][0]
- meta = info['meta']
+ meta = info.get('meta')
+
+ if not meta:
+ msg = info['status'].get('msg')
+ if msg:
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, msg), expected=True)
+ raise ExtractorError('Unable to extract media object meta')
formats = []
for s in info['streams']:
@@ -114,36 +208,15 @@ class YahooIE(InfoExtractor):
return {
'id': video_id,
+ 'display_id': display_id,
'title': meta['title'],
'formats': formats,
'description': clean_html(meta['description']),
'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage),
+ 'duration': int_or_none(meta.get('duration')),
}
-class YahooNewsIE(YahooIE):
- IE_NAME = 'yahoo:news'
- _VALID_URL = r'http://news\.yahoo\.com/video/.*?-(?P<id>\d*?)\.html'
-
- _TESTS = [{
- 'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
- 'md5': '67010fdf3a08d290e060a4dd96baa07b',
- 'info_dict': {
- 'id': '104538833',
- 'ext': 'mp4',
- 'title': 'China Moses Is Crazy About the Blues',
- 'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0',
- },
- }]
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- webpage = self._download_webpage(url, video_id)
- long_id = self._search_regex(r'contentId: \'(.+?)\',', webpage, 'long id')
- return self._get_info(long_id, video_id, webpage)
-
-
class YahooSearchIE(SearchInfoExtractor):
IE_DESC = 'Yahoo screen search'
_MAX_RESULTS = 1000