about summary refs log tree commit diff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
author Ismael Mejia <iemejia@gmail.com> 2013-09-06 23:23:23 +0200
committer Ismael Mejia <iemejia@gmail.com> 2013-09-06 23:24:41 +0200
commit 72836fcee453386f4f16325c5b8fa4c1ba1bb442 (patch)
tree 58efd36f4a56269a07774969e2ac385aacf8eae6 /youtube_dl/extractor
parent d6e203b3dcef8f291b57021903e629d3e30e1f0b (diff)
parent a7130543fa0368175740f5fa173ef920671db866 (diff)
Merge branch 'master' into subtitles_rework
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- youtube_dl/extractor/__init__.py 22
-rw-r--r-- youtube_dl/extractor/addanime.py 75
-rw-r--r-- youtube_dl/extractor/appletrailers.py 166
-rw-r--r-- youtube_dl/extractor/c56.py 4
-rw-r--r-- youtube_dl/extractor/canalc2.py 35
-rw-r--r-- youtube_dl/extractor/canalplus.py 2
-rw-r--r-- youtube_dl/extractor/cnn.py 58
-rw-r--r-- youtube_dl/extractor/common.py 16
-rw-r--r-- youtube_dl/extractor/dailymotion.py 8
-rw-r--r-- youtube_dl/extractor/daum.py 74
-rw-r--r-- youtube_dl/extractor/defense.py 39
-rw-r--r-- youtube_dl/extractor/generic.py 18
-rw-r--r-- youtube_dl/extractor/googleplus.py 4
-rw-r--r-- youtube_dl/extractor/hark.py 37
-rw-r--r-- youtube_dl/extractor/ign.py 8
-rw-r--r-- youtube_dl/extractor/kankan.py 6
-rw-r--r-- youtube_dl/extractor/metacafe.py 2
-rw-r--r-- youtube_dl/extractor/metacritic.py 55
-rw-r--r-- youtube_dl/extractor/mit.py 74
-rw-r--r-- youtube_dl/extractor/naver.py 73
-rw-r--r-- youtube_dl/extractor/nbc.py 33
-rw-r--r-- youtube_dl/extractor/orf.py 54
-rw-r--r-- youtube_dl/extractor/ro220.py 42
-rw-r--r-- youtube_dl/extractor/rtlnow.py 17
-rw-r--r-- youtube_dl/extractor/sohu.py 90
-rw-r--r-- youtube_dl/extractor/trilulilu.py 73
-rw-r--r-- youtube_dl/extractor/unistra.py 2
-rw-r--r-- youtube_dl/extractor/veehd.py 56
-rw-r--r-- youtube_dl/extractor/vimeo.py 48
-rw-r--r-- youtube_dl/extractor/wat.py 1
-rw-r--r-- youtube_dl/extractor/xhamster.py 18
-rw-r--r-- youtube_dl/extractor/youporn.py 18
-rw-r--r-- youtube_dl/extractor/youtube.py 89
33 files changed, 1241 insertions, 76 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index b4db8f0bf..fbe0b8cb7 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -1,3 +1,5 @@
+from .appletrailers import AppleTrailersIE
+from .addanime import AddAnimeIE
from .archiveorg import ArchiveOrgIE
from .ard import ARDIE
from .arte import ArteTvIE
@@ -6,16 +8,21 @@ from .bandcamp import BandcampIE
from .bliptv import BlipTVIE, BlipTVUserIE
from .breakcom import BreakIE
from .brightcove import BrightcoveIE
+from .c56 import C56IE
from .canalplus import CanalplusIE
+from .canalc2 import Canalc2IE
+from .cnn import CNNIE
from .collegehumor import CollegeHumorIE
from .comedycentral import ComedyCentralIE
from .condenast import CondeNastIE
from .criterion import CriterionIE
from .cspan import CSpanIE
from .dailymotion import DailymotionIE, DailymotionPlaylistIE
+from .daum import DaumIE
from .depositfiles import DepositFilesIE
from .dotsub import DotsubIE
from .dreisat import DreiSatIE
+from .defense import DefenseGouvFrIE
from .ehow import EHowIE
from .eighttracks import EightTracksIE
from .escapist import EscapistIE
@@ -29,6 +36,7 @@ from .gametrailers import GametrailersIE
from .generic import GenericIE
from .googleplus import GooglePlusIE
from .googlesearch import GoogleSearchIE
+from .hark import HarkIE
from .hotnewhiphop import HotNewHipHopIE
from .howcast import HowcastIE
from .hypem import HypemIE
@@ -44,23 +52,30 @@ from .keek import KeekIE
from .liveleak import LiveLeakIE
from .livestream import LivestreamIE
from .metacafe import MetacafeIE
+from .metacritic import MetacriticIE
+from .mit import TechTVMITIE, MITIE
from .mixcloud import MixcloudIE
from .mtv import MTVIE
from .muzu import MuzuTVIE
from .myspass import MySpassIE
from .myvideo import MyVideoIE
+from .naver import NaverIE
from .nba import NBAIE
+from .nbc import NBCNewsIE
from .ooyala import OoyalaIE
+from .orf import ORFIE
from .pbs import PBSIE
from .photobucket import PhotobucketIE
from .pornotube import PornotubeIE
from .rbmaradio import RBMARadioIE
from .redtube import RedTubeIE
from .ringtv import RingTVIE
+from .ro220 import Ro220IE
from .roxwel import RoxwelIE
from .rtlnow import RTLnowIE
from .sina import SinaIE
from .slashdot import SlashdotIE
+from .sohu import SohuIE
from .soundcloud import SoundcloudIE, SoundcloudSetIE
from .spiegel import SpiegelIE
from .stanfordoc import StanfordOpenClassroomIE
@@ -71,18 +86,19 @@ from .ted import TEDIE
from .tf1 import TF1IE
from .thisav import ThisAVIE
from .traileraddict import TrailerAddictIE
+from .trilulilu import TriluliluIE
from .tudou import TudouIE
from .tumblr import TumblrIE
from .tutv import TutvIE
-from .ustream import UstreamIE
from .unistra import UnistraIE
+from .ustream import UstreamIE
from .vbox7 import Vbox7IE
+from .veehd import VeeHDIE
from .veoh import VeohIE
from .vevo import VevoIE
from .videofyme import VideofyMeIE
from .vimeo import VimeoIE, VimeoChannelIE
from .vine import VineIE
-from .c56 import C56IE
from .wat import WatIE
from .weibo import WeiboIE
from .wimp import WimpIE
@@ -116,12 +132,14 @@ _ALL_CLASSES = [
]
_ALL_CLASSES.append(GenericIE)
+
def gen_extractors():
""" Return a list of an instance of every supported extractor.
The order does matter; the first extractor matched is the one handling the URL.
"""
return [klass() for klass in _ALL_CLASSES]
+
def get_info_extractor(ie_name):
"""Returns the info extractor class with the given ie_name"""
return globals()[ie_name+'IE']
diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py
new file mode 100644
index 000000000..82a785a19
--- /dev/null
+++ b/youtube_dl/extractor/addanime.py
@@ -0,0 +1,75 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_HTTPError,
+ compat_str,
+ compat_urllib_parse,
+ compat_urllib_parse_urlparse,
+
+ ExtractorError,
+)
+
+
+class AddAnimeIE(InfoExtractor):
+
+ _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)'
+ IE_NAME = u'AddAnime'
+ _TEST = {
+ u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9',
+ u'file': u'24MR3YO5SAS9.flv',
+ u'md5': u'1036a0e0cd307b95bd8a8c3a5c8cfaf1',
+ u'info_dict': {
+ u"description": u"One Piece 606",
+ u"title": u"One Piece 606"
+ }
+ }
+
+ def _real_extract(self, url):
+ try:
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('video_id')
+ webpage = self._download_webpage(url, video_id)
+ except ExtractorError as ee:
+ if not isinstance(ee.cause, compat_HTTPError):
+ raise
+
+ redir_webpage = ee.cause.read().decode('utf-8')
+ action = self._search_regex(
+ r'<form id="challenge-form" action="([^"]+)"',
+ redir_webpage, u'Redirect form')
+ vc = self._search_regex(
+ r'<input type="hidden" name="jschl_vc" value="([^"]+)"/>',
+ redir_webpage, u'redirect vc value')
+ av = re.search(
+ r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);',
+ redir_webpage)
+ if av is None:
+ raise ExtractorError(u'Cannot find redirect math task')
+ av_res = int(av.group(1)) + int(av.group(2)) * int(av.group(3))
+
+ parsed_url = compat_urllib_parse_urlparse(url)
+ av_val = av_res + len(parsed_url.netloc)
+ confirm_url = (
+ parsed_url.scheme + u'://' + parsed_url.netloc +
+ action + '?' +
+ compat_urllib_parse.urlencode({
+ 'jschl_vc': vc, 'jschl_answer': compat_str(av_val)}))
+ self._download_webpage(
+ confirm_url, video_id,
+ note=u'Confirming after redirect')
+ webpage = self._download_webpage(url, video_id)
+
+ video_url = self._search_regex(r"var normal_video_file = '(.*?)';",
+ webpage, u'video file URL')
+ video_title = self._og_search_title(webpage)
+ video_description = self._og_search_description(webpage)
+
+ return {
+ '_type': 'video',
+ 'id': video_id,
+ 'url': video_url,
+ 'ext': 'flv',
+ 'title': video_title,
+ 'description': video_description
+ }
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
new file mode 100644
index 000000000..8b191c196
--- /dev/null
+++ b/youtube_dl/extractor/appletrailers.py
@@ -0,0 +1,166 @@
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+)
+
+
+class AppleTrailersIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?trailers.apple.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)'
+ _TEST = {
+ u"url": u"http://trailers.apple.com/trailers/wb/manofsteel/",
+ u"playlist": [
+ {
+ u"file": u"manofsteel-trailer4.mov",
+ u"md5": u"11874af099d480cc09e103b189805d5f",
+ u"info_dict": {
+ u"duration": 111,
+ u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_11624.jpg",
+ u"title": u"Trailer 4",
+ u"upload_date": u"20130523",
+ u"uploader_id": u"wb",
+ },
+ },
+ {
+ u"file": u"manofsteel-trailer3.mov",
+ u"md5": u"07a0a262aae5afe68120eed61137ab34",
+ u"info_dict": {
+ u"duration": 182,
+ u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_10793.jpg",
+ u"title": u"Trailer 3",
+ u"upload_date": u"20130417",
+ u"uploader_id": u"wb",
+ },
+ },
+ {
+ u"file": u"manofsteel-trailer.mov",
+ u"md5": u"e401fde0813008e3307e54b6f384cff1",
+ u"info_dict": {
+ u"duration": 148,
+ u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_8703.jpg",
+ u"title": u"Trailer",
+ u"upload_date": u"20121212",
+ u"uploader_id": u"wb",
+ },
+ },
+ {
+ u"file": u"manofsteel-teaser.mov",
+ u"md5": u"76b392f2ae9e7c98b22913c10a639c97",
+ u"info_dict": {
+ u"duration": 93,
+ u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_6899.jpg",
+ u"title": u"Teaser",
+ u"upload_date": u"20120721",
+ u"uploader_id": u"wb",
+ },
+ }
+ ]
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ movie = mobj.group('movie')
+ uploader_id = mobj.group('company')
+
+ playlist_url = url.partition(u'?')[0] + u'/includes/playlists/web.inc'
+ playlist_snippet = self._download_webpage(playlist_url, movie)
+ playlist_cleaned = re.sub(r'(?s)<script>.*?</script>', u'', playlist_snippet)
+ playlist_html = u'<html>' + playlist_cleaned + u'</html>'
+
+ size_cache = {}
+
+ doc = xml.etree.ElementTree.fromstring(playlist_html)
+ playlist = []
+ for li in doc.findall('./div/ul/li'):
+ title = li.find('.//h3').text
+ video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()
+ thumbnail = li.find('.//img').attrib['src']
+
+ date_el = li.find('.//p')
+ upload_date = None
+ m = re.search(r':\s?(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<year>[0-9]{2})', date_el.text)
+ if m:
+ upload_date = u'20' + m.group('year') + m.group('month') + m.group('day')
+ runtime_el = date_el.find('./br')
+ m = re.search(r':\s?(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime_el.tail)
+ duration = None
+ if m:
+ duration = 60 * int(m.group('minutes')) + int(m.group('seconds'))
+
+ formats = []
+ for formats_el in li.findall('.//a'):
+ if formats_el.attrib['class'] != 'OverlayPanel':
+ continue
+ target = formats_el.attrib['target']
+
+ format_code = formats_el.text
+ if 'Automatic' in format_code:
+ continue
+
+ size_q = formats_el.attrib['href']
+ size_id = size_q.rpartition('#videos-')[2]
+ if size_id not in size_cache:
+ size_url = url + size_q
+ sizepage_html = self._download_webpage(
+ size_url, movie,
+ note=u'Downloading size info %s' % size_id,
+ errnote=u'Error while downloading size info %s' % size_id,
+ )
+ _doc = xml.etree.ElementTree.fromstring(sizepage_html)
+ size_cache[size_id] = _doc
+
+ sizepage_doc = size_cache[size_id]
+ links = sizepage_doc.findall('.//{http://www.w3.org/1999/xhtml}ul/{http://www.w3.org/1999/xhtml}li/{http://www.w3.org/1999/xhtml}a')
+ for vid_a in links:
+ href = vid_a.get('href')
+ if not href.endswith(target):
+ continue
+ detail_q = href.partition('#')[0]
+ detail_url = url + '/' + detail_q
+
+ m = re.match(r'includes/(?P<detail_id>[^/]+)/', detail_q)
+ detail_id = m.group('detail_id')
+
+ detail_html = self._download_webpage(
+ detail_url, movie,
+ note=u'Downloading detail %s %s' % (detail_id, size_id),
+ errnote=u'Error while downloading detail %s %s' % (detail_id, size_id)
+ )
+ detail_doc = xml.etree.ElementTree.fromstring(detail_html)
+ movie_link_el = detail_doc.find('.//{http://www.w3.org/1999/xhtml}a')
+ assert movie_link_el.get('class') == 'movieLink'
+ movie_link = movie_link_el.get('href').partition('?')[0].replace('_', '_h')
+ ext = determine_ext(movie_link)
+ assert ext == 'mov'
+
+ formats.append({
+ 'format': format_code,
+ 'ext': ext,
+ 'url': movie_link,
+ })
+
+ info = {
+ '_type': 'video',
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'title': title,
+ 'duration': duration,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'uploader_id': uploader_id,
+ 'user_agent': 'QuickTime compatible (youtube-dl)',
+ }
+ # TODO: Remove when #980 has been merged
+ info['url'] = formats[-1]['url']
+ info['ext'] = formats[-1]['ext']
+
+ playlist.append(info)
+
+ return {
+ '_type': 'playlist',
+ 'id': movie,
+ 'entries': playlist,
+ }
diff --git a/youtube_dl/extractor/c56.py b/youtube_dl/extractor/c56.py
index 4c8a8af09..dc3a8d47d 100644
--- a/youtube_dl/extractor/c56.py
+++ b/youtube_dl/extractor/c56.py
@@ -12,8 +12,8 @@ class C56IE(InfoExtractor):
_TEST ={
u'url': u'http://www.56.com/u39/v_OTM0NDA3MTY.html',
- u'file': u'93440716.mp4',
- u'md5': u'9dc07b5c8e978112a6441f9e75d2b59e',
+ u'file': u'93440716.flv',
+ u'md5': u'e59995ac63d0457783ea05f93f12a866',
u'info_dict': {
u'title': u'网事知多少 第32期:车怒',
},
diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py
new file mode 100644
index 000000000..50832217a
--- /dev/null
+++ b/youtube_dl/extractor/canalc2.py
@@ -0,0 +1,35 @@
+# coding: utf-8
+import re
+
+from .common import InfoExtractor
+
+
+class Canalc2IE(InfoExtractor):
+ _IE_NAME = 'canalc2.tv'
+ _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?idVideo=(\d+)&voir=oui'
+
+ _TEST = {
+ u'url': u'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui',
+ u'file': u'12163.mp4',
+ u'md5': u'060158428b650f896c542dfbb3d6487f',
+ u'info_dict': {
+ u'title': u'Terrasses du Numérique'
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = re.match(self._VALID_URL, url).group(1)
+ webpage = self._download_webpage(url, video_id)
+ file_name = self._search_regex(
+ r"so\.addVariable\('file','(.*?)'\);",
+ webpage, 'file name')
+ video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name
+
+ title = self._html_search_regex(
+ r'class="evenement8">(.*?)</a>', webpage, u'title')
+
+ return {'id': video_id,
+ 'ext': 'mp4',
+ 'url': video_url,
+ 'title': title,
+ }
diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py
index 3b1c88876..1f02519a0 100644
--- a/youtube_dl/extractor/canalplus.py
+++ b/youtube_dl/extractor/canalplus.py
@@ -5,7 +5,7 @@ from .common import InfoExtractor
from ..utils import unified_strdate
class CanalplusIE(InfoExtractor):
- _VALID_URL = r'https?://www\.canalplus\.fr/.*?\?vid=(?P<id>\d+)'
+ _VALID_URL = r'https?://(www\.canalplus\.fr/.*?\?vid=|player\.canalplus\.fr/#/)(?P<id>\d+)'
_VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s'
IE_NAME = u'canalplus.fr'
diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py
new file mode 100644
index 000000000..a79f881cd
--- /dev/null
+++ b/youtube_dl/extractor/cnn.py
@@ -0,0 +1,58 @@
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import determine_ext
+
+
+class CNNIE(InfoExtractor):
+ _VALID_URL = r'''(?x)https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/
+ (?P<path>.+?/(?P<title>[^/]+?)(?:\.cnn|(?=&)))'''
+
+ _TESTS = [{
+ u'url': u'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
+ u'file': u'sports_2013_06_09_nadal-1-on-1.cnn.mp4',
+ u'md5': u'3e6121ea48df7e2259fe73a0628605c4',
+ u'info_dict': {
+ u'title': u'Nadal wins 8th French Open title',
+ u'description': u'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.',
+ },
+ },
+ {
+ u"url": u"http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29",
+ u"file": u"us_2013_08_21_sot-student-gives-epic-speech.georgia-institute-of-technology.mp4",
+ u"md5": u"b5cc60c60a3477d185af8f19a2a26f4e",
+ u"info_dict": {
+ u"title": "Student's epic speech stuns new freshmen",
+ u"description": "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\""
+ }
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ path = mobj.group('path')
+ page_title = mobj.group('title')
+ info_url = u'http://cnn.com/video/data/3.0/%s/index.xml' % path
+ info_xml = self._download_webpage(info_url, page_title)
+ info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
+
+ formats = []
+ for f in info.findall('files/file'):
+ mf = re.match(r'(\d+)x(\d+)(?:_(.*)k)?',f.attrib['bitrate'])
+ if mf is not None:
+ formats.append((int(mf.group(1)), int(mf.group(2)), int(mf.group(3) or 0), f.text))
+ formats = sorted(formats)
+ (_,_,_, video_path) = formats[-1]
+ video_url = 'http://ht.cdn.turner.com/cnn/big%s' % video_path
+
+ thumbnails = sorted([((int(t.attrib['height']),int(t.attrib['width'])), t.text) for t in info.findall('images/image')])
+ thumbs_dict = [{'resolution': res, 'url': t_url} for (res, t_url) in thumbnails]
+
+ return {'id': info.attrib['id'],
+ 'title': info.find('headline').text,
+ 'url': video_url,
+ 'ext': determine_ext(video_url),
+ 'thumbnail': thumbnails[-1][1],
+ 'thumbnails': thumbs_dict,
+ 'description': info.find('description').text,
+ }
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 52c4483c9..77726ee24 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -114,6 +114,11 @@ class InfoExtractor(object):
"""Real extraction process. Redefine in subclasses."""
pass
+ @classmethod
+ def ie_key(cls):
+ """A string for getting the InfoExtractor with get_info_extractor"""
+ return cls.__name__[:-2]
+
@property
def IE_NAME(self):
return type(self).__name__[:-2]
@@ -129,7 +134,7 @@ class InfoExtractor(object):
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
if errnote is None:
errnote = u'Unable to download webpage'
- raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
+ raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
""" Returns a tuple (page content as string, URL handle) """
@@ -140,12 +145,17 @@ class InfoExtractor(object):
urlh = self._request_webpage(url_or_request, video_id, note, errnote)
content_type = urlh.headers.get('Content-Type', '')
+ webpage_bytes = urlh.read()
m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
if m:
encoding = m.group(1)
else:
- encoding = 'utf-8'
- webpage_bytes = urlh.read()
+ m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
+ webpage_bytes[:1024])
+ if m:
+ encoding = m.group(1).decode('ascii')
+ else:
+ encoding = 'utf-8'
if self._downloader.params.get('dump_intermediate_pages', False):
try:
url = url_or_request.get_full_url()
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 003b1d8c3..f7dffd4cc 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -37,14 +37,14 @@ class DailyMotionSubtitlesIE(NoAutoSubtitlesIE):
class DailymotionIE(DailyMotionSubtitlesIE, InfoExtractor):
"""Information Extractor for Dailymotion"""
- _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
+ _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)'
IE_NAME = u'dailymotion'
_TEST = {
u'url': u'http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech',
u'file': u'x33vw9.mp4',
u'md5': u'392c4b85a60a90dc4792da41ce3144eb',
u'info_dict': {
- u"uploader": u"Alex and Van .",
+ u"uploader": u"Amphora Alex and Van .",
u"title": u"Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\""
}
}
@@ -56,6 +56,7 @@ class DailymotionIE(DailyMotionSubtitlesIE, InfoExtractor):
video_id = mobj.group(1).split('_')[0].split('?')[0]
video_extension = 'mp4'
+ url = 'http://www.dailymotion.com/video/%s' % video_id
# Retrieve video webpage to extract further information
request = compat_urllib_request.Request(url)
@@ -78,7 +79,8 @@ class DailymotionIE(DailyMotionSubtitlesIE, InfoExtractor):
embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id
embed_page = self._download_webpage(embed_url, video_id,
u'Downloading embed page')
- info = self._search_regex(r'var info = ({.*?}),', embed_page, 'video info')
+ info = self._search_regex(r'var info = ({.*?}),$', embed_page,
+ 'video info', flags=re.MULTILINE)
info = json.loads(info)
# TODO: support choosing qualities
diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py
new file mode 100644
index 000000000..a804e83bd
--- /dev/null
+++ b/youtube_dl/extractor/daum.py
@@ -0,0 +1,74 @@
+# encoding: utf-8
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse,
+ determine_ext,
+)
+
+
+class DaumIE(InfoExtractor):
+ _VALID_URL = r'https?://tvpot\.daum\.net/.*?clipid=(?P<id>\d+)'
+ IE_NAME = u'daum.net'
+
+ _TEST = {
+ u'url': u'http://tvpot.daum.net/clip/ClipView.do?clipid=52554690',
+ u'file': u'52554690.mp4',
+ u'info_dict': {
+ u'title': u'DOTA 2GETHER 시즌2 6회 - 2부',
+ u'description': u'DOTA 2GETHER 시즌2 6회 - 2부',
+ u'upload_date': u'20130831',
+ u'duration': 3868,
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group(1)
+ canonical_url = 'http://tvpot.daum.net/v/%s' % video_id
+ webpage = self._download_webpage(canonical_url, video_id)
+ full_id = self._search_regex(r'<link rel="video_src" href=".+?vid=(.+?)"',
+ webpage, u'full id')
+ query = compat_urllib_parse.urlencode({'vid': full_id})
+ info_xml = self._download_webpage(
+ 'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id,
+ u'Downloading video info')
+ urls_xml = self._download_webpage(
+ 'http://videofarm.daum.net/controller/api/open/v1_2/MovieData.apixml?' + query,
+ video_id, u'Downloading video formats info')
+ info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
+ urls = xml.etree.ElementTree.fromstring(urls_xml.encode('utf-8'))
+
+ self.to_screen(u'%s: Getting video urls' % video_id)
+ formats = []
+ for format_el in urls.findall('result/output_list/output_list'):
+ profile = format_el.attrib['profile']
+ format_query = compat_urllib_parse.urlencode({
+ 'vid': full_id,
+ 'profile': profile,
+ })
+ url_xml = self._download_webpage(
+ 'http://videofarm.daum.net/controller/api/open/v1_2/MovieLocation.apixml?' + format_query,
+ video_id, note=False)
+ url_doc = xml.etree.ElementTree.fromstring(url_xml.encode('utf-8'))
+ format_url = url_doc.find('result/url').text
+ formats.append({
+ 'url': format_url,
+ 'ext': determine_ext(format_url),
+ 'format_id': profile,
+ })
+
+ info = {
+ 'id': video_id,
+ 'title': info.find('TITLE').text,
+ 'formats': formats,
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'description': info.find('CONTENTS').text,
+ 'duration': int(info.find('DURATION').text),
+ 'upload_date': info.find('REGDTTM').text[:8],
+ }
+ # TODO: Remove when #980 has been merged
+ info.update(formats[-1])
+ return info
diff --git a/youtube_dl/extractor/defense.py b/youtube_dl/extractor/defense.py
new file mode 100644
index 000000000..424d960da
--- /dev/null
+++ b/youtube_dl/extractor/defense.py
@@ -0,0 +1,39 @@
+import re
+import json
+
+from .common import InfoExtractor
+
+
+class DefenseGouvFrIE(InfoExtractor):
+ _IE_NAME = 'defense.gouv.fr'
+ _VALID_URL = (r'http://.*?\.defense\.gouv\.fr/layout/set/'
+ r'ligthboxvideo/base-de-medias/webtv/(.*)')
+
+ _TEST = {
+ u'url': (u'http://www.defense.gouv.fr/layout/set/ligthboxvideo/'
+ u'base-de-medias/webtv/attaque-chimique-syrienne-du-21-aout-2013-1'),
+ u'file': u'11213.mp4',
+ u'md5': u'75bba6124da7e63d2d60b5244ec9430c',
+ "info_dict": {
+ "title": "attaque-chimique-syrienne-du-21-aout-2013-1"
+ }
+ }
+
+ def _real_extract(self, url):
+ title = re.match(self._VALID_URL, url).group(1)
+ webpage = self._download_webpage(url, title)
+ video_id = self._search_regex(
+ r"flashvars.pvg_id=\"(\d+)\";",
+ webpage, 'ID')
+
+ json_url = ('http://static.videos.gouv.fr/brightcovehub/export/json/'
+ + video_id)
+ info = self._download_webpage(json_url, title,
+ 'Downloading JSON config')
+ video_url = json.loads(info)['renditions'][0]['url']
+
+ return {'id': video_id,
+ 'ext': 'mp4',
+ 'url': video_url,
+ 'title': title,
+ }
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index da016f7ee..f92e61fea 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -8,11 +8,13 @@ from ..utils import (
compat_urllib_error,
compat_urllib_parse,
compat_urllib_request,
+ compat_urlparse,
ExtractorError,
)
from .brightcove import BrightcoveIE
+
class GenericIE(InfoExtractor):
IE_DESC = u'Generic downloader that works on some sites'
_VALID_URL = r'.*'
@@ -23,7 +25,7 @@ class GenericIE(InfoExtractor):
u'file': u'13601338388002.mp4',
u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89',
u'info_dict': {
- u"uploader": u"www.hodiho.fr",
+ u"uploader": u"www.hodiho.fr",
u"title": u"R\u00e9gis plante sa Jeep"
}
},
@@ -107,6 +109,11 @@ class GenericIE(InfoExtractor):
return new_url
def _real_extract(self, url):
+ parsed_url = compat_urlparse.urlparse(url)
+ if not parsed_url.scheme:
+ self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
+ return self.url_result('http://' + url)
+
try:
new_url = self._test_redirect(url)
if new_url:
@@ -124,7 +131,7 @@ class GenericIE(InfoExtractor):
raise ExtractorError(u'Invalid URL: %s' % url)
self.report_extraction(video_id)
- # Look for BrigthCove:
+ # Look for BrightCove:
m_brightcove = re.search(r'<object.+?class=([\'"]).*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL)
if m_brightcove is not None:
self.to_screen(u'Brightcove video detected.')
@@ -151,7 +158,7 @@ class GenericIE(InfoExtractor):
mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
if mobj is None:
# HTML5 video
- mobj = re.search(r'<video[^<]*>.*?<source .*?src="([^"]+)"', webpage, flags=re.DOTALL)
+ mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL)
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
@@ -160,8 +167,9 @@ class GenericIE(InfoExtractor):
if mobj.group(1) is None:
raise ExtractorError(u'Invalid URL: %s' % url)
- video_url = compat_urllib_parse.unquote(mobj.group(1))
- video_id = os.path.basename(video_url)
+ video_url = mobj.group(1)
+ video_url = compat_urlparse.urljoin(url, video_url)
+ video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
# here's a fun little line of code for you:
video_extension = os.path.splitext(video_id)[1][1:]
diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py
index 9f7fc19a4..f1cd88983 100644
--- a/youtube_dl/extractor/googleplus.py
+++ b/youtube_dl/extractor/googleplus.py
@@ -57,8 +57,8 @@ class GooglePlusIE(InfoExtractor):
webpage, 'title', default=u'NA')
# Step 2, Simulate clicking the image box to launch video
- DOMAIN = 'https://plus.google.com'
- video_page = self._search_regex(r'<a href="((?:%s)?/photos/.*?)"' % re.escape(DOMAIN),
+ DOMAIN = 'https://plus.google.com/'
+ video_page = self._search_regex(r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN),
webpage, u'video page URL')
if not video_page.startswith(DOMAIN):
video_page = DOMAIN + video_page
diff --git a/youtube_dl/extractor/hark.py b/youtube_dl/extractor/hark.py
new file mode 100644
index 000000000..5bdd08afa
--- /dev/null
+++ b/youtube_dl/extractor/hark.py
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import determine_ext
+
+class HarkIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.hark\.com/clips/(.+?)-.+'
+ _TEST = {
+ u'url': u'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013',
+ u'file': u'mmbzyhkgny.mp3',
+ u'md5': u'6783a58491b47b92c7c1af5a77d4cbee',
+ u'info_dict': {
+ u'title': u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' on May 23, 2013",
+ u'description': u'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.',
+ u'duration': 11,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group(1)
+ json_url = "http://www.hark.com/clips/%s.json" %(video_id)
+ info_json = self._download_webpage(json_url, video_id)
+ info = json.loads(info_json)
+ final_url = info['url']
+
+ return {'id': video_id,
+ 'url' : final_url,
+ 'title': info['name'],
+ 'ext': determine_ext(final_url),
+ 'description': info['description'],
+ 'thumbnail': info['image_original'],
+ 'duration': info['duration'],
+ }
diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py
index 62abab655..b1c84278a 100644
--- a/youtube_dl/extractor/ign.py
+++ b/youtube_dl/extractor/ign.py
@@ -13,7 +13,7 @@ class IGNIE(InfoExtractor):
Some videos of it.ign.com are also supported
"""
- _VALID_URL = r'https?://.+?\.ign\.com/(?:videos|show_videos)(/.+)?/(?P<name_or_id>.+)'
+ _VALID_URL = r'https?://.+?\.ign\.com/(?P<type>videos|show_videos|articles)(/.+)?/(?P<name_or_id>.+)'
IE_NAME = u'ign.com'
_CONFIG_URL_TEMPLATE = 'http://www.ign.com/videos/configs/id/%s.config'
@@ -41,7 +41,11 @@ class IGNIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
name_or_id = mobj.group('name_or_id')
+ page_type = mobj.group('type')
webpage = self._download_webpage(url, name_or_id)
+ if page_type == 'articles':
+ video_url = self._search_regex(r'var videoUrl = "(.+?)"', webpage, u'video url')
+ return self.url_result(video_url, ie='IGN')
video_id = self._find_video_id(webpage)
result = self._get_video_info(video_id)
description = self._html_search_regex(self._DESCRIPTION_RE,
@@ -68,7 +72,7 @@ class IGNIE(InfoExtractor):
class OneUPIE(IGNIE):
"""Extractor for 1up.com, it uses the ign videos system."""
- _VALID_URL = r'https?://gamevideos.1up.com/video/id/(?P<name_or_id>.+)'
+ _VALID_URL = r'https?://gamevideos.1up.com/(?P<type>video)/id/(?P<name_or_id>.+)'
IE_NAME = '1up.com'
_DESCRIPTION_RE = r'<div id="vid_summary">(.+?)</div>'
diff --git a/youtube_dl/extractor/kankan.py b/youtube_dl/extractor/kankan.py
index 8537ba584..445d46501 100644
--- a/youtube_dl/extractor/kankan.py
+++ b/youtube_dl/extractor/kankan.py
@@ -21,8 +21,10 @@ class KankanIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- title = self._search_regex(r'G_TITLE=[\'"](.+?)[\'"]', webpage, u'video title')
- gcid = self._search_regex(r'lurl:[\'"]http://.+?/.+?/(.+?)/', webpage, u'gcid')
+ title = self._search_regex(r'(?:G_TITLE=|G_MOVIE_TITLE = )[\'"](.+?)[\'"]', webpage, u'video title')
+ surls = re.search(r'surls:\[\'.+?\'\]|lurl:\'.+?\.flv\'', webpage).group(0)
+ gcids = re.findall(r"http://.+?/.+?/(.+?)/", surls)
+ gcid = gcids[-1]
video_info_page = self._download_webpage('http://p2s.cl.kankan.com/getCdnresource_flv?gcid=%s' % gcid,
video_id, u'Downloading video url info')
diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py
index e38dc98b4..e537648ff 100644
--- a/youtube_dl/extractor/metacafe.py
+++ b/youtube_dl/extractor/metacafe.py
@@ -122,7 +122,7 @@ class MetacafeIE(InfoExtractor):
video_title = self._html_search_regex(r'(?im)<title>(.*) - Video</title>', webpage, u'title')
description = self._og_search_description(webpage)
video_uploader = self._html_search_regex(
- r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("channel","([^"]+)"\);',
+ r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
webpage, u'uploader nickname', fatal=False)
return {
diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py
new file mode 100644
index 000000000..449138b56
--- /dev/null
+++ b/youtube_dl/extractor/metacritic.py
@@ -0,0 +1,55 @@
+import re
+import xml.etree.ElementTree
+import operator
+
+from .common import InfoExtractor
+
+
+class MetacriticIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.metacritic\.com/.+?/trailers/(?P<id>\d+)'
+
+ _TEST = {
+ u'url': u'http://www.metacritic.com/game/playstation-4/infamous-second-son/trailers/3698222',
+ u'file': u'3698222.mp4',
+ u'info_dict': {
+ u'title': u'inFamous: Second Son - inSide Sucker Punch: Smoke & Mirrors',
+ u'description': u'Take a peak behind-the-scenes to see how Sucker Punch brings smoke into the universe of inFAMOUS Second Son on the PS4.',
+ u'duration': 221,
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ webpage = self._download_webpage(url, video_id)
+ # The XML is not well-formed: it contains raw '&' characters
+ info_xml = self._download_webpage('http://www.metacritic.com/video_data?video=' + video_id,
+ video_id, u'Downloading info xml').replace('&', '&amp;')
+ info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
+
+ clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id)
+ formats = []
+ for videoFile in clip.findall('httpURI/videoFile'):
+ rate_str = videoFile.find('rate').text
+ video_url = videoFile.find('filePath').text
+ formats.append({
+ 'url': video_url,
+ 'ext': 'mp4',
+ 'format_id': rate_str,
+ 'rate': int(rate_str),
+ })
+ formats.sort(key=operator.itemgetter('rate'))
+
+ description = self._html_search_regex(r'<b>Description:</b>(.*?)</p>',
+ webpage, u'description', flags=re.DOTALL)
+
+ info = {
+ 'id': video_id,
+ 'title': clip.find('title').text,
+ 'formats': formats,
+ 'description': description,
+ 'duration': int(clip.find('duration').text),
+ }
+ # TODO: Remove when #980 has been merged
+ info.update(formats[-1])
+ return info
diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py
new file mode 100644
index 000000000..52be9232f
--- /dev/null
+++ b/youtube_dl/extractor/mit.py
@@ -0,0 +1,74 @@
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ get_element_by_id,
+)
+
+
+class TechTVMITIE(InfoExtractor):
+ IE_NAME = u'techtv.mit.edu'
+ _VALID_URL = r'https?://techtv\.mit\.edu/(videos|embeds)/(?P<id>\d+)'
+
+ _TEST = {
+ u'url': u'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
+ u'file': u'25418.mp4',
+ u'md5': u'1f8cb3e170d41fd74add04d3c9330e5f',
+ u'info_dict': {
+ u'title': u'MIT DNA Learning Center Set',
+ u'description': u'md5:82313335e8a8a3f243351ba55bc1b474',
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ raw_page = self._download_webpage(
+ 'http://techtv.mit.edu/videos/%s' % video_id, video_id)
+ clean_page = re.compile(u'<!--.*?-->', re.S).sub(u'', raw_page)
+
+ base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)',
+ raw_page, u'base url')
+ formats_json = self._search_regex(r'bitrates: (\[.+?\])', raw_page,
+ u'video formats')
+ formats = json.loads(formats_json)
+ formats = sorted(formats, key=lambda f: f['bitrate'])
+
+ title = get_element_by_id('edit-title', clean_page)
+ description = clean_html(get_element_by_id('edit-description', clean_page))
+ thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'',
+ raw_page, u'thumbnail', flags=re.DOTALL)
+
+ return {'id': video_id,
+ 'title': title,
+ 'url': base_url + formats[-1]['url'].replace('mp4:', ''),
+ 'ext': 'mp4',
+ 'description': description,
+ 'thumbnail': thumbnail,
+ }
+
+
+class MITIE(TechTVMITIE):
+ IE_NAME = u'video.mit.edu'
+ _VALID_URL = r'https?://video\.mit\.edu/watch/(?P<title>[^/]+)'
+
+ _TEST = {
+ u'url': u'http://video.mit.edu/watch/the-government-is-profiling-you-13222/',
+ u'file': u'21783.mp4',
+ u'md5': u'7db01d5ccc1895fc5010e9c9e13648da',
+ u'info_dict': {
+ u'title': u'The Government is Profiling You',
+ u'description': u'md5:ad5795fe1e1623b73620dbfd47df9afd',
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ page_title = mobj.group('title')
+ webpage = self._download_webpage(url, page_title)
+ self.to_screen('%s: Extracting %s url' % (page_title, TechTVMITIE.IE_NAME))
+ embed_url = self._search_regex(r'<iframe .*?src="(.+?)"', webpage,
+ u'embed url')
+ return self.url_result(embed_url, ie='TechTVMIT')
diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py
new file mode 100644
index 000000000..9df236d69
--- /dev/null
+++ b/youtube_dl/extractor/naver.py
@@ -0,0 +1,73 @@
+# encoding: utf-8
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse,
+ ExtractorError,
+)
+
+
+class NaverIE(InfoExtractor):
+ _VALID_URL = r'https?://tvcast\.naver\.com/v/(?P<id>\d+)'
+
+ _TEST = {
+ u'url': u'http://tvcast.naver.com/v/81652',
+ u'file': u'81652.mp4',
+ u'info_dict': {
+ u'title': u'[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번',
+ u'description': u'합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.',
+ u'upload_date': u'20130903',
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group(1)
+ webpage = self._download_webpage(url, video_id)
+ m_id = re.search(r'var rmcPlayer = new nhn.rmcnmv.RMCVideoPlayer\("(.+?)", "(.+?)"',
+ webpage)
+ if m_id is None:
+ raise ExtractorError(u'couldn\'t extract vid and key')
+ vid = m_id.group(1)
+ key = m_id.group(2)
+ query = compat_urllib_parse.urlencode({'vid': vid, 'inKey': key,})
+ query_urls = compat_urllib_parse.urlencode({
+ 'masterVid': vid,
+ 'protocol': 'p2p',
+ 'inKey': key,
+ })
+ info_xml = self._download_webpage(
+ 'http://serviceapi.rmcnmv.naver.com/flash/videoInfo.nhn?' + query,
+ video_id, u'Downloading video info')
+ urls_xml = self._download_webpage(
+ 'http://serviceapi.rmcnmv.naver.com/flash/playableEncodingOption.nhn?' + query_urls,
+ video_id, u'Downloading video formats info')
+ info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
+ urls = xml.etree.ElementTree.fromstring(urls_xml.encode('utf-8'))
+
+ formats = []
+ for format_el in urls.findall('EncodingOptions/EncodingOption'):
+ domain = format_el.find('Domain').text
+ if domain.startswith('rtmp'):
+ continue
+ formats.append({
+ 'url': domain + format_el.find('uri').text,
+ 'ext': 'mp4',
+ 'width': int(format_el.find('width').text),
+ 'height': int(format_el.find('height').text),
+ })
+
+ info = {
+ 'id': video_id,
+ 'title': info.find('Subject').text,
+ 'formats': formats,
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'upload_date': info.find('WriteDate').text.replace('.', ''),
+ 'view_count': int(info.find('PlayCount').text),
+ }
+ # TODO: Remove when #980 has been merged
+ info.update(formats[-1])
+ return info
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
new file mode 100644
index 000000000..3bc9dae6d
--- /dev/null
+++ b/youtube_dl/extractor/nbc.py
@@ -0,0 +1,33 @@
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import find_xpath_attr, compat_str
+
+
+class NBCNewsIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.nbcnews\.com/video/.+?/(?P<id>\d+)'
+
+ _TEST = {
+ u'url': u'http://www.nbcnews.com/video/nbc-news/52753292',
+ u'file': u'52753292.flv',
+ u'md5': u'47abaac93c6eaf9ad37ee6c4463a5179',
+ u'info_dict': {
+ u'title': u'Crew emerges after four-month Mars food study',
+ u'description': u'md5:24e632ffac72b35f8b67a12d1b6ddfc1',
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ info_xml = self._download_webpage('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
+ info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')).find('video')
+
+ return {'id': video_id,
+ 'title': info.find('headline').text,
+ 'ext': 'flv',
+ 'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text,
+ 'description': compat_str(info.find('caption').text),
+ 'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text,
+ }
diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py
new file mode 100644
index 000000000..cfca2a063
--- /dev/null
+++ b/youtube_dl/extractor/orf.py
@@ -0,0 +1,54 @@
+# coding: utf-8
+
+import re
+import xml.etree.ElementTree
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urlparse,
+ ExtractorError,
+ find_xpath_attr,
+)
+
+class ORFIE(InfoExtractor):
+ _VALID_URL = r'https?://tvthek.orf.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ playlist_id = mobj.group('id')
+ webpage = self._download_webpage(url, playlist_id)
+
+ flash_xml = self._search_regex('ORF.flashXML = \'(.+?)\'', webpage, u'flash xml')
+ flash_xml = compat_urlparse.parse_qs('xml='+flash_xml)['xml'][0]
+ flash_config = xml.etree.ElementTree.fromstring(flash_xml.encode('utf-8'))
+ playlist_json = self._search_regex(r'playlist\': \'(\[.*?\])\'', webpage, u'playlist').replace(r'\"','"')
+ playlist = json.loads(playlist_json)
+
+ videos = []
+ ns = '{http://tempuri.org/XMLSchema.xsd}'
+ xpath = '%(ns)sPlaylist/%(ns)sItems/%(ns)sItem' % {'ns': ns}
+ webpage_description = self._og_search_description(webpage)
+ for (i, (item, info)) in enumerate(zip(flash_config.findall(xpath), playlist), 1):
+ # Get best quality url
+ rtmp_url = None
+ for q in ['Q6A', 'Q4A', 'Q1A']:
+ video_url = find_xpath_attr(item, '%sVideoUrl' % ns, 'quality', q)
+ if video_url is not None:
+ rtmp_url = video_url.text
+ break
+ if rtmp_url is None:
+ raise ExtractorError(u'Couldn\'t get video url: %s' % info['id'])
+ description = self._html_search_regex(
+ r'id="playlist_entry_%s".*?<p>(.*?)</p>' % i, webpage,
+ u'description', default=webpage_description, flags=re.DOTALL)
+ videos.append({
+ '_type': 'video',
+ 'id': info['id'],
+ 'title': info['title'],
+ 'url': rtmp_url,
+ 'ext': 'flv',
+ 'description': description,
+ })
+
+ return videos
diff --git a/youtube_dl/extractor/ro220.py b/youtube_dl/extractor/ro220.py
new file mode 100644
index 000000000..c32f64d99
--- /dev/null
+++ b/youtube_dl/extractor/ro220.py
@@ -0,0 +1,42 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ compat_parse_qs,
+)
+
+
+class Ro220IE(InfoExtractor):
+ IE_NAME = '220.ro'
+ _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?220\.ro/(?P<category>[^/]+)/(?P<shorttitle>[^/]+)/(?P<video_id>[^/]+)'
+ _TEST = {
+ u"url": u"http://www.220.ro/sport/Luati-Le-Banii-Sez-4-Ep-1/LYV6doKo7f/",
+ u'file': u'LYV6doKo7f.mp4',
+ u'md5': u'03af18b73a07b4088753930db7a34add',
+ u'info_dict': {
+ u"title": u"Luati-le Banii sez 4 ep 1",
+ u"description": u"Iata-ne reveniti dupa o binemeritata vacanta. Va astept si pe Facebook cu pareri si comentarii.",
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('video_id')
+
+ webpage = self._download_webpage(url, video_id)
+ flashVars_str = self._search_regex(
+ r'<param name="flashVars" value="([^"]+)"',
+ webpage, u'flashVars')
+ flashVars = compat_parse_qs(flashVars_str)
+
+ info = {
+ '_type': 'video',
+ 'id': video_id,
+ 'ext': 'mp4',
+ 'url': flashVars['videoURL'][0],
+ 'title': flashVars['title'][0],
+ 'description': clean_html(flashVars['desc'][0]),
+ 'thumbnail': flashVars['preview'][0],
+ }
+ return info
diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py
index 2f134e6a7..7bb236c2b 100644
--- a/youtube_dl/extractor/rtlnow.py
+++ b/youtube_dl/extractor/rtlnow.py
@@ -8,8 +8,8 @@ from ..utils import (
)
class RTLnowIE(InfoExtractor):
- """Information Extractor for RTLnow, RTL2now and VOXnow"""
- _VALID_URL = r'(?:http://)?(?P<url>(?P<base_url>rtl(?:(?P<is_rtl2>2)|-)now\.rtl(?(is_rtl2)2|)\.de/|(?:www\.)?voxnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)'
+ """Information Extractor for RTL NOW, RTL2 NOW, SUPER RTL NOW and VOX NOW"""
+ _VALID_URL = r'(?:http://)?(?P<url>(?P<base_url>rtl-now\.rtl\.de/|rtl2now\.rtl2\.de/|(?:www\.)?voxnow\.de/|(?:www\.)?superrtlnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)'
_TESTS = [{
u'url': u'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
u'file': u'90419.flv',
@@ -48,6 +48,19 @@ class RTLnowIE(InfoExtractor):
u'params': {
u'skip_download': True,
},
+ },
+ {
+ u'url': u'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1',
+ u'file': u'99205.flv',
+ u'info_dict': {
+ u'upload_date': u'20080928',
+ u'title': u'Medicopter 117 - Angst!',
+ u'description': u'Angst!',
+ u'thumbnail': u'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg'
+ },
+ u'params': {
+ u'skip_download': True,
+ },
}]
def _real_extract(self,url):
diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py
new file mode 100644
index 000000000..77bb0a8dc
--- /dev/null
+++ b/youtube_dl/extractor/sohu.py
@@ -0,0 +1,90 @@
+# encoding: utf-8
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class SohuIE(InfoExtractor):
+ _VALID_URL = r'https?://tv\.sohu\.com/\d+?/n(?P<id>\d+)\.shtml.*?'
+
+ _TEST = {
+ u'url': u'http://tv.sohu.com/20130724/n382479172.shtml#super',
+ u'file': u'382479172.mp4',
+ u'md5': u'bde8d9a6ffd82c63a1eefaef4eeefec7',
+ u'info_dict': {
+ u'title': u'MV:Far East Movement《The Illest》',
+ },
+ }
+
+ def _real_extract(self, url):
+
+ def _fetch_data(vid_id):
+ base_data_url = u'http://hot.vrs.sohu.com/vrs_flash.action?vid='
+ data_url = base_data_url + str(vid_id)
+ data_json = self._download_webpage(
+ data_url, video_id,
+ note=u'Downloading JSON data for ' + str(vid_id))
+ return json.loads(data_json)
+
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ raw_title = self._html_search_regex(r'(?s)<title>(.+?)</title>',
+ webpage, u'video title')
+ title = raw_title.partition('-')[0].strip()
+
+ vid = self._html_search_regex(r'var vid="(\d+)"', webpage,
+ u'video path')
+ data = _fetch_data(vid)
+
+ QUALITIES = ('ori', 'super', 'high', 'nor')
+ vid_ids = [data['data'][q + 'Vid']
+ for q in QUALITIES
+ if data['data'][q + 'Vid'] != 0]
+ if not vid_ids:
+ raise ExtractorError(u'No formats available for this video')
+
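+ # For now, we just pick the last available quality in QUALITIES order (NOTE(review): 'ori' is listed first and 'nor' last - confirm this is really the highest)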
+ vid_id = vid_ids[-1]
+
+ format_data = data if vid == vid_id else _fetch_data(vid_id)
+ part_count = format_data['data']['totalBlocks']
+ allot = format_data['allot']
+ prot = format_data['prot']
+ clipsURL = format_data['data']['clipsURL']
+ su = format_data['data']['su']
+
+ playlist = []
+ for i in range(part_count):
+ part_url = ('http://%s/?prot=%s&file=%s&new=%s' %
+ (allot, prot, clipsURL[i], su[i]))
+ part_str = self._download_webpage(
+ part_url, video_id,
+ note=u'Downloading part %d of %d' % (i+1, part_count))
+
+ part_info = part_str.split('|')
+ video_url = '%s%s?key=%s' % (part_info[0], su[i], part_info[3])
+
+ video_info = {
+ 'id': '%s_part%02d' % (video_id, i + 1),
+ 'title': title,
+ 'url': video_url,
+ 'ext': 'mp4',
+ }
+ playlist.append(video_info)
+
+ if len(playlist) == 1:
+ info = playlist[0]
+ info['id'] = video_id
+ else:
+ info = {
+ '_type': 'playlist',
+ 'entries': playlist,
+ 'id': video_id,
+ }
+
+ return info
diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py
new file mode 100644
index 000000000..f278951ba
--- /dev/null
+++ b/youtube_dl/extractor/trilulilu.py
@@ -0,0 +1,73 @@
+import json
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+
+
+class TriluliluIE(InfoExtractor):
+ _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?trilulilu\.ro/video-(?P<category>[^/]+)/(?P<video_id>[^/]+)'
+ _TEST = {
+ u"url": u"http://www.trilulilu.ro/video-animatie/big-buck-bunny-1",
+ u'file': u"big-buck-bunny-1.mp4",
+ u'info_dict': {
+ u"title": u"Big Buck Bunny",
+ u"description": u":) pentru copilul din noi",
+ },
+ # Server ignores Range headers (--test)
+ u"params": {
+ u"skip_download": True
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('video_id')
+
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._og_search_title(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+ description = self._og_search_description(webpage)
+
+ log_str = self._search_regex(
+ r'block_flash_vars[ ]=[ ]({[^}]+})', webpage, u'log info')
+ log = json.loads(log_str)
+
+ format_url = (u'http://fs%(server)s.trilulilu.ro/%(hash)s/'
+ u'video-formats2' % log)
+ format_str = self._download_webpage(
+ format_url, video_id,
+ note=u'Downloading formats',
+ errnote=u'Error while downloading formats')
+
+ format_doc = xml.etree.ElementTree.fromstring(format_str)
+
+ video_url_template = (
+ u'http://fs%(server)s.trilulilu.ro/stream.php?type=video'
+ u'&source=site&hash=%(hash)s&username=%(userid)s&'
+ u'key=ministhebest&format=%%s&sig=&exp=' %
+ log)
+ formats = [
+ {
+ 'format': fnode.text,
+ 'url': video_url_template % fnode.text,
+ }
+
+ for fnode in format_doc.findall('./formats/format')
+ ]
+
+ info = {
+ '_type': 'video',
+ 'id': video_id,
+ 'formats': formats,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ }
+
+ # TODO: Remove when #980 has been merged
+ info['url'] = formats[-1]['url']
+ info['ext'] = formats[-1]['format'].partition('-')[0]
+
+ return info
diff --git a/youtube_dl/extractor/unistra.py b/youtube_dl/extractor/unistra.py
index 5ba0a9061..516e18914 100644
--- a/youtube_dl/extractor/unistra.py
+++ b/youtube_dl/extractor/unistra.py
@@ -11,7 +11,7 @@ class UnistraIE(InfoExtractor):
u'md5': u'736f605cfdc96724d55bb543ab3ced24',
u'info_dict': {
u'title': u'M!ss Yella',
- u'description': u'md5:75e8439a3e2981cd5d4b6db232e8fdfc',
+ u'description': u'md5:104892c71bd48e55d70b902736b81bbf',
},
}
diff --git a/youtube_dl/extractor/veehd.py b/youtube_dl/extractor/veehd.py
new file mode 100644
index 000000000..3a99a29c6
--- /dev/null
+++ b/youtube_dl/extractor/veehd.py
@@ -0,0 +1,56 @@
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urlparse,
+ get_element_by_id,
+ clean_html,
+)
+
+class VeeHDIE(InfoExtractor):
+ _VALID_URL = r'https?://veehd.com/video/(?P<id>\d+)'
+
+ _TEST = {
+ u'url': u'http://veehd.com/video/4686958',
+ u'file': u'4686958.mp4',
+ u'info_dict': {
+ u'title': u'Time Lapse View from Space ( ISS)',
+ u'uploader_id': u'spotted',
+ u'description': u'md5:f0094c4cf3a72e22bc4e4239ef767ad7',
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ player_path = self._search_regex(r'\$\("#playeriframe"\).attr\({src : "(.+?)"',
+ webpage, u'player path')
+ player_url = compat_urlparse.urljoin(url, player_path)
+ player_page = self._download_webpage(player_url, video_id,
+ u'Downloading player page')
+ config_json = self._search_regex(r'value=\'config=({.+?})\'',
+ player_page, u'config json')
+ config = json.loads(config_json)
+
+ video_url = compat_urlparse.unquote(config['clip']['url'])
+ title = clean_html(get_element_by_id('videoName', webpage).rpartition('|')[0])
+ uploader_id = self._html_search_regex(r'<a href="/profile/\d+">(.+?)</a>',
+ webpage, u'uploader')
+ thumbnail = self._search_regex(r'<img id="veehdpreview" src="(.+?)"',
+ webpage, u'thumbnail')
+ description = self._html_search_regex(r'<td class="infodropdown".*?<div>(.*?)<ul',
+ webpage, u'description', flags=re.DOTALL)
+
+ return {
+ '_type': 'video',
+ 'id': video_id,
+ 'title': title,
+ 'url': video_url,
+ 'ext': 'mp4',
+ 'uploader_id': uploader_id,
+ 'thumbnail': thumbnail,
+ 'description': description,
+ }
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index 512e06e2a..4a7d82b7a 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -44,6 +44,16 @@ class VimeoIE(InfoExtractor):
u'title': u'Andy Allan - Putting the Carto into OpenStreetMap Cartography',
},
},
+ {
+ u'url': u'http://player.vimeo.com/video/54469442',
+ u'file': u'54469442.mp4',
+ u'md5': u'619b811a4417aa4abe78dc653becf511',
+ u'note': u'Videos that embed the url in the player page',
+ u'info_dict': {
+ u'title': u'Kathy Sierra: Building the minimum Badass User, Business of Software',
+ u'uploader': u'The BLN & Business of Software',
+ },
+ },
]
def _login(self):
@@ -112,7 +122,8 @@ class VimeoIE(InfoExtractor):
# Extract the config JSON
try:
- config = webpage.split(' = {config:')[1].split(',assets:')[0]
+ config = self._search_regex([r' = {config:({.+?}),assets:', r'c=({.+?);'],
+ webpage, u'info section', flags=re.DOTALL)
config = json.loads(config)
except:
if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
@@ -132,12 +143,22 @@ class VimeoIE(InfoExtractor):
video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
# Extract video thumbnail
- video_thumbnail = config["video"]["thumbnail"]
+ video_thumbnail = config["video"].get("thumbnail")
+ if video_thumbnail is None:
+ _, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in config["video"]["thumbs"].items())[-1]
# Extract video description
- video_description = get_element_by_attribute("itemprop", "description", webpage)
- if video_description: video_description = clean_html(video_description)
- else: video_description = u''
+ video_description = None
+ try:
+ video_description = get_element_by_attribute("itemprop", "description", webpage)
+ if video_description: video_description = clean_html(video_description)
+ except AssertionError as err:
+ # On some pages (e.g. http://player.vimeo.com/video/54469442) the
+ # HTML tags are not closed, which Python 2.6 cannot handle
+ if err.args[0] == 'we should not get here!':
+ pass
+ else:
+ raise
# Extract upload date
video_upload_date = None
@@ -154,14 +175,15 @@ class VimeoIE(InfoExtractor):
# TODO bind to format param
codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
files = { 'hd': [], 'sd': [], 'other': []}
+ config_files = config["video"].get("files") or config["request"].get("files")
for codec_name, codec_extension in codecs:
- if codec_name in config["video"]["files"]:
- if 'hd' in config["video"]["files"][codec_name]:
+ if codec_name in config_files:
+ if 'hd' in config_files[codec_name]:
files['hd'].append((codec_name, codec_extension, 'hd'))
- elif 'sd' in config["video"]["files"][codec_name]:
+ elif 'sd' in config_files[codec_name]:
files['sd'].append((codec_name, codec_extension, 'sd'))
else:
- files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
+ files['other'].append((codec_name, codec_extension, config_files[codec_name][0]))
for quality in ('hd', 'sd', 'other'):
if len(files[quality]) > 0:
@@ -173,8 +195,12 @@ class VimeoIE(InfoExtractor):
else:
raise ExtractorError(u'No known codec found')
- video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
- %(video_id, sig, timestamp, video_quality, video_codec.upper())
+ video_url = None
+ if isinstance(config_files[video_codec], dict):
+ video_url = config_files[video_codec][video_quality].get("url")
+ if video_url is None:
+ video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
+ %(video_id, sig, timestamp, video_quality, video_codec.upper())
return [{
'id': video_id,
diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py
index 7d228edac..29c25f0e3 100644
--- a/youtube_dl/extractor/wat.py
+++ b/youtube_dl/extractor/wat.py
@@ -6,7 +6,6 @@ import re
from .common import InfoExtractor
from ..utils import (
- compat_urllib_parse,
unified_strdate,
)
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py
index 0f1feeffd..88b8b6be0 100644
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -3,7 +3,8 @@ import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
-
+ unescapeHTML,
+ determine_ext,
ExtractorError,
)
@@ -36,15 +37,16 @@ class XHamsterIE(InfoExtractor):
video_url = compat_urllib_parse.unquote(mobj.group('file'))
else:
video_url = mobj.group('server')+'/key='+mobj.group('file')
- video_extension = video_url.split('.')[-1]
video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
webpage, u'title')
- # Can't see the description anywhere in the UI
- # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
- # webpage, u'description', fatal=False)
- # if video_description: video_description = unescapeHTML(video_description)
+ # Only a few videos have a description
+ mobj = re.search('<span>Description: </span>(?P<description>[^<]+)', webpage)
+ if mobj:
+ video_description = unescapeHTML(mobj.group('description'))
+ else:
+ video_description = None
mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
if mobj:
@@ -62,9 +64,9 @@ class XHamsterIE(InfoExtractor):
return [{
'id': video_id,
'url': video_url,
- 'ext': video_extension,
+ 'ext': determine_ext(video_url),
'title': video_title,
- # 'description': video_description,
+ 'description': video_description,
'upload_date': video_upload_date,
'uploader_id': video_uploader_id,
'thumbnail': video_thumbnail
diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py
index d1156bf42..c85fd4b5a 100644
--- a/youtube_dl/extractor/youporn.py
+++ b/youtube_dl/extractor/youporn.py
@@ -12,14 +12,16 @@ from ..utils import (
unescapeHTML,
unified_strdate,
)
-
+from ..aes import (
+ aes_decrypt_text
+)
class YouPornIE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
_TEST = {
u'url': u'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
u'file': u'505835.mp4',
- u'md5': u'c37ddbaaa39058c76a7e86c6813423c1',
+ u'md5': u'71ec5fcfddacf80f495efa8b6a8d9a89',
u'info_dict': {
u"upload_date": u"20101221",
u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?",
@@ -75,7 +77,15 @@ class YouPornIE(InfoExtractor):
# Get all of the links from the page
LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
links = re.findall(LINK_RE, download_list_html)
- if(len(links) == 0):
+
+ # Get link of hd video if available
+ mobj = re.search(r'var encryptedQuality720URL = \'(?P<encrypted_video_url>[a-zA-Z0-9+/]+={0,2})\';', webpage)
+ if mobj != None:
+ encrypted_video_url = mobj.group(u'encrypted_video_url')
+ video_url = aes_decrypt_text(encrypted_video_url, video_title, 32).decode('utf-8')
+ links = [video_url] + links
+
+ if not links:
raise ExtractorError(u'ERROR: no known formats available for video')
self.to_screen(u'Links found: %d' % len(links))
@@ -112,7 +122,7 @@ class YouPornIE(InfoExtractor):
self._print_formats(formats)
return
- req_format = self._downloader.params.get('format', None)
+ req_format = self._downloader.params.get('format', 'best')
self.to_screen(u'Format: %s' % req_format)
if req_format is None or req_format == 'best':
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index b3400df0a..11611f10d 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -194,7 +194,7 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor):
_VALID_URL = r"""^
(
(?:https?://)? # http(s):// (optional)
- (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
+ (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID:
@@ -205,15 +205,18 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor):
(?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
v=
)
- )? # optional -> youtube.com/xxxx is OK
+ ))
+ |youtu\.be/ # just youtu.be/xxxx
+ )
)? # all until now is optional -> you can pass the naked ID
([0-9A-Za-z_-]+) # here is it! the YouTube video ID
(?(1).+)? # if we found the ID, everything can follow
$"""
_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
# Listed in order of quality
- _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13',
- '95', '94', '93', '92', '132', '151',
+ _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
+ # Apple HTTP Live Streaming
+ '96', '95', '94', '93', '92', '132', '151',
# 3D
'85', '84', '102', '83', '101', '82', '100',
# Dash video
@@ -222,8 +225,10 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor):
# Dash audio
'141', '172', '140', '171', '139',
]
- _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13',
- '95', '94', '93', '92', '132', '151',
+ _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
+ # Apple HTTP Live Streaming
+ '96', '95', '94', '93', '92', '132', '151',
+ # 3D
'85', '102', '84', '101', '83', '100', '82',
# Dash video
'138', '248', '137', '247', '136', '246', '245',
@@ -231,11 +236,18 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor):
# Dash audio
'172', '141', '171', '140', '139',
]
+ _video_formats_map = {
+ 'flv': ['35', '34', '6', '5'],
+ '3gp': ['36', '17', '13'],
+ 'mp4': ['38', '37', '22', '18'],
+ 'webm': ['46', '45', '44', '43'],
+ }
_video_extensions = {
'13': '3gp',
- '17': 'mp4',
+ '17': '3gp',
'18': 'mp4',
'22': 'mp4',
+ '36': '3gp',
'37': 'mp4',
'38': 'mp4',
'43': 'webm',
@@ -252,7 +264,7 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor):
'101': 'webm',
'102': 'webm',
- # videos that use m3u8
+ # Apple HTTP Live Streaming
'92': 'mp4',
'93': 'mp4',
'94': 'mp4',
@@ -293,6 +305,7 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor):
'22': '720x1280',
'34': '360x640',
'35': '480x854',
+ '36': '240x320',
'37': '1080x1920',
'38': '3072x4096',
'43': '360x640',
@@ -394,7 +407,7 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor):
u"info_dict": {
u"upload_date": u"20120506",
u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
- u"description": u"md5:b085c9804f5ab69f4adea963a2dceb3c",
+ u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
u"uploader": u"Icona Pop",
u"uploader_id": u"IconaPop"
}
@@ -432,7 +445,7 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor):
@classmethod
def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE."""
- if YoutubePlaylistIE.suitable(url) or YoutubeSubscriptionsIE.suitable(url): return False
+ if YoutubePlaylistIE.suitable(url): return False
return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def report_video_webpage_download(self, video_id):
@@ -465,15 +478,15 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor):
elif len(s) == 89:
return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
elif len(s) == 88:
- return s[48] + s[81:67:-1] + s[82] + s[66:62:-1] + s[85] + s[61:48:-1] + s[67] + s[47:12:-1] + s[3] + s[11:3:-1] + s[2] + s[12]
+ return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
elif len(s) == 87:
return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
elif len(s) == 86:
- return s[5:20] + s[2] + s[21:]
+ return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
elif len(s) == 85:
return s[83:34:-1] + s[0] + s[33:27:-1] + s[3] + s[26:19:-1] + s[34] + s[18:3:-1] + s[27]
elif len(s) == 84:
- return s[83:27:-1] + s[0] + s[26:5:-1] + s[2:0:-1] + s[27]
+ return s[81:36:-1] + s[0] + s[35:2:-1]
elif len(s) == 83:
return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
elif len(s) == 82:
@@ -537,13 +550,25 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor):
video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
else:
# Specific formats. We pick the first in a slash-delimeted sequence.
- # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
+ # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
+ # available in the specified format. For example,
+ # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
+ # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
+ # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
req_formats = req_format.split('/')
video_url_list = None
for rf in req_formats:
if rf in url_map:
video_url_list = [(rf, url_map[rf])]
break
+ if rf in self._video_formats_map:
+ for srf in self._video_formats_map[rf]:
+ if srf in url_map:
+ video_url_list = [(srf, url_map[srf])]
+ break
+ else:
+ continue
+ break
if video_url_list is None:
raise ExtractorError(u'requested format not available')
return video_url_list
@@ -558,7 +583,7 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor):
manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
formats_urls = _get_urls(manifest)
for format_url in formats_urls:
- itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
+ itag = self._search_regex(r'itag%3D(\d+?)/', format_url, 'itag')
url_map[itag] = format_url
return url_map
@@ -860,8 +885,11 @@ class YoutubePlaylistIE(InfoExtractor):
for entry in response['feed']['entry']:
index = entry['yt$position']['$t']
- if 'media$group' in entry and 'media$player' in entry['media$group']:
- videos.append((index, entry['media$group']['media$player']['url']))
+ if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
+ videos.append((
+ index,
+ 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
+ ))
videos = [v[1] for v in sorted(videos)]
@@ -927,13 +955,20 @@ class YoutubeChannelIE(InfoExtractor):
class YoutubeUserIE(InfoExtractor):
IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
- _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
+ _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
_GDATA_PAGE_SIZE = 50
- _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
- _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
+ _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
IE_NAME = u'youtube:user'
+ @classmethod
+ def suitable(cls, url):
+ # Don't return True if the url can be extracted with another youtube
+ # extractor; the regex is too permissive and would match it anyway.
+ other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
+ if any(ie.suitable(url) for ie in other_ies): return False
+ else: return super(YoutubeUserIE, cls).suitable(url)
+
def _real_extract(self, url):
# Extract username
mobj = re.match(self._VALID_URL, url)
@@ -956,13 +991,15 @@ class YoutubeUserIE(InfoExtractor):
page = self._download_webpage(gdata_url, username,
u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
+ try:
+ response = json.loads(page)
+ except ValueError as err:
+ raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
+
# Extract video identifiers
ids_in_page = []
-
- for mobj in re.finditer(self._VIDEO_INDICATOR, page):
- if mobj.group(1) not in ids_in_page:
- ids_in_page.append(mobj.group(1))
-
+ for entry in response['feed']['entry']:
+ ids_in_page.append(entry['id']['$t'].split('/')[-1])
video_ids.extend(ids_in_page)
# A little optimization - if current page is not
@@ -1101,7 +1138,7 @@ class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
IE_NAME = u'youtube:favorites'
IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
- _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:o?rites)?'
+ _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
_LOGIN_REQUIRED = True
def _real_extract(self, url):