about | summary | refs | log | tree | commit | diff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- youtube_dl/extractor/breakcom.py 7
-rw-r--r-- youtube_dl/extractor/dailymotion.py 1
-rw-r--r-- youtube_dl/extractor/keezmovies.py 26
-rw-r--r-- youtube_dl/extractor/morningstar.py 8
-rw-r--r-- youtube_dl/extractor/motorsport.py 2
-rw-r--r-- youtube_dl/extractor/pornhd.py 77
-rw-r--r-- youtube_dl/extractor/ro220.py 2
-rw-r--r-- youtube_dl/extractor/rts.py 13
-rw-r--r-- youtube_dl/extractor/teamcoco.py 27
-rw-r--r-- youtube_dl/extractor/yahoo.py 38
10 files changed, 123 insertions, 78 deletions
diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py
index 85635d1cc..1bfc9f35b 100644
--- a/youtube_dl/extractor/breakcom.py
+++ b/youtube_dl/extractor/breakcom.py
@@ -27,9 +27,10 @@ class BreakIE(InfoExtractor):
webpage, 'info json', flags=re.DOTALL)
info = json.loads(info_json)
video_url = info['videoUri']
- m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', video_url)
- if m_youtube is not None:
- return self.url_result(m_youtube.group(1), 'Youtube')
+ youtube_id = info.get('youtubeId')
+ if youtube_id:
+ return self.url_result(youtube_id, 'Youtube')
+
final_url = video_url + '?' + info['AuthToken']
return {
'id': video_id,
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 7ed7f2723..5504d93eb 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -8,7 +8,6 @@ from .subtitles import SubtitlesInfoExtractor
from ..utils import (
compat_urllib_request,
compat_str,
- get_element_by_attribute,
get_element_by_id,
orderedSet,
str_to_int,
diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py
index 29658a7d6..75b63cffb 100644
--- a/youtube_dl/extractor/keezmovies.py
+++ b/youtube_dl/extractor/keezmovies.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import os
import re
@@ -11,22 +13,22 @@ from ..aes import (
aes_decrypt_text
)
+
class KeezMoviesIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>keezmovies\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)'
+ _VALID_URL = r'^https?://(?:www\.)?keezmovies\.com/video/.+?(?P<videoid>[0-9]+)(?:[/?&]|$)'
_TEST = {
- u'url': u'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711',
- u'file': u'1214711.mp4',
- u'md5': u'6e297b7e789329923fcf83abb67c9289',
- u'info_dict': {
- u"title": u"Petite Asian Lady Mai Playing In Bathtub",
- u"age_limit": 18,
+ 'url': 'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711',
+ 'file': '1214711.mp4',
+ 'md5': '6e297b7e789329923fcf83abb67c9289',
+ 'info_dict': {
+ 'title': 'Petite Asian Lady Mai Playing In Bathtub',
+ 'age_limit': 18,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
- url = 'http://www.' + mobj.group('url')
req = compat_urllib_request.Request(url)
req.add_header('Cookie', 'age_verified=1')
@@ -38,10 +40,10 @@ class KeezMoviesIE(InfoExtractor):
embedded_url = mobj.group(1)
return self.url_result(embedded_url)
- video_title = self._html_search_regex(r'<h1 [^>]*>([^<]+)', webpage, u'title')
- video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&amp;', webpage, u'video_url'))
- if webpage.find('encrypted=true')!=-1:
- password = self._html_search_regex(r'video_title=(.+?)&amp;', webpage, u'password')
+ video_title = self._html_search_regex(r'<h1 [^>]*>([^<]+)', webpage, 'title')
+ video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&amp;', webpage, 'video_url'))
+ if 'encrypted=true' in webpage:
+ password = self._html_search_regex(r'video_title=(.+?)&amp;', webpage, 'password')
video_url = aes_decrypt_text(video_url, password, 32).decode('utf-8')
path = compat_urllib_parse_urlparse(video_url).path
extension = os.path.splitext(path)[1][1:]
diff --git a/youtube_dl/extractor/morningstar.py b/youtube_dl/extractor/morningstar.py
index 3d3225699..4f7a5d2e4 100644
--- a/youtube_dl/extractor/morningstar.py
+++ b/youtube_dl/extractor/morningstar.py
@@ -1,17 +1,9 @@
# coding: utf-8
from __future__ import unicode_literals
-import hashlib
-import json
import re
-import time
from .common import InfoExtractor
-from ..utils import (
- compat_parse_qs,
- compat_str,
- int_or_none,
-)
class MorningstarIE(InfoExtractor):
diff --git a/youtube_dl/extractor/motorsport.py b/youtube_dl/extractor/motorsport.py
index dc727be10..7c0ec6a12 100644
--- a/youtube_dl/extractor/motorsport.py
+++ b/youtube_dl/extractor/motorsport.py
@@ -44,7 +44,7 @@ class MotorsportIE(InfoExtractor):
e = compat_str(int(time.time()) + 24 * 60 * 60)
base_video_url = params['location'] + '?e=' + e
s = 'h3hg713fh32'
- h = hashlib.md5(s + base_video_url).hexdigest()
+ h = hashlib.md5((s + base_video_url).encode('utf-8')).hexdigest()
video_url = base_video_url + '&h=' + h
uploader = self._html_search_regex(
diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py
index 58f9c690e..718fe9aba 100644
--- a/youtube_dl/extractor/pornhd.py
+++ b/youtube_dl/extractor/pornhd.py
@@ -1,44 +1,81 @@
from __future__ import unicode_literals
import re
+import json
from .common import InfoExtractor
-from ..utils import compat_urllib_parse
+from ..utils import int_or_none
class PornHdIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)'
+ _VALID_URL = r'http://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)'
_TEST = {
'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
- 'file': '1962.flv',
- 'md5': '35272469887dca97abd30abecc6cdf75',
+ 'md5': '956b8ca569f7f4d8ec563e2c41598441',
'info_dict': {
- "title": "sierra-day-gets-his-cum-all-over-herself-hd-porn-video",
- "age_limit": 18,
+ 'id': '1962',
+ 'ext': 'mp4',
+ 'title': 'Sierra loves doing laundry',
+ 'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294',
+ 'age_limit': 18,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
-
- video_id = mobj.group('video_id')
- video_title = mobj.group('video_title')
+ video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- next_url = self._html_search_regex(
- r'&hd=(http.+?)&', webpage, 'video URL')
- next_url = compat_urllib_parse.unquote(next_url)
+ title = self._og_search_title(webpage)
+ TITLE_SUFFIX = ' porn HD Video | PornHD.com '
+ if title.endswith(TITLE_SUFFIX):
+ title = title[:-len(TITLE_SUFFIX)]
+
+ description = self._html_search_regex(
+ r'<div class="description">([^<]+)</div>', webpage, 'description', fatal=False)
+ view_count = int_or_none(self._html_search_regex(
+ r'(\d+) views </span>', webpage, 'view count', fatal=False))
+
+ formats = [
+ {
+ 'url': format_url,
+ 'ext': format.lower(),
+ 'format_id': '%s-%s' % (format.lower(), quality.lower()),
+ 'quality': 1 if quality.lower() == 'high' else 0,
+ } for format, quality, format_url in re.findall(
+ r'var __video([\da-zA-Z]+?)(Low|High)StreamUrl = \'(http://.+?)\?noProxy=1\'', webpage)
+ ]
+
+ mobj = re.search(r'flashVars = (?P<flashvars>{.+?});', webpage)
+ if mobj:
+ flashvars = json.loads(mobj.group('flashvars'))
+ formats.extend([
+ {
+ 'url': flashvars['hashlink'].replace('?noProxy=1', ''),
+ 'ext': 'flv',
+ 'format_id': 'flv-low',
+ 'quality': 0,
+ },
+ {
+ 'url': flashvars['hd'].replace('?noProxy=1', ''),
+ 'ext': 'flv',
+ 'format_id': 'flv-high',
+ 'quality': 1,
+ }
+ ])
+ thumbnail = flashvars['urlWallpaper']
+ else:
+ thumbnail = self._og_search_thumbnail(webpage)
- video_url = self._download_webpage(
- next_url, video_id, note='Retrieving video URL',
- errnote='Could not retrieve video URL')
- age_limit = 18
+ self._sort_formats(formats)
return {
'id': video_id,
- 'url': video_url,
- 'ext': 'flv',
- 'title': video_title,
- 'age_limit': age_limit,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'view_count': view_count,
+ 'formats': formats,
+ 'age_limit': 18,
}
diff --git a/youtube_dl/extractor/ro220.py b/youtube_dl/extractor/ro220.py
index 4678f62df..a6ad59465 100644
--- a/youtube_dl/extractor/ro220.py
+++ b/youtube_dl/extractor/ro220.py
@@ -18,7 +18,7 @@ class Ro220IE(InfoExtractor):
'md5': '03af18b73a07b4088753930db7a34add',
'info_dict': {
"title": "Luati-le Banii sez 4 ep 1",
- "description": "Iata-ne reveniti dupa o binemeritata vacanta. Va astept si pe Facebook cu pareri si comentarii.",
+ "description": "re:^Iata-ne reveniti dupa o binemeritata vacanta\. +Va astept si pe Facebook cu pareri si comentarii.$",
}
}
diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py
index 40224d761..0f85b2320 100644
--- a/youtube_dl/extractor/rts.py
+++ b/youtube_dl/extractor/rts.py
@@ -37,7 +37,7 @@ class RTSIE(InfoExtractor):
'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html',
'md5': 'c197f0b2421995c63a64cc73d800f42e',
'info_dict': {
- 'id': '5738317',
+ 'id': '5624067',
'ext': 'mp4',
'duration': 55,
'title': 'Bande de lancement de Passe-moi les jumelles',
@@ -98,17 +98,20 @@ class RTSIE(InfoExtractor):
m = re.match(self._VALID_URL, url)
video_id = m.group('id')
- def download_json(video_id):
+ def download_json(internal_id):
return self._download_json(
- 'http://www.rts.ch/a/%s.html?f=json/article' % video_id, video_id)
+ 'http://www.rts.ch/a/%s.html?f=json/article' % internal_id,
+ video_id)
all_info = download_json(video_id)
# video_id extracted out of URL is not always a real id
if 'video' not in all_info and 'audio' not in all_info:
page = self._download_webpage(url, video_id)
- video_id = self._html_search_regex(r'<(?:video|audio) data-id="(\d+)"', page, 'video id')
- all_info = download_json(video_id)
+ internal_id = self._html_search_regex(
+ r'<(?:video|audio) data-id="([0-9]+)"', page,
+ 'internal video id')
+ all_info = download_json(internal_id)
info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio']
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py
index 9dcffead0..c86f1e423 100644
--- a/youtube_dl/extractor/teamcoco.py
+++ b/youtube_dl/extractor/teamcoco.py
@@ -9,8 +9,18 @@ from ..utils import (
class TeamcocoIE(InfoExtractor):
- _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
- _TEST = {
+ _VALID_URL = r'http://teamcoco\.com/video/(?P<video_id>[0-9]+)?/?(?P<url_title>.*)'
+ _TESTS = [
+ {
+ 'url': 'http://teamcoco.com/video/80187/conan-becomes-a-mary-kay-beauty-consultant',
+ 'file': '80187.mp4',
+ 'md5': '3f7746aa0dc86de18df7539903d399ea',
+ 'info_dict': {
+ 'title': 'Conan Becomes A Mary Kay Beauty Consultant',
+ 'description': 'Mary Kay is perhaps the most trusted name in female beauty, so of course Conan is a natural choice to sell their products.'
+ }
+ },
+ {
'url': 'http://teamcoco.com/video/louis-ck-interview-george-w-bush',
'file': '19705.mp4',
'md5': 'cde9ba0fa3506f5f017ce11ead928f9a',
@@ -19,6 +29,7 @@ class TeamcocoIE(InfoExtractor):
"title": "Louis C.K. Interview Pt. 1 11/3/11"
}
}
+ ]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -26,11 +37,13 @@ class TeamcocoIE(InfoExtractor):
raise ExtractorError('Invalid URL: %s' % url)
url_title = mobj.group('url_title')
webpage = self._download_webpage(url, url_title)
-
- video_id = self._html_search_regex(
- r'<article class="video" data-id="(\d+?)"',
- webpage, 'video id')
-
+
+ video_id = mobj.group("video_id")
+ if not video_id:  # optional group yields None (never '') for title-only URLs; fall back to the page
+ video_id = self._html_search_regex(
+ r'<article class="video" data-id="(\d+?)"',
+ webpage, 'video id')
+
self.report_extraction(video_id)
data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index d92d14f71..e2cf1ae56 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -15,22 +15,24 @@ from ..utils import (
class YahooIE(InfoExtractor):
IE_DESC = 'Yahoo screen'
- _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
+ _VALID_URL = r'https?://screen\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html'
_TESTS = [
{
'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
- 'file': '214727115.mp4',
'md5': '4962b075c08be8690a922ee026d05e69',
'info_dict': {
+ 'id': '214727115',
+ 'ext': 'mp4',
'title': 'Julian Smith & Travis Legg Watch Julian Smith',
'description': 'Julian and Travis watch Julian Smith',
},
},
{
'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
- 'file': '103000935.mp4',
'md5': 'd6e6fc6e1313c608f316ddad7b82b306',
'info_dict': {
+ 'id': '103000935',
+ 'ext': 'mp4',
'title': 'Codefellas - The Cougar Lies with Spanish Moss',
'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
},
@@ -60,10 +62,9 @@ class YahooIE(InfoExtractor):
'env': 'prod',
'format': 'json',
})
- query_result_json = self._download_webpage(
+ query_result = self._download_json(
'http://video.query.yahoo.com/v1/public/yql?' + data,
video_id, 'Downloading video info')
- query_result = json.loads(query_result_json)
info = query_result['query']['results']['mediaObj'][0]
meta = info['meta']
@@ -86,7 +87,6 @@ class YahooIE(InfoExtractor):
else:
format_url = compat_urlparse.urljoin(host, path)
format_info['url'] = format_url
-
formats.append(format_info)
self._sort_formats(formats)
@@ -134,27 +134,25 @@ class YahooSearchIE(SearchInfoExtractor):
def _get_n_results(self, query, n):
"""Get a specified number of results for a query"""
-
- res = {
- '_type': 'playlist',
- 'id': query,
- 'entries': []
- }
- for pagenum in itertools.count(0):
+ entries = []
+ for pagenum in itertools.count(0):
result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
- webpage = self._download_webpage(result_url, query,
- note='Downloading results page '+str(pagenum+1))
- info = json.loads(webpage)
+ info = self._download_json(result_url, query,
+ note='Downloading results page '+str(pagenum+1))
m = info['m']
results = info['results']
for (i, r) in enumerate(results):
- if (pagenum * 30) +i >= n:
+ if (pagenum * 30) + i >= n:
break
mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
- res['entries'].append(e)
- if (pagenum * 30 +i >= n) or (m['last'] >= (m['total'] -1)):
+ entries.append(e)
+ if (pagenum * 30 + i >= n) or (m['last'] >= (m['total'] - 1)):
break
- return res
+ return {
+ '_type': 'playlist',
+ 'id': query,
+ 'entries': entries,
+ }