about summary refs log tree commit diff
diff options
context:
space:
mode:
author: Philipp Hagemeister <phihag@phihag.de> 2014-12-04 17:02:05 +0100
committer: Philipp Hagemeister <phihag@phihag.de> 2014-12-04 17:02:05 +0100
commit: 4349c07dd7cd07620365b36093f0a148c41ce434 (patch)
tree: 5e587e06a17c0468a1c7d4e30b4cf08e08954a44
parent: 9776bc7f57f061d133b204c056b1cebee775ddad (diff)
download: youtube-dl-4349c07dd7cd07620365b36093f0a148c41ce434.tar.xz
[minhateca] Add extractor (Fixes #4094)
-rw-r--r-- test/test_utils.py 1
-rw-r--r-- youtube_dl/extractor/__init__.py 1
-rw-r--r-- youtube_dl/extractor/minhateca.py 71
-rw-r--r-- youtube_dl/utils.py 7
4 files changed, 78 insertions, 2 deletions
diff --git a/test/test_utils.py b/test/test_utils.py
index baa3a2156..04f1bf283 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -376,6 +376,7 @@ class TestUtil(unittest.TestCase):
self.assertEqual(parse_filesize('2 MiB'), 2097152)
self.assertEqual(parse_filesize('5 GB'), 5000000000)
self.assertEqual(parse_filesize('1.2Tb'), 1200000000000)
+ self.assertEqual(parse_filesize('1,24 KB'), 1240)
if __name__ == '__main__':
unittest.main()
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 647352b59..a56ec4fb5 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -217,6 +217,7 @@ from .mdr import MDRIE
from .metacafe import MetacafeIE
from .metacritic import MetacriticIE
from .mgoon import MgoonIE
+from .minhateca import MinhatecaIE
from .ministrygrid import MinistryGridIE
from .mit import TechTVMITIE, MITIE, OCWMITIE
from .mitele import MiTeleIE
diff --git a/youtube_dl/extractor/minhateca.py b/youtube_dl/extractor/minhateca.py
new file mode 100644
index 000000000..077c9b19d
--- /dev/null
+++ b/youtube_dl/extractor/minhateca.py
@@ -0,0 +1,71 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+ compat_urllib_request,
+)
+from ..utils import (
+ int_or_none,
+ parse_filesize,
+)
+
+
+class MinhatecaIE(InfoExtractor):
+    """Extractor for files hosted on minhateca.com.br.
+
+    The file page supplies the request verification token and the displayed
+    metadata (title, size, duration, download counter); the media URL itself
+    is taken from the JSON answer of a POST to the License/Download action.
+    """
+
+    # The numeric file id follows a ',' and is terminated by the '.' that
+    # precedes the rest of the file name in the page URL.
+    _VALID_URL = r'https?://minhateca\.com\.br/[^?#]+,(?P<id>[0-9]+)\.'
+    _TEST = {
+        'url': 'http://minhateca.com.br/pereba/misc/youtube-dl+test+video,125848331.mp4(video)',
+        'info_dict': {
+            'id': '125848331',
+            'ext': 'mp4',
+            'title': 'youtube-dl test video',
+            # Raw string: '\.' is not a valid escape in a normal literal and
+            # triggers a warning on newer Python versions.
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'filesize_approx': 1530000,
+            'duration': 9,
+            'view_count': int,
+        }
+    }
+
+    def _real_extract(self, url):
+        """Build the info dict for the file page at *url*.
+
+        Downloads the page, posts the file id together with the page's
+        request verification token to obtain the media URL, and scrapes the
+        remaining metadata from the page markup.
+        """
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        # The token is embedded in the page as a form input and has to be
+        # sent back with the download request.
+        token = self._html_search_regex(
+            r'<input name="__RequestVerificationToken".*?value="([^"]+)"',
+            webpage, 'request token')
+        token_data = [
+            ('fileId', video_id),
+            ('__RequestVerificationToken', token),
+        ]
+        # urlencode() yields text on Python 3, while urllib only accepts a
+        # bytes request body there.  The encoded form is pure ASCII, so this
+        # is a no-op conversion on Python 2.
+        post_data = compat_urllib_parse.urlencode(token_data).encode('ascii')
+        req = compat_urllib_request.Request(
+            'http://minhateca.com.br/action/License/Download',
+            data=post_data)
+        req.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        data = self._download_json(
+            req, video_id, note='Downloading metadata')
+
+        video_url = data['redirectUrl']
+        title_str = self._html_search_regex(
+            r'<h1.*?>(.*?)</h1>', webpage, 'title')
+        # The heading is treated as "<title>.<ext>".  Without any '.',
+        # rpartition() returns ('', '', heading); keep the heading as the
+        # title instead of mistaking it for an extension.
+        title, dot, ext = title_str.rpartition('.')
+        if not dot:
+            # NOTE(review): ext is left unset here; confirm the downloader
+            # derives it from the media URL in that case.
+            title, ext = title_str, None
+        filesize_approx = parse_filesize(self._html_search_regex(
+            r'<p class="fileSize">(.*?)</p>',
+            webpage, 'file size approximation', fatal=False))
+        # The character classes accept both the 'fileLength' and the
+        # 'fileLenght' spelling of the CSS class.
+        duration = int_or_none(self._html_search_regex(
+            r'(?s)<p class="fileLeng[ht][th]">.*?([0-9]+)\s*s',
+            webpage, 'duration', fatal=False))
+        # The page's download counter is reported as the view count.
+        view_count = int_or_none(self._html_search_regex(
+            r'<p class="downloadsCounter">([0-9]+)</p>',
+            webpage, 'view count', fatal=False))
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'ext': ext,
+            'filesize_approx': filesize_approx,
+            'duration': duration,
+            'view_count': view_count,
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 4d3cbac74..5e9ae7a42 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1090,11 +1090,14 @@ def parse_filesize(s):
}
units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
- m = re.match(r'(?P<num>[0-9]+(?:\.[0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
+ m = re.match(
+ r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
if not m:
return None
- return int(float(m.group('num')) * _UNIT_TABLE[m.group('unit')])
+ num_str = m.group('num').replace(',', '.')
+ mult = _UNIT_TABLE[m.group('unit')]
+ return int(float(num_str) * mult)
def get_term_width():