From b5ba7b9dcfed5ded96c841a0ebbbf12132de838f Mon Sep 17 00:00:00 2001 From: Jeff Smith Date: Wed, 28 Aug 2013 14:00:59 -0500 Subject: Fix MIT extractor for Python 2.6 The HTML for the MIT page does not parse cleanly for Python 2.6 due to script tags within an actual script element. The offending piece is inside a comment block, so removing all such comment blocks fixes the parsing. --- youtube_dl/extractor/mit.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index d09d03e36..52be9232f 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -25,23 +25,21 @@ class TechTVMITIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - webpage = self._download_webpage( + raw_page = self._download_webpage( 'http://techtv.mit.edu/videos/%s' % video_id, video_id) - embed_page = self._download_webpage( - 'http://techtv.mit.edu/embeds/%s/' % video_id, video_id, - note=u'Downloading embed page') + clean_page = re.compile(u'', re.S).sub(u'', raw_page) base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)', - embed_page, u'base url') - formats_json = self._search_regex(r'bitrates: (\[.+?\])', embed_page, + raw_page, u'base url') + formats_json = self._search_regex(r'bitrates: (\[.+?\])', raw_page, u'video formats') formats = json.loads(formats_json) formats = sorted(formats, key=lambda f: f['bitrate']) - title = get_element_by_id('edit-title', webpage) - description = clean_html(get_element_by_id('edit-description', webpage)) + title = get_element_by_id('edit-title', clean_page) + description = clean_html(get_element_by_id('edit-description', clean_page)) thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'', - embed_page, u'thumbnail', flags=re.DOTALL) + raw_page, u'thumbnail', flags=re.DOTALL) return {'id': video_id, 'title': title, -- cgit v1.2.3 From 0d75ae2ce313c5738b2bdd9602ab3cc15e78810d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 29 Aug 2013 11:35:15 +0200 Subject: Fix detection of the webpage charset if it's declared using ' instead of " Like in "" --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a2986cebe..77726ee24 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -150,7 +150,7 @@ class InfoExtractor(object): if m: encoding = m.group(1) else: - m = re.search(br']+charset="?([^"]+)[ /">]', + m = re.search(br']+charset=[\'"]?([^\'")]+)[ /\'">]', webpage_bytes[:1024]) if m: encoding = m.group(1).decode('ascii') -- cgit v1.2.3 From c7bf7366bc0d4d1c4fc9c81ee5d33bf3c3512aa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 29 Aug 2013 13:41:59 +0200 Subject: Update descriptions checksum for some test for Unistra and Youtube --- youtube_dl/extractor/unistra.py | 2 +- youtube_dl/extractor/youtube.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/unistra.py b/youtube_dl/extractor/unistra.py index 5ba0a9061..516e18914 100644 --- a/youtube_dl/extractor/unistra.py +++ b/youtube_dl/extractor/unistra.py @@ -11,7 +11,7 @@ class UnistraIE(InfoExtractor): u'md5': u'736f605cfdc96724d55bb543ab3ced24', u'info_dict': { u'title': u'M!ss Yella', - u'description': u'md5:75e8439a3e2981cd5d4b6db232e8fdfc', + u'description': u'md5:104892c71bd48e55d70b902736b81bbf', }, } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8e486afd0..4038af256 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -335,7 +335,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): u"info_dict": { u"upload_date": u"20120506", u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]", - u"description": u"md5:b085c9804f5ab69f4adea963a2dceb3c", + u"description": u"md5:3e2666e0a55044490499ea45fe9037b7", u"uploader": u"Icona Pop", u"uploader_id": u"IconaPop" } -- cgit v1.2.3 From 545434670b7b055a7f0ff82b76ee7acbb3d07dd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 29 Aug 2013 19:16:07 +0200 Subject: Add an extractor for orf.at (closes #1346) Make find_xpath_attr also accept numbers in the value --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/orf.py | 65 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) create mode 100644 youtube_dl/extractor/orf.py (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6b5037c8c..90f1a4418 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -59,6 +59,7 @@ from .myvideo import MyVideoIE from .nba import NBAIE from .nbc import NBCNewsIE from .ooyala import OoyalaIE +from .orf import ORFIE from .pbs import PBSIE from .photobucket import PhotobucketIE from .pornotube import PornotubeIE diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py new file mode 100644 index 000000000..8da0a2c8e --- /dev/null +++ b/youtube_dl/extractor/orf.py @@ -0,0 +1,65 @@ +import re +import xml.etree.ElementTree +import json + +from .common import InfoExtractor +from ..utils import ( + compat_urlparse, + ExtractorError, + find_xpath_attr, +) + +class ORFIE(InfoExtractor): + _VALID_URL = r'https?://tvthek.orf.at/(programs/.+?/episodes|topics/.+?)/(?P\d+)' + + _TEST = { + u'url': u'http://tvthek.orf.at/programs/1171769-Wetter-ZIB/episodes/6557323-Wetter', + u'file': u'6566957.flv', + u'info_dict': { + u'title': u'Wetter', + u'description': u'Christa Kummer, Marcus Wadsak und Kollegen präsentieren abwechselnd ihre täglichen Wetterprognosen für Österreich.\r \r Mehr Wetter unter wetter.ORF.at', + }, + u'params': { + # It uses rtmp + u'skip_download': True, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + webpage = self._download_webpage(url, playlist_id) + + flash_xml = self._search_regex('ORF.flashXML = \'(.+?)\'', webpage, u'flash xml') + flash_xml = compat_urlparse.parse_qs('xml='+flash_xml)['xml'][0] + flash_config = xml.etree.ElementTree.fromstring(flash_xml.encode('utf-8')) + playlist_json = self._search_regex(r'playlist\': \'(\[.*?\])\'', webpage, u'playlist').replace(r'\"','"') + playlist = json.loads(playlist_json) + + videos = [] + ns = '{http://tempuri.org/XMLSchema.xsd}' + xpath = '%(ns)sPlaylist/%(ns)sItems/%(ns)sItem' % {'ns': ns} + webpage_description = self._og_search_description(webpage) + for (i, (item, info)) in enumerate(zip(flash_config.findall(xpath), playlist), 1): + # Get best quality url + rtmp_url = None + for q in ['Q6A', 'Q4A', 'Q1A']: + video_url = find_xpath_attr(item, '%sVideoUrl' % ns, 'quality', q) + if video_url is not None: + rtmp_url = video_url.text + break + if rtmp_url is None: + raise ExtractorError(u'Couldn\'t get video url: %s' % info['id']) + description = self._html_search_regex( + r'id="playlist_entry_%s".*?

(.*?)

' % i, webpage, + u'description', default=webpage_description, flags=re.DOTALL) + videos.append({ + '_type': 'video', + 'id': info['id'], + 'title': info['title'], + 'url': rtmp_url, + 'ext': 'flv', + 'description': description, + }) + + return videos -- cgit v1.2.3 From 8928491074095ec4da84be9c7d5ff4f1c0f98400 Mon Sep 17 00:00:00 2001 From: Jeff Smith Date: Thu, 29 Aug 2013 12:51:38 -0500 Subject: Fix orf.at extractor by adding file coding mark --- youtube_dl/extractor/orf.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 8da0a2c8e..41ef8e992 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -1,3 +1,5 @@ +# coding: utf-8 + import re import xml.etree.ElementTree import json -- cgit v1.2.3 From f1fb2d12b32910c641f27096e585513c5f97f9ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 29 Aug 2013 21:39:36 +0200 Subject: [ign] extract videos from articles pages --- youtube_dl/extractor/ign.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index 62abab655..263959716 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -13,7 +13,7 @@ class IGNIE(InfoExtractor): Some videos of it.ign.com are also supported """ - _VALID_URL = r'https?://.+?\.ign\.com/(?:videos|show_videos)(/.+)?/(?P.+)' + _VALID_URL = r'https?://.+?\.ign\.com/(?Pvideos|show_videos|articles)(/.+)?/(?P.+)' IE_NAME = u'ign.com' _CONFIG_URL_TEMPLATE = 'http://www.ign.com/videos/configs/id/%s.config' @@ -41,7 +41,11 @@ class IGNIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) name_or_id = mobj.group('name_or_id') + page_type = mobj.group('type') webpage = self._download_webpage(url, name_or_id) + if page_type == 'articles': + video_url = self._search_regex(r'var videoUrl = "(.+?)"', webpage, u'video url') + return self.url_result(video_url, ie='IGN') video_id = self._find_video_id(webpage) result = self._get_video_info(video_id) description = self._html_search_regex(self._DESCRIPTION_RE, -- cgit v1.2.3 From ee80d66727d7b194e595fa7e0c19c40cc4adb408 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 29 Aug 2013 21:51:09 +0200 Subject: [ign] update 1up extractor to work with the updated IGNIE --- youtube_dl/extractor/ign.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index 263959716..b1c84278a 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -72,7 +72,7 @@ class IGNIE(InfoExtractor): class OneUPIE(IGNIE): """Extractor for 1up.com, it uses the ign videos system.""" - _VALID_URL = r'https?://gamevideos.1up.com/video/id/(?P.+)' + _VALID_URL = r'https?://gamevideos.1up.com/(?Pvideo)/id/(?P.+)' IE_NAME = '1up.com' _DESCRIPTION_RE = r'
(.+?)
' -- cgit v1.2.3 From 52e1eea18bae4771137abd888830036c40b6eaa2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 29 Aug 2013 22:33:58 +0200 Subject: [youtube] update algo for length 86 (fixes #1349) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4038af256..3a07df027 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -423,7 +423,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): elif len(s) == 87: return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:] elif len(s) == 86: - return s[5:40] + s[3] + s[41:48] + s[0] + s[49:86] + return s[83:36:-1] + s[0] + s[35:2:-1] elif len(s) == 85: return s[83:34:-1] + s[0] + s[33:27:-1] + s[3] + s[26:19:-1] + s[34] + s[18:3:-1] + s[27] elif len(s) == 84: -- cgit v1.2.3 From 23b00bc0e4ae7d85876409fad59d95ce29b00d82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 29 Aug 2013 22:44:29 +0200 Subject: [youtube] update algo for length 84 Only appears sometimes, nearly identical to length 86. --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 3a07df027..9e2373bd5 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -427,7 +427,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): elif len(s) == 85: return s[83:34:-1] + s[0] + s[33:27:-1] + s[3] + s[26:19:-1] + s[34] + s[18:3:-1] + s[27] elif len(s) == 84: - return s[5:40] + s[3] + s[41:48] + s[0] + s[49:84] + return s[81:36:-1] + s[0] + s[35:2:-1] elif len(s) == 83: return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0] elif len(s) == 82: -- cgit v1.2.3 From c7a7750d3b825e3f7aa9cd3617a76d2f56c1387d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 30 Aug 2013 20:13:05 +0200 Subject: [youtube] Fix typo in the _VALID_URL for YoutubeFavouritesIE, it was intended to also match :ytfavourites --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9e2373bd5..00e2b320a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1161,7 +1161,7 @@ class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): IE_NAME = u'youtube:favorites' IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:o?rites)?' + _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?' _LOGIN_REQUIRED = True def _real_extract(self, url): -- cgit v1.2.3 From 2e756879f156139c68c3557bc65a7ab1ac31137d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 30 Aug 2013 20:49:51 +0200 Subject: [youtube] update algo for length 86 --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 00e2b320a..810ce6f5d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -423,7 +423,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): elif len(s) == 87: return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:] elif len(s) == 86: - return s[83:36:-1] + s[0] + s[35:2:-1] + return s[81:73:-1] + s[84] + s[72:58:-1] + s[0] + s[57:35:-1] + s[85] + s[34:0:-1] elif len(s) == 85: return s[83:34:-1] + s[0] + s[33:27:-1] + s[3] + s[26:19:-1] + s[34] + s[18:3:-1] + s[27] elif len(s) == 84: -- cgit v1.2.3