diff options
| -rwxr-xr-x | youtube_dl/InfoExtractors.py | 96 | ||||
| -rw-r--r-- | youtube_dl/extractor/stanfordoc.py | 112 | 
2 files changed, 113 insertions, 95 deletions
| diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 502df6a1f..af505387c 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -37,6 +37,7 @@ from .extractor.myvideo import MyVideoIE  from .extractor.statigram import StatigramIE  from .extractor.photobucket import PhotobucketIE  from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE +from .extractor.stanfordoc import StanfordOpenClassroomIE  from .extractor.vimeo import VimeoIE  from .extractor.xvideos import XVideosIE  from .extractor.yahoo import YahooIE, YahooSearchIE @@ -150,101 +151,6 @@ class MixcloudIE(InfoExtractor):              'player_url': player_url.decode('utf-8'),          }] -class StanfordOpenClassroomIE(InfoExtractor): -    """Information extractor for Stanford's Open ClassRoom""" - -    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' -    IE_NAME = u'stanfordoc' - -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        if mobj is None: -            raise ExtractorError(u'Invalid URL: %s' % url) - -        if mobj.group('course') and mobj.group('video'): # A specific video -            course = mobj.group('course') -            video = mobj.group('video') -            info = { -                'id': course + '_' + video, -                'uploader': None, -                'upload_date': None, -            } - -            self.report_extraction(info['id']) -            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/' -            xmlUrl = baseUrl + video + '.xml' -            try: -                metaXml = compat_urllib_request.urlopen(xmlUrl).read() -            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err)) -            mdoc = xml.etree.ElementTree.fromstring(metaXml) -            try: -                info['title'] = mdoc.findall('./title')[0].text -                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text -            except IndexError: -                raise ExtractorError(u'Invalid metadata XML file') -            info['ext'] = info['url'].rpartition('.')[2] -            return [info] -        elif mobj.group('course'): # A course page -            course = mobj.group('course') -            info = { -                'id': course, -                'type': 'playlist', -                'uploader': None, -                'upload_date': None, -            } - -            coursepage = self._download_webpage(url, info['id'], -                                        note='Downloading course info page', -                                        errnote='Unable to download course info page') - -            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) - -            info['description'] = self._html_search_regex('<description>([^<]+)</description>', -                coursepage, u'description', fatal=False) - -            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) -            info['list'] = [ -                { -                    'type': 'reference', -                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage), -                } -                    for vpage in links] -            results = [] -            for entry in info['list']: -                assert entry['type'] == 'reference' -                results += self.extract(entry['url']) -            return results -        else: # Root page -            info = { -                'id': 'Stanford OpenClassroom', -                'type': 'playlist', -                'uploader': None, -                'upload_date': None, -            } - -            self.report_download_webpage(info['id']) -            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php' -            try: -                rootpage = compat_urllib_request.urlopen(rootURL).read() -            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err)) - -            info['title'] = info['id'] - -            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage)) -            info['list'] = [ -                { -                    'type': 'reference', -                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage), -                } -                    for cpage in links] - -            results = [] -            for entry in info['list']: -                assert entry['type'] == 'reference' -                results += self.extract(entry['url']) -            return results  class MTVIE(InfoExtractor):      """Information extractor for MTV.com""" diff --git a/youtube_dl/extractor/stanfordoc.py b/youtube_dl/extractor/stanfordoc.py new file mode 100644 index 000000000..8d3e32ab9 --- /dev/null +++ b/youtube_dl/extractor/stanfordoc.py @@ -0,0 +1,112 @@ +import re +import socket +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( +    compat_http_client, +    compat_str, +    compat_urllib_error, +    compat_urllib_request, + +    ExtractorError, +    orderedSet, +    unescapeHTML, +) + + +class StanfordOpenClassroomIE(InfoExtractor): +    """Information extractor for Stanford's Open ClassRoom""" + +    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' +    IE_NAME = u'stanfordoc' + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        if mobj is None: +            raise ExtractorError(u'Invalid URL: %s' % url) + +        if mobj.group('course') and mobj.group('video'): # A specific video +            course = mobj.group('course') +            video = mobj.group('video') +            info = { +                'id': course + '_' + video, +                'uploader': None, +                'upload_date': None, +            } + +            self.report_extraction(info['id']) +            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/' +            xmlUrl = baseUrl + video + '.xml' +            try: +                metaXml = compat_urllib_request.urlopen(xmlUrl).read() +            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: +                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err)) +            mdoc = xml.etree.ElementTree.fromstring(metaXml) +            try: +                info['title'] = mdoc.findall('./title')[0].text +                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text +            except IndexError: +                raise ExtractorError(u'Invalid metadata XML file') +            info['ext'] = info['url'].rpartition('.')[2] +            return [info] +        elif mobj.group('course'): # A course page +            course = mobj.group('course') +            info = { +                'id': course, +                'type': 'playlist', +                'uploader': None, +                'upload_date': None, +            } + +            coursepage = self._download_webpage(url, info['id'], +                                        note='Downloading course info page', +                                        errnote='Unable to download course info page') + +            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) + +            info['description'] = self._html_search_regex('<description>([^<]+)</description>', +                coursepage, u'description', fatal=False) + +            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) +            info['list'] = [ +                { +                    'type': 'reference', +                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage), +                } +                    for vpage in links] +            results = [] +            for entry in info['list']: +                assert entry['type'] == 'reference' +                results += self.extract(entry['url']) +            return results +        else: # Root page +            info = { +                'id': 'Stanford OpenClassroom', +                'type': 'playlist', +                'uploader': None, +                'upload_date': None, +            } + +            self.report_download_webpage(info['id']) +            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php' +            try: +                rootpage = compat_urllib_request.urlopen(rootURL).read() +            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: +                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err)) + +            info['title'] = info['id'] + +            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage)) +            info['list'] = [ +                { +                    'type': 'reference', +                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage), +                } +                    for cpage in links] + +            results = [] +            for entry in info['list']: +                assert entry['type'] == 'reference' +                results += self.extract(entry['url']) +            return results | 
