| author | fnord <fnord@fnord.mobi> | 2015-06-25 00:34:46 -0500 |
| committer | fnord <fnord@fnord.mobi> | 2015-06-25 00:34:46 -0500 |
| commit | aa5740fb61d388754e9278a3e38de12203c1b89d (patch) |
| tree | 8195c113270d167f57225bdaa6f89df807341968 |
| parent | da92eeae42f556926cb676b3c14e270603b7e38e (diff) |
| parent | 18b5e1e5348ba3a6d1b6a98e97217eebb3d32a1e (diff) |
Merge remote-tracking branch 'origin/master' into pr-bbcnews
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 12 |
| -rw-r--r-- | youtube_dl/extractor/adobetv.py | 60 |
| -rw-r--r-- | youtube_dl/extractor/bbc.py | 15 |
| -rw-r--r-- | youtube_dl/extractor/brightcove.py | 3 |
| -rw-r--r-- | youtube_dl/extractor/dailymotion.py | 42 |
| -rw-r--r-- | youtube_dl/extractor/dramafever.py | 43 |
| -rw-r--r-- | youtube_dl/extractor/drbonanza.py | 12 |
| -rw-r--r-- | youtube_dl/extractor/faz.py | 21 |
| -rw-r--r-- | youtube_dl/extractor/francetv.py | 15 |
| -rw-r--r-- | youtube_dl/extractor/generic.py | 69 |
| -rw-r--r-- | youtube_dl/extractor/imdb.py | 2 |
| -rw-r--r-- | youtube_dl/extractor/pinkbike.py | 96 |
| -rw-r--r-- | youtube_dl/extractor/sohu.py | 54 |
| -rw-r--r-- | youtube_dl/extractor/tumblr.py | 16 |
| -rw-r--r-- | youtube_dl/extractor/viki.py | 10 |
| -rw-r--r-- | youtube_dl/extractor/vimeo.py | 16 |
| -rw-r--r-- | youtube_dl/extractor/xhamster.py | 34 |
| -rw-r--r-- | youtube_dl/extractor/xvideos.py | 27 |
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 2 |
| -rw-r--r-- | youtube_dl/postprocessor/ffmpeg.py | 196 |
| -rw-r--r-- | youtube_dl/utils.py | 213 |
21 files changed, 701 insertions, 257 deletions
```diff
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index a48346e60..1a9585c92 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -4,7 +4,10 @@ from .abc import ABCIE
 from .abc7news import Abc7NewsIE
 from .academicearth import AcademicEarthCourseIE
 from .addanime import AddAnimeIE
-from .adobetv import AdobeTVIE
+from .adobetv import (
+    AdobeTVIE,
+    AdobeTVVideoIE,
+)
 from .adultswim import AdultSwimIE
 from .aftenposten import AftenpostenIE
 from .aftonbladet import AftonbladetIE
@@ -103,6 +106,7 @@ from .dailymotion import (
     DailymotionIE,
     DailymotionPlaylistIE,
     DailymotionUserIE,
+    DailymotionCloudIE,
 )
 from .daum import DaumIE
 from .dbtv import DBTVIE
@@ -401,6 +405,7 @@ from .pbs import PBSIE
 from .philharmoniedeparis import PhilharmonieDeParisIE
 from .phoenix import PhoenixIE
 from .photobucket import PhotobucketIE
+from .pinkbike import PinkbikeIE
 from .planetaplay import PlanetaPlayIE
 from .pladform import PladformIE
 from .played import PlayedIE
@@ -696,7 +701,10 @@ from .wrzuta import WrzutaIE
 from .wsj import WSJIE
 from .xbef import XBefIE
 from .xboxclips import XboxClipsIE
-from .xhamster import XHamsterIE
+from .xhamster import (
+    XHamsterIE,
+    XHamsterEmbedIE,
+)
 from .xminus import XMinusIE
 from .xnxx import XNXXIE
 from .xstream import XstreamIE
```

```diff
diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py
index 97d128560..5e43adc51 100644
--- a/youtube_dl/extractor/adobetv.py
+++ b/youtube_dl/extractor/adobetv.py
@@ -5,6 +5,8 @@ from ..utils import (
     parse_duration,
     unified_strdate,
     str_to_int,
+    float_or_none,
+    ISO639Utils,
 )
@@ -69,3 +71,61 @@ class AdobeTVIE(InfoExtractor):
             'view_count': view_count,
             'formats': formats,
         }
+
+
+class AdobeTVVideoIE(InfoExtractor):
+    _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)'
+
+    _TEST = {
+        # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners
+        'url': 'https://video.tv.adobe.com/v/2456/',
+        'md5': '43662b577c018ad707a63766462b1e87',
+        'info_dict': {
+            'id': '2456',
+            'ext': 'mp4',
+            'title': 'New experience with Acrobat DC',
+            'description': 'New experience with Acrobat DC',
+            'duration': 248.667,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        player_params = self._parse_json(self._search_regex(
+            r'var\s+bridge\s*=\s*([^;]+);', webpage, 'player parameters'),
+            video_id)
+
+        formats = [{
+            'url': source['src'],
+            'width': source.get('width'),
+            'height': source.get('height'),
+            'tbr': source.get('bitrate'),
+        } for source in player_params['sources']]
+
+        # For both metadata and downloaded files the duration varies among
+        # formats. I just pick the max one
+        duration = max(filter(None, [
+            float_or_none(source.get('duration'), scale=1000)
+            for source in player_params['sources']]))
+
+        subtitles = {}
+        for translation in player_params.get('translations', []):
+            lang_id = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium'])
+            if lang_id not in subtitles:
+                subtitles[lang_id] = []
+            subtitles[lang_id].append({
+                'url': translation['vttPath'],
+                'ext': 'vtt',
+            })
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': player_params['title'],
+            'description': self._og_search_description(webpage),
+            'duration': duration,
+            'subtitles': subtitles,
+        }
```
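The new `AdobeTVVideoIE` above pulls its formats out of a `var bridge = {...};` JSON blob in the page rather than from an API. Here is a standalone sketch of that parsing; the HTML snippet is fabricated, and only the regex and the max-duration logic come from the diff:

```python
import json
import re

# Fabricated page; real pages at video.tv.adobe.com embed a similar blob.
SAMPLE_PAGE = """
<script>
var bridge = {"title": "Sample",
              "sources": [
    {"src": "http://example.invalid/lo.mp4", "bitrate": 600, "duration": 248667},
    {"src": "http://example.invalid/hi.mp4", "bitrate": 2400, "duration": 248700}
]};
</script>
"""

# Same regex as the extractor: grab everything between 'var bridge =' and ';'
player_params = json.loads(
    re.search(r'var\s+bridge\s*=\s*([^;]+);', SAMPLE_PAGE).group(1))

formats = [{'url': s['src'], 'tbr': s.get('bitrate')}
           for s in player_params['sources']]

# Durations differ slightly per format, so the extractor picks the max.
duration = max(s['duration'] / 1000.0
               for s in player_params['sources'] if s.get('duration'))

print(len(formats), duration)  # -> 2 248.7
```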
```diff
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index bb671d473..471d865d2 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -255,26 +255,11 @@ class BBCCoUkIE(InfoExtractor):
         for connection in self._extract_connections(media):
             captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
             lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
-            ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
-            srt = ''
-
-            def _extract_text(p):
-                if p.text is not None:
-                    stripped_text = p.text.strip()
-                    if stripped_text:
-                        return stripped_text
-                return ' '.join(span.text.strip() for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span'))
-            for pos, p in enumerate(ps):
-                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p))
             subtitles[lang] = [
                 {
                     'url': connection.get('href'),
                     'ext': 'ttml',
                 },
-                {
-                    'data': srt,
-                    'ext': 'srt',
-                },
             ]
         return subtitles
```

```diff
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index d768f99e6..4721c2293 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -13,6 +13,7 @@ from ..compat import (
     compat_urllib_parse_urlparse,
     compat_urllib_request,
     compat_urlparse,
+    compat_xml_parse_error,
 )
 from ..utils import (
     determine_ext,
@@ -119,7 +120,7 @@ class BrightcoveIE(InfoExtractor):
         try:
             object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8'))
-        except xml.etree.ElementTree.ParseError:
+        except compat_xml_parse_error:
             return
 
         fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
```

```diff
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 70aa4333c..96f0ed9ad 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -251,3 +251,45 @@ class DailymotionUserIE(DailymotionPlaylistIE):
             'title': full_user,
             'entries': self._extract_entries(user),
         }
+
+
+class DailymotionCloudIE(DailymotionBaseInfoExtractor):
+    _VALID_URL = r'http://api\.dmcloud\.net/embed/[^/]+/(?P<id>[^/?]+)'
+
+    _TEST = {
+        # From http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html
+        # Tested at FranceTvInfo_2
+        'url': 'http://api.dmcloud.net/embed/4e7343f894a6f677b10006b4/556e03339473995ee145930c?auth=1464865870-0-jyhsm84b-ead4c701fb750cf9367bf4447167a3db&autoplay=1',
+        'only_matching': True,
+    }
+
+    @classmethod
+    def _extract_dmcloud_url(self, webpage):
+        mobj = re.search(r'<iframe[^>]+src=[\'"](http://api\.dmcloud\.net/embed/[^/]+/[^\'"]+)[\'"]', webpage)
+        if mobj:
+            return mobj.group(1)
+
+        mobj = re.search(r'<input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=[\'"](http://api\.dmcloud\.net/embed/[^/]+/[^\'"]+)[\'"]', webpage)
+        if mobj:
+            return mobj.group(1)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        request = self._build_request(url)
+        webpage = self._download_webpage(request, video_id)
+
+        title = self._html_search_regex(r'<title>([^>]+)</title>', webpage, 'title')
+
+        video_info = self._parse_json(self._search_regex(
+            r'var\s+info\s*=\s*([^;]+);', webpage, 'video info'), video_id)
+
+        # TODO: parse ios_url, which is in fact a manifest
+        video_url = video_info['mp4_url']
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'thumbnail': video_info.get('thumbnail_url'),
+        }
```

```diff
diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py
index a34aad486..ca41a3abf 100644
--- a/youtube_dl/extractor/dramafever.py
+++ b/youtube_dl/extractor/dramafever.py
@@ -6,6 +6,8 @@ import itertools
 from .common import InfoExtractor
 from ..compat import (
     compat_HTTPError,
+    compat_urllib_parse,
+    compat_urllib_request,
     compat_urlparse,
 )
 from ..utils import (
@@ -17,7 +19,39 @@ from ..utils import (
 )
 
 
-class DramaFeverIE(InfoExtractor):
+class DramaFeverBaseIE(InfoExtractor):
+    _LOGIN_URL = 'https://www.dramafever.com/accounts/login/'
+    _NETRC_MACHINE = 'dramafever'
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+
+        login_form = {
+            'username': username,
+            'password': password,
+        }
+
+        request = compat_urllib_request.Request(
+            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+        response = self._download_webpage(
+            request, None, 'Logging in as %s' % username)
+
+        if all(logout_pattern not in response
+               for logout_pattern in ['href="/accounts/logout/"', '>Log out<']):
+            error = self._html_search_regex(
+                r'(?s)class="hidden-xs prompt"[^>]*>(.+?)<',
+                response, 'error message', default=None)
+            if error:
+                raise ExtractorError('Unable to login: %s' % error, expected=True)
+            raise ExtractorError('Unable to log in')
+
+
+class DramaFeverIE(DramaFeverBaseIE):
     IE_NAME = 'dramafever'
     _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+/[0-9]+)(?:/|$)'
     _TEST = {
@@ -97,7 +131,7 @@ class DramaFeverIE(InfoExtractor):
         }
 
 
-class DramaFeverSeriesIE(InfoExtractor):
+class DramaFeverSeriesIE(DramaFeverBaseIE):
     IE_NAME = 'dramafever:series'
     _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+)(?:/(?:(?!\d+(?:/|$)).+)?)?$'
     _TESTS = [{
@@ -151,8 +185,11 @@ class DramaFeverSeriesIE(InfoExtractor):
                 % (consumer_secret, series_id, self._PAGE_SIZE, page_num),
                 series_id, 'Downloading episodes JSON page #%d' % page_num)
             for episode in episodes.get('value', []):
+                episode_url = episode.get('episode_url')
+                if not episode_url:
+                    continue
                 entries.append(self.url_result(
-                    compat_urlparse.urljoin(url, episode['episode_url']),
+                    compat_urlparse.urljoin(url, episode_url),
                     'DramaFever', episode.get('guid')))
             if page_num == episodes['num_pages']:
                 break
```

```diff
diff --git a/youtube_dl/extractor/drbonanza.py b/youtube_dl/extractor/drbonanza.py
index 7626219ba..8b98b013a 100644
--- a/youtube_dl/extractor/drbonanza.py
+++ b/youtube_dl/extractor/drbonanza.py
@@ -15,7 +15,6 @@ class DRBonanzaIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'http://www.dr.dk/bonanza/serie/portraetter/Talkshowet.htm?assetId=65517',
-        'md5': 'fe330252ddea607635cf2eb2c99a0af3',
         'info_dict': {
             'id': '65517',
             'ext': 'mp4',
@@ -26,6 +25,9 @@ class DRBonanzaIE(InfoExtractor):
             'upload_date': '20110120',
             'duration': 3664,
         },
+        'params': {
+            'skip_download': True,  # requires rtmp
+        },
     }, {
         'url': 'http://www.dr.dk/bonanza/radio/serie/sport/fodbold.htm?assetId=59410',
         'md5': '6dfe039417e76795fb783c52da3de11d',
@@ -93,6 +95,11 @@ class DRBonanzaIE(InfoExtractor):
                         'format_id': file['Type'].replace('Video', ''),
                         'preference': preferencemap.get(file['Type'], -10),
                     })
+                    if format['url'].startswith('rtmp'):
+                        rtmp_url = format['url']
+                        format['rtmp_live'] = True  # --resume does not work
+                        if '/bonanza/' in rtmp_url:
+                            format['play_path'] = rtmp_url.split('/bonanza/')[1]
                     formats.append(format)
                 elif file['Type'] == "Thumb":
                     thumbnail = file['Location']
@@ -111,9 +118,6 @@ class DRBonanzaIE(InfoExtractor):
         description = '%s\n%s\n%s\n' % (
             info['Description'], info['Actors'], info['Colophon'])
 
-        for f in formats:
-            f['url'] = f['url'].replace('rtmp://vod-bonanza.gss.dr.dk/bonanza/', 'http://vodfiles.dr.dk/')
-            f['url'] = f['url'].replace('mp4:bonanza', 'bonanza')
         self._sort_formats(formats)
 
         display_id = re.sub(r'[^\w\d-]', '', re.sub(r' ', '-', title.lower())) + '-' + asset_id
```
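`DailymotionCloudIE._extract_dmcloud_url` (added above) is deliberately a classmethod so that other extractors can probe arbitrary pages for Dailymotion Cloud embeds. A quick demo of its first regex against a fabricated host page; the embed URL is the `only_matching` test URL from the diff:

```python
import re

# Fabricated host page wrapping a Dailymotion Cloud iframe.
page = ('<div class="player"><iframe frameborder="0" '
        'src="http://api.dmcloud.net/embed/4e7343f894a6f677b10006b4/'
        '556e03339473995ee145930c?auth=1464865870-0-jyhsm84b-'
        'ead4c701fb750cf9367bf4447167a3db&autoplay=1"></iframe></div>')

# First pattern from _extract_dmcloud_url
mobj = re.search(
    r'<iframe[^>]+src=[\'"](http://api\.dmcloud\.net/embed/[^/]+/[^\'"]+)[\'"]',
    page)
print(mobj.group(1) if mobj else None)  # prints the embed URL
```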
```diff
diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py
index 3c39ca451..cebdd0193 100644
--- a/youtube_dl/extractor/faz.py
+++ b/youtube_dl/extractor/faz.py
@@ -6,9 +6,9 @@ from .common import InfoExtractor
 
 class FazIE(InfoExtractor):
     IE_NAME = 'faz.net'
-    _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+)\.html'
+    _VALID_URL = r'https?://(?:www\.)?faz\.net/(?:[^/]+/)*.*?-(?P<id>\d+)\.html'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html',
         'info_dict': {
             'id': '12610585',
@@ -16,7 +16,22 @@ class FazIE(InfoExtractor):
             'title': 'Stockholm: Chemie-Nobelpreis für drei amerikanische Forscher',
             'description': 'md5:1453fbf9a0d041d985a47306192ea253',
         },
-    }
+    }, {
+        'url': 'http://www.faz.net/aktuell/politik/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.faz.net/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.faz.net/-13659345.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.faz.net/aktuell/politik/-13659345.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.faz.net/foobarblafasel-13659345.html',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
```

```diff
diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py
index db0bbec1e..b2c984bf2 100644
--- a/youtube_dl/extractor/francetv.py
+++ b/youtube_dl/extractor/francetv.py
@@ -18,6 +18,7 @@ from ..utils import (
     parse_duration,
     determine_ext,
 )
+from .dailymotion import DailymotionCloudIE
 
 
 class FranceTVBaseInfoExtractor(InfoExtractor):
@@ -131,12 +132,26 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
             'skip_download': 'HLS (reqires ffmpeg)'
         },
         'skip': 'Ce direct est terminé et sera disponible en rattrapage dans quelques minutes.',
+    }, {
+        'url': 'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html',
+        'md5': 'f485bda6e185e7d15dbc69b72bae993e',
+        'info_dict': {
+            'id': '556e03339473995ee145930c',
+            'ext': 'mp4',
+            'title': 'Les entreprises familiales : le secret de la réussite',
+            'thumbnail': 're:^https?://.*\.jpe?g$',
+        }
     }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         page_title = mobj.group('title')
         webpage = self._download_webpage(url, page_title)
+
+        dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
+        if dmcloud_url:
+            return self.url_result(dmcloud_url, 'DailymotionCloud')
+
         video_id, catalogue = self._search_regex(
             r'id-video=([^@]+@[^"]+)', webpage, 'video id').split('@')
         return self._extract_video(video_id, catalogue)
```

```diff
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index f6b984300..5c03fddc6 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -43,6 +43,9 @@ from .senateisvp import SenateISVPIE
 from .bliptv import BlipTVIE
 from .svt import SVTIE
 from .pornhub import PornHubIE
+from .xhamster import XHamsterEmbedIE
+from .vimeo import VimeoIE
+from .dailymotion import DailymotionCloudIE
 
 
 class GenericIE(InfoExtractor):
@@ -333,6 +336,15 @@ class GenericIE(InfoExtractor):
                 'skip_download': True,
             },
         },
+        # XHamster embed
+        {
+            'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8',
+            'info_dict': {
+                'id': 'showthread',
+                'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )',
+            },
+            'playlist_mincount': 7,
+        },
         # Embedded TED video
         {
             'url': 'http://en.support.wordpress.com/videos/ted-talks/',
@@ -812,6 +824,29 @@
                 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
                 'uploader': 'Rogers Sportsnet',
             },
+        },
+        # Dailymotion Cloud video
+        {
+            'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910',
+            'md5': '49444254273501a64675a7e68c502681',
+            'info_dict': {
+                'id': '5585de919473990de4bee11b',
+                'ext': 'mp4',
+                'title': 'Le débat',
+                'thumbnail': 're:^https?://.*\.jpe?g$',
+            }
+        },
+        # AdobeTVVideo embed
+        {
+            'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners',
+            'md5': '43662b577c018ad707a63766462b1e87',
+            'info_dict': {
+                'id': '2456',
+                'ext': 'mp4',
+                'title': 'New experience with Acrobat DC',
+                'description': 'New experience with Acrobat DC',
+                'duration': 248.667,
+            },
+        }
     ]
@@ -1089,18 +1124,9 @@
         if matches:
             return _playlist_from_matches(matches, ie='RtlNl')
 
-        # Look for embedded (iframe) Vimeo player
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
-        if mobj:
-            player_url = unescapeHTML(mobj.group('url'))
-            surl = smuggle_url(player_url, {'Referer': url})
-            return self.url_result(surl)
-        # Look for embedded (swf embed) Vimeo player
-        mobj = re.search(
-            r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
-        if mobj:
-            return self.url_result(mobj.group(1))
+        vimeo_url = VimeoIE._extract_vimeo_url(url, webpage)
+        if vimeo_url is not None:
+            return self.url_result(vimeo_url)
 
         # Look for embedded YouTube player
         matches = re.findall(r'''(?x)
@@ -1327,6 +1353,11 @@
         if pornhub_url:
             return self.url_result(pornhub_url, 'PornHub')
 
+        # Look for embedded XHamster player
+        xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
+        if xhamster_urls:
+            return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
+
         # Look for embedded Tvigle player
         mobj = re.search(
             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
@@ -1494,6 +1525,20 @@
         if senate_isvp_url:
             return self.url_result(senate_isvp_url, 'SenateISVP')
 
+        # Look for Dailymotion Cloud videos
+        dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
+        if dmcloud_url:
+            return self.url_result(dmcloud_url, 'DailymotionCloud')
+
+        # Look for AdobeTVVideo embeds
+        mobj = re.search(
+            r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]',
+            webpage)
+        if mobj is not None:
+            return self.url_result(
+                self._proto_relative_url(unescapeHTML(mobj.group(1))),
+                'AdobeTVVideo')
+
         def check_video(vurl):
             if YoutubeIE.suitable(vurl):
                 return True
```

```diff
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
index f29df36b5..4bb574cf3 100644
--- a/youtube_dl/extractor/imdb.py
+++ b/youtube_dl/extractor/imdb.py
@@ -46,7 +46,7 @@ class ImdbIE(InfoExtractor):
             format_info = info['videoPlayerObject']['video']
             formats.append({
                 'format_id': f_id,
-                'url': format_info['url'],
+                'url': format_info['videoInfoList'][0]['videoUrl'],
             })
 
         return {
```
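The `generic.py` changes above all follow one pattern: probe the downloaded page with a regex per supported embed and hand the match off to the dedicated extractor. Below is an illustrative, much-simplified sketch of that dispatch, reusing the Vimeo-iframe and AdobeTVVideo patterns from the diff; the table-driven loop itself is an invention for illustration, not GenericIE's actual structure:

```python
import re

EMBED_PATTERNS = [
    ('Vimeo',
     r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1'),
    ('AdobeTVVideo',
     r'<iframe[^>]+src=[\'"](?P<url>(?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]'),
]

def detect_embed(webpage):
    """Return (ie_key, embed_url) for the first recognized embed, if any."""
    for ie_key, pattern in EMBED_PATTERNS:
        mobj = re.search(pattern, webpage)
        if mobj:
            return ie_key, mobj.group('url')  # GenericIE would url_result() this
    return None, None

print(detect_embed('<iframe src="//video.tv.adobe.com/v/2456/"></iframe>'))
# -> ('AdobeTVVideo', '//video.tv.adobe.com/v/2456/')
```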
```diff
diff --git a/youtube_dl/extractor/pinkbike.py b/youtube_dl/extractor/pinkbike.py
new file mode 100644
index 000000000..a52210fab
--- /dev/null
+++ b/youtube_dl/extractor/pinkbike.py
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    remove_end,
+    remove_start,
+    str_to_int,
+    unified_strdate,
+)
+
+
+class PinkbikeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www\.)?pinkbike\.com/video/|es\.pinkbike\.org/i/kvid/kvid-y5\.swf\?id=)(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.pinkbike.com/video/402811/',
+        'md5': '4814b8ca7651034cd87e3361d5c2155a',
+        'info_dict': {
+            'id': '402811',
+            'ext': 'mp4',
+            'title': 'Brandon Semenuk - RAW 100',
+            'description': 'Official release: www.redbull.ca/rupertwalker',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 100,
+            'upload_date': '20150406',
+            'uploader': 'revelco',
+            'location': 'Victoria, British Columbia, Canada',
+            'view_count': int,
+            'comment_count': int,
+        }
+    }, {
+        'url': 'http://es.pinkbike.org/i/kvid/kvid-y5.swf?id=406629',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://www.pinkbike.com/video/%s' % video_id, video_id)
+
+        formats = []
+        for _, format_id, src in re.findall(
+                r'data-quality=((?:\\)?["\'])(.+?)\1[^>]+src=\1(.+?)\1', webpage):
+            height = int_or_none(self._search_regex(
+                r'^(\d+)[pP]$', format_id, 'height', default=None))
+            formats.append({
+                'url': src,
+                'format_id': format_id,
+                'height': height,
+            })
+        self._sort_formats(formats)
+
+        title = remove_end(self._og_search_title(webpage), ' Video - Pinkbike')
+        description = self._html_search_regex(
+            r'(?s)id="media-description"[^>]*>(.+?)<',
+            webpage, 'description', default=None) or remove_start(
+            self._og_search_description(webpage), title + '. ')
+        thumbnail = self._og_search_thumbnail(webpage)
+        duration = int_or_none(self._html_search_meta(
+            'video:duration', webpage, 'duration'))
+
+        uploader = self._search_regex(
+            r'un:\s*"([^"]+)"', webpage, 'uploader', fatal=False)
+        upload_date = unified_strdate(self._search_regex(
+            r'class="fullTime"[^>]+title="([^"]+)"',
+            webpage, 'upload date', fatal=False))
+
+        location = self._html_search_regex(
+            r'(?s)<dt>Location</dt>\s*<dd>(.+?)<',
+            webpage, 'location', fatal=False)
+
+        def extract_count(webpage, label):
+            return str_to_int(self._search_regex(
+                r'<span[^>]+class="stat-num"[^>]*>([\d,.]+)</span>\s*<span[^>]+class="stat-label"[^>]*>%s' % label,
+                webpage, label, fatal=False))
+
+        view_count = extract_count(webpage, 'Views')
+        comment_count = extract_count(webpage, 'Comments')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'upload_date': upload_date,
+            'uploader': uploader,
+            'location': location,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'formats': formats
+        }
```

```diff
diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py
index 29bd9ce6f..ba2d5e19b 100644
--- a/youtube_dl/extractor/sohu.py
+++ b/youtube_dl/extractor/sohu.py
@@ -6,9 +6,12 @@ import re
 from .common import InfoExtractor
 from ..compat import (
     compat_str,
-    compat_urllib_request
+    compat_urllib_request,
+    compat_urllib_parse,
+)
+from ..utils import (
+    ExtractorError,
 )
-from ..utils import ExtractorError
 
 
 class SohuIE(InfoExtractor):
@@ -26,7 +29,7 @@ class SohuIE(InfoExtractor):
         'skip': 'On available in China',
     }, {
         'url': 'http://tv.sohu.com/20150305/n409385080.shtml',
-        'md5': 'ac9a5d322b4bf9ae184d53e4711e4f1a',
+        'md5': '699060e75cf58858dd47fb9c03c42cfb',
         'info_dict': {
             'id': '409385080',
             'ext': 'mp4',
@@ -34,7 +37,7 @@ class SohuIE(InfoExtractor):
         }
     }, {
         'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml',
-        'md5': '49308ff6dafde5ece51137d04aec311e',
+        'md5': '9bf34be48f2f4dadcb226c74127e203c',
         'info_dict': {
             'id': '78693464',
             'ext': 'mp4',
@@ -48,7 +51,7 @@ class SohuIE(InfoExtractor):
             'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
         },
         'playlist': [{
-            'md5': '492923eac023ba2f13ff69617c32754a',
+            'md5': 'bdbfb8f39924725e6589c146bc1883ad',
             'info_dict': {
                 'id': '78910339_part1',
                 'ext': 'mp4',
@@ -56,7 +59,7 @@ class SohuIE(InfoExtractor):
                 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
             }
         }, {
-            'md5': 'de604848c0e8e9c4a4dde7e1347c0637',
+            'md5': '3e1f46aaeb95354fd10e7fca9fc1804e',
             'info_dict': {
                 'id': '78910339_part2',
                 'ext': 'mp4',
@@ -64,7 +67,7 @@ class SohuIE(InfoExtractor):
                 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
             }
         }, {
-            'md5': '93584716ee0657c0b205b8aa3d27aa13',
+            'md5': '8407e634175fdac706766481b9443450',
             'info_dict': {
                 'id': '78910339_part3',
                 'ext': 'mp4',
@@ -139,21 +142,42 @@ class SohuIE(InfoExtractor):
         for i in range(part_count):
             formats = []
             for format_id, format_data in formats_json.items():
+                allot = format_data['allot']
+
                 data = format_data['data']
+                clips_url = data['clipsURL']
+                su = data['su']
 
-                # URLs starts with http://newflv.sohu.ccgslb.net/ is not usable
-                # so retry until got a working URL
                 video_url = 'newflv.sohu.ccgslb.net'
+                cdnId = None
                 retries = 0
-                while 'newflv.sohu.ccgslb.net' in video_url and retries < 5:
-                    download_note = 'Download information from CDN gateway for format ' + format_id
+
+                while 'newflv.sohu.ccgslb.net' in video_url:
+                    params = {
+                        'prot': 9,
+                        'file': clips_url[i],
+                        'new': su[i],
+                        'prod': 'flash',
+                    }
+
+                    if cdnId is not None:
+                        params['idc'] = cdnId
+
+                    download_note = 'Downloading %s video URL part %d of %d' % (
+                        format_id, i + 1, part_count)
+
                     if retries > 0:
                         download_note += ' (retry #%d)' % retries
+                    part_info = self._parse_json(self._download_webpage(
+                        'http://%s/?%s' % (allot, compat_urllib_parse.urlencode(params)),
+                        video_id, download_note), video_id)
+
+                    video_url = part_info['url']
+                    cdnId = part_info.get('nid')
+
                     retries += 1
-                    cdn_info = self._download_json(
-                        'http://data.vod.itc.cn/cdnList?new=' + data['su'][i],
-                        video_id, download_note)
-                    video_url = cdn_info['url']
+                    if retries > 5:
+                        raise ExtractorError('Failed to get video URL')
 
                 formats.append({
                     'url': video_url,
```

```diff
diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py
index 63c20310d..9ead13a91 100644
--- a/youtube_dl/extractor/tumblr.py
+++ b/youtube_dl/extractor/tumblr.py
@@ -5,6 +5,7 @@ import re
 
 from .common import InfoExtractor
 from .pornhub import PornHubIE
+from .vimeo import VimeoIE
 
 
 class TumblrIE(InfoExtractor):
@@ -40,6 +41,17 @@ class TumblrIE(InfoExtractor):
             'timestamp': 1430931613,
         },
         'add_ie': ['Vidme'],
+    }, {
+        'url': 'http://camdamage.tumblr.com/post/98846056295/',
+        'md5': 'a9e0c8371ea1ca306d6554e3fecf50b6',
+        'info_dict': {
+            'id': '105463834',
+            'ext': 'mp4',
+            'title': 'Cam Damage-HD 720p',
+            'uploader': 'John Moyer',
+            'uploader_id': 'user32021558',
+        },
+        'add_ie': ['Vimeo'],
     }]
 
     def _real_extract(self, url):
@@ -60,6 +72,10 @@ class TumblrIE(InfoExtractor):
         if pornhub_url:
             return self.url_result(pornhub_url, 'PornHub')
 
+        vimeo_url = VimeoIE._extract_vimeo_url(url, webpage)
+        if vimeo_url:
+            return self.url_result(vimeo_url, 'Vimeo')
+
         iframe_url = self._search_regex(
             r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'',
             webpage, 'iframe url')
```
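The rewritten Sohu loop above negotiates with the per-format `allot` gateway instead of the old `cdnList` endpoint, feeding the returned CDN id (`nid`) back as the `idc` parameter until it gets a URL off the unusable `newflv.sohu.ccgslb.net` host. A condensed, runnable sketch with the HTTP round trip stubbed out; only the loop shape and the parameter names come from the diff:

```python
# Canned gateway answers standing in for the real round trips to
# http://<allot>/?prot=9&file=...&new=...&prod=flash(&idc=...)
_responses = iter([
    {'url': 'http://newflv.sohu.ccgslb.net/part1.mp4', 'nid': 7},  # unusable
    {'url': 'http://good-cdn.example/part1.mp4', 'nid': 7},        # usable
])

def fetch_part_info(params):
    """Stand-in for _download_webpage + _parse_json against the gateway."""
    return next(_responses)

video_url = 'newflv.sohu.ccgslb.net'
cdn_id = None
retries = 0
while 'newflv.sohu.ccgslb.net' in video_url:
    params = {'prot': 9, 'prod': 'flash'}
    if cdn_id is not None:
        params['idc'] = cdn_id          # feed the CDN id back, as in the diff
    info = fetch_part_info(params)
    video_url, cdn_id = info['url'], info.get('nid')
    retries += 1
    if retries > 5:
        raise RuntimeError('Failed to get video URL')

print(video_url)  # -> http://good-cdn.example/part1.mp4
```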
```diff
diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py
index 52d10d242..51cdc6b65 100644
--- a/youtube_dl/extractor/viki.py
+++ b/youtube_dl/extractor/viki.py
@@ -28,11 +28,15 @@ class VikiBaseIE(InfoExtractor):
 
     _NETRC_MACHINE = 'viki'
 
+    _token = None
+
     def _prepare_call(self, path, timestamp=None, post_data=None):
         path += '?' if '?' not in path else '&'
         if not timestamp:
             timestamp = int(time.time())
         query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp)
+        if self._token:
+            query += '&token=%s' % self._token
         sig = hmac.new(
             self._APP_SECRET.encode('ascii'),
             query.encode('ascii'),
@@ -76,10 +80,14 @@ class VikiBaseIE(InfoExtractor):
             'password': password,
         }
 
-        self._call_api(
+        login = self._call_api(
             'sessions.json', None,
             'Logging in as %s' % username, post_data=login_form)
 
+        self._token = login.get('token')
+        if not self._token:
+            self.report_warning('Unable to get session token, login has probably failed')
+
 
 class VikiIE(VikiBaseIE):
     IE_NAME = 'viki'
```

```diff
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index f300c7ca4..cae90205d 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -22,6 +22,7 @@ from ..utils import (
     unified_strdate,
     unsmuggle_url,
     urlencode_postdata,
+    unescapeHTML,
 )
 
 
@@ -173,6 +174,21 @@ class VimeoIE(VimeoBaseInfoExtractor):
         },
     ]
 
+    @staticmethod
+    def _extract_vimeo_url(url, webpage):
+        # Look for embedded (iframe) Vimeo player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
+        if mobj:
+            player_url = unescapeHTML(mobj.group('url'))
+            surl = smuggle_url(player_url, {'Referer': url})
+            return surl
+        # Look for embedded (swf embed) Vimeo player
+        mobj = re.search(
+            r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
+        if mobj:
+            return mobj.group(1)
+
     def _verify_video_password(self, url, video_id, webpage):
         password = self._downloader.params.get('videopassword', None)
         if password is None:
```

```diff
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py
index 4527567f8..b4ad513a0 100644
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -13,7 +13,6 @@ from ..utils import (
 
 
 class XHamsterIE(InfoExtractor):
-    """Information Extractor for xHamster"""
     _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
     _TESTS = [
         {
@@ -133,3 +132,36 @@ class XHamsterIE(InfoExtractor):
             'age_limit': age_limit,
             'formats': formats,
         }
+
+
+class XHamsterEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?xhamster\.com/xembed\.php\?video=(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://xhamster.com/xembed.php?video=3328539',
+        'info_dict': {
+            'id': '3328539',
+            'ext': 'mp4',
+            'title': 'Pen Masturbation',
+            'upload_date': '20140728',
+            'uploader_id': 'anonymous',
+            'duration': 5,
+            'age_limit': 18,
+        }
+    }
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [url for _, url in re.findall(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1',
+            webpage)]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._search_regex(
+            r'href="(https?://xhamster\.com/movies/%s/[^"]+\.html[^"]*)"' % video_id,
+            webpage, 'xhamster url')
+
+        return self.url_result(video_url, 'XHamster')
```

```diff
diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py
index 2a45dc574..d8415bed4 100644
--- a/youtube_dl/extractor/xvideos.py
+++ b/youtube_dl/extractor/xvideos.py
@@ -5,10 +5,12 @@ import re
 from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse,
+    compat_urllib_request,
 )
 from ..utils import (
     clean_html,
     ExtractorError,
+    determine_ext,
 )
 
 
@@ -25,6 +27,8 @@ class XVideosIE(InfoExtractor):
         }
     }
 
+    _ANDROID_USER_AGENT = 'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19'
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
@@ -40,9 +44,30 @@
         video_thumbnail = self._search_regex(
             r'url_bigthumb=(.+?)&', webpage, 'thumbnail', fatal=False)
 
+        formats = [{
+            'url': video_url,
+        }]
+
+        android_req = compat_urllib_request.Request(url)
+        android_req.add_header('User-Agent', self._ANDROID_USER_AGENT)
+        android_webpage = self._download_webpage(android_req, video_id, fatal=False)
+
+        if android_webpage is not None:
+            player_params_str = self._search_regex(
+                'mobileReplacePlayerDivTwoQual\(([^)]+)\)',
+                android_webpage, 'player parameters', default='')
+            player_params = list(map(lambda s: s.strip(' \''), player_params_str.split(',')))
+            if player_params:
+                formats.extend([{
+                    'url': param,
+                    'preference': -10,
+                } for param in player_params if determine_ext(param) == 'mp4'])
+
+        self._sort_formats(formats)
+
         return {
             'id': video_id,
-            'url': video_url,
+            'formats': formats,
             'title': video_title,
             'ext': 'flv',
             'thumbnail': video_thumbnail,
```
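The Viki change above threads the session token returned by `sessions.json` into every signed API call. A rough sketch of that signing order follows; the app id, secret and query template here are placeholders rather than Viki's real values, and only the append-token-before-HMAC step mirrors the diff:

```python
import hashlib
import hmac
import time

APP = '100005a'                      # placeholder, not Viki's real app id
APP_SECRET = b'not-the-real-secret'  # placeholder

def prepare_call(path, token=None, timestamp=None):
    path += '?' if '?' not in path else '&'
    timestamp = timestamp or int(time.time())
    # Simplified stand-in for _API_QUERY_TEMPLATE
    query = '/v4/%sapp=%s&t=%d' % (path, APP, timestamp)
    if token:  # new in this merge: the token joins the query before signing
        query += '&token=%s' % token
    sig = hmac.new(APP_SECRET, query.encode('ascii'), hashlib.sha1).hexdigest()
    return query + '&sig=' + sig

print(prepare_call('videos/1.json', token='abc123', timestamp=1435190400))
```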
```diff
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 9e2671192..a3da56c14 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -234,6 +234,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         '44': {'ext': 'webm', 'width': 854, 'height': 480},
         '45': {'ext': 'webm', 'width': 1280, 'height': 720},
         '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
+        '59': {'ext': 'mp4', 'width': 854, 'height': 480},
+        '78': {'ext': 'mp4', 'width': 854, 'height': 480},
 
         # 3d videos
```

```diff
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index cc65b34e7..fe7e0a8ee 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -21,6 +21,7 @@ from ..utils import (
     shell_quote,
     subtitles_filename,
     dfxp2srt,
+    ISO639Utils,
 )
 
 
@@ -307,199 +308,6 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor):
 
 
 class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
-    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
-    _lang_map = {
-        'aa': 'aar',
-        'ab': 'abk',
-        'ae': 'ave',
-        'af': 'afr',
-        'ak': 'aka',
-        'am': 'amh',
-        'an': 'arg',
-        'ar': 'ara',
-        'as': 'asm',
-        'av': 'ava',
-        'ay': 'aym',
-        'az': 'aze',
-        'ba': 'bak',
-        'be': 'bel',
-        'bg': 'bul',
-        'bh': 'bih',
-        'bi': 'bis',
-        'bm': 'bam',
-        'bn': 'ben',
-        'bo': 'bod',
-        'br': 'bre',
-        'bs': 'bos',
-        'ca': 'cat',
-        'ce': 'che',
-        'ch': 'cha',
-        'co': 'cos',
-        'cr': 'cre',
-        'cs': 'ces',
-        'cu': 'chu',
-        'cv': 'chv',
-        'cy': 'cym',
-        'da': 'dan',
-        'de': 'deu',
-        'dv': 'div',
-        'dz': 'dzo',
-        'ee': 'ewe',
-        'el': 'ell',
-        'en': 'eng',
-        'eo': 'epo',
-        'es': 'spa',
-        'et': 'est',
-        'eu': 'eus',
-        'fa': 'fas',
-        'ff': 'ful',
-        'fi': 'fin',
-        'fj': 'fij',
-        'fo': 'fao',
-        'fr': 'fra',
-        'fy': 'fry',
-        'ga': 'gle',
-        'gd': 'gla',
-        'gl': 'glg',
-        'gn': 'grn',
-        'gu': 'guj',
-        'gv': 'glv',
-        'ha': 'hau',
-        'he': 'heb',
-        'hi': 'hin',
-        'ho': 'hmo',
-        'hr': 'hrv',
-        'ht': 'hat',
-        'hu': 'hun',
-        'hy': 'hye',
-        'hz': 'her',
-        'ia': 'ina',
-        'id': 'ind',
-        'ie': 'ile',
-        'ig': 'ibo',
-        'ii': 'iii',
-        'ik': 'ipk',
-        'io': 'ido',
-        'is': 'isl',
-        'it': 'ita',
-        'iu': 'iku',
-        'ja': 'jpn',
-        'jv': 'jav',
-        'ka': 'kat',
-        'kg': 'kon',
-        'ki': 'kik',
-        'kj': 'kua',
-        'kk': 'kaz',
-        'kl': 'kal',
-        'km': 'khm',
-        'kn': 'kan',
-        'ko': 'kor',
-        'kr': 'kau',
-        'ks': 'kas',
-        'ku': 'kur',
-        'kv': 'kom',
-        'kw': 'cor',
-        'ky': 'kir',
-        'la': 'lat',
-        'lb': 'ltz',
-        'lg': 'lug',
-        'li': 'lim',
-        'ln': 'lin',
-        'lo': 'lao',
-        'lt': 'lit',
-        'lu': 'lub',
-        'lv': 'lav',
-        'mg': 'mlg',
-        'mh': 'mah',
-        'mi': 'mri',
-        'mk': 'mkd',
-        'ml': 'mal',
-        'mn': 'mon',
-        'mr': 'mar',
-        'ms': 'msa',
-        'mt': 'mlt',
-        'my': 'mya',
-        'na': 'nau',
-        'nb': 'nob',
-        'nd': 'nde',
-        'ne': 'nep',
-        'ng': 'ndo',
-        'nl': 'nld',
-        'nn': 'nno',
-        'no': 'nor',
-        'nr': 'nbl',
-        'nv': 'nav',
-        'ny': 'nya',
-        'oc': 'oci',
-        'oj': 'oji',
-        'om': 'orm',
-        'or': 'ori',
-        'os': 'oss',
-        'pa': 'pan',
-        'pi': 'pli',
-        'pl': 'pol',
-        'ps': 'pus',
-        'pt': 'por',
-        'qu': 'que',
-        'rm': 'roh',
-        'rn': 'run',
-        'ro': 'ron',
-        'ru': 'rus',
-        'rw': 'kin',
-        'sa': 'san',
-        'sc': 'srd',
-        'sd': 'snd',
-        'se': 'sme',
-        'sg': 'sag',
-        'si': 'sin',
-        'sk': 'slk',
-        'sl': 'slv',
-        'sm': 'smo',
-        'sn': 'sna',
-        'so': 'som',
-        'sq': 'sqi',
-        'sr': 'srp',
-        'ss': 'ssw',
-        'st': 'sot',
-        'su': 'sun',
-        'sv': 'swe',
-        'sw': 'swa',
-        'ta': 'tam',
-        'te': 'tel',
-        'tg': 'tgk',
-        'th': 'tha',
-        'ti': 'tir',
-        'tk': 'tuk',
-        'tl': 'tgl',
-        'tn': 'tsn',
-        'to': 'ton',
-        'tr': 'tur',
-        'ts': 'tso',
-        'tt': 'tat',
-        'tw': 'twi',
-        'ty': 'tah',
-        'ug': 'uig',
-        'uk': 'ukr',
-        'ur': 'urd',
-        'uz': 'uzb',
-        've': 'ven',
-        'vi': 'vie',
-        'vo': 'vol',
-        'wa': 'wln',
-        'wo': 'wol',
-        'xh': 'xho',
-        'yi': 'yid',
-        'yo': 'yor',
-        'za': 'zha',
-        'zh': 'zho',
-        'zu': 'zul',
-    }
-
-    @classmethod
-    def _conver_lang_code(cls, code):
-        """Convert language code from ISO 639-1 to ISO 639-2/T"""
-        return cls._lang_map.get(code[:2])
-
     def run(self, information):
         if information['ext'] not in ['mp4', 'mkv']:
             self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 or mkv files')
@@ -525,7 +333,7 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
             opts += ['-c:s', 'mov_text']
         for (i, lang) in enumerate(sub_langs):
             opts.extend(['-map', '%d:0' % (i + 1)])
-            lang_code = self._conver_lang_code(lang)
+            lang_code = ISO639Utils.short2long(lang)
             if lang_code is not None:
                 opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code])
```
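With the table moved out of the postprocessor, `FFmpegEmbedSubtitlePP` now calls `ISO639Utils.short2long` when building its `-metadata` options. A trimmed stand-in map shows the opts that loop produces; the full table lives in `youtube_dl.utils` (see the diff below):

```python
# Trimmed stand-in for ISO639Utils._lang_map (the real one has ~180 entries).
_lang_map = {'en': 'eng', 'de': 'deu', 'fr': 'fra'}

def short2long(code):
    return _lang_map.get(code[:2])

# The loop from FFmpegEmbedSubtitlePP.run, reduced to the opts it builds.
sub_langs = ['en', 'de']
opts = []
for i, lang in enumerate(sub_langs):
    opts.extend(['-map', '%d:0' % (i + 1)])
    lang_code = short2long(lang)
    if lang_code is not None:
        opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code])

print(opts)
# ['-map', '1:0', '-metadata:s:s:0', 'language=eng',
#  '-map', '2:0', '-metadata:s:s:1', 'language=deu']
```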
```diff
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 52d198fa3..a2746b2d1 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1841,7 +1841,10 @@ def srt_subtitles_timecode(seconds):
 
 
 def dfxp2srt(dfxp_data):
-    _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'})
+    _x = functools.partial(xpath_with_ns, ns_map={
+        'ttml': 'http://www.w3.org/ns/ttml',
+        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
+    })
 
     def parse_node(node):
         str_or_empty = functools.partial(str_or_none, default='')
@@ -1849,9 +1852,9 @@ def dfxp2srt(dfxp_data):
 
         out = str_or_empty(node.text)
 
         for child in node:
-            if child.tag in (_x('ttml:br'), 'br'):
+            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                 out += '\n' + str_or_empty(child.tail)
-            elif child.tag in (_x('ttml:span'), 'span'):
+            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
                 out += str_or_empty(parse_node(child))
             else:
                 out += str_or_empty(xml.etree.ElementTree.tostring(child))
@@ -1860,7 +1863,7 @@ def dfxp2srt(dfxp_data):
     dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
     out = []
-    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
+    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
 
     if not paras:
         raise ValueError('Invalid dfxp/TTML subtitle')
@@ -1879,6 +1882,208 @@ def dfxp2srt(dfxp_data):
     return ''.join(out)
 
 
+class ISO639Utils(object):
+    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
+    _lang_map = {
+        'aa': 'aar',
+        'ab': 'abk',
+        'ae': 'ave',
+        'af': 'afr',
+        'ak': 'aka',
+        'am': 'amh',
+        'an': 'arg',
+        'ar': 'ara',
+        'as': 'asm',
+        'av': 'ava',
+        'ay': 'aym',
+        'az': 'aze',
+        'ba': 'bak',
+        'be': 'bel',
+        'bg': 'bul',
+        'bh': 'bih',
+        'bi': 'bis',
+        'bm': 'bam',
+        'bn': 'ben',
+        'bo': 'bod',
+        'br': 'bre',
+        'bs': 'bos',
+        'ca': 'cat',
+        'ce': 'che',
+        'ch': 'cha',
+        'co': 'cos',
+        'cr': 'cre',
+        'cs': 'ces',
+        'cu': 'chu',
+        'cv': 'chv',
+        'cy': 'cym',
+        'da': 'dan',
+        'de': 'deu',
+        'dv': 'div',
+        'dz': 'dzo',
+        'ee': 'ewe',
+        'el': 'ell',
+        'en': 'eng',
+        'eo': 'epo',
+        'es': 'spa',
+        'et': 'est',
+        'eu': 'eus',
+        'fa': 'fas',
+        'ff': 'ful',
+        'fi': 'fin',
+        'fj': 'fij',
+        'fo': 'fao',
+        'fr': 'fra',
+        'fy': 'fry',
+        'ga': 'gle',
+        'gd': 'gla',
+        'gl': 'glg',
+        'gn': 'grn',
+        'gu': 'guj',
+        'gv': 'glv',
+        'ha': 'hau',
+        'he': 'heb',
+        'hi': 'hin',
+        'ho': 'hmo',
+        'hr': 'hrv',
+        'ht': 'hat',
+        'hu': 'hun',
+        'hy': 'hye',
+        'hz': 'her',
+        'ia': 'ina',
+        'id': 'ind',
+        'ie': 'ile',
+        'ig': 'ibo',
+        'ii': 'iii',
+        'ik': 'ipk',
+        'io': 'ido',
+        'is': 'isl',
+        'it': 'ita',
+        'iu': 'iku',
+        'ja': 'jpn',
+        'jv': 'jav',
+        'ka': 'kat',
+        'kg': 'kon',
+        'ki': 'kik',
+        'kj': 'kua',
+        'kk': 'kaz',
+        'kl': 'kal',
+        'km': 'khm',
+        'kn': 'kan',
+        'ko': 'kor',
+        'kr': 'kau',
+        'ks': 'kas',
+        'ku': 'kur',
+        'kv': 'kom',
+        'kw': 'cor',
+        'ky': 'kir',
+        'la': 'lat',
+        'lb': 'ltz',
+        'lg': 'lug',
+        'li': 'lim',
+        'ln': 'lin',
+        'lo': 'lao',
+        'lt': 'lit',
+        'lu': 'lub',
+        'lv': 'lav',
+        'mg': 'mlg',
+        'mh': 'mah',
+        'mi': 'mri',
+        'mk': 'mkd',
+        'ml': 'mal',
+        'mn': 'mon',
+        'mr': 'mar',
+        'ms': 'msa',
+        'mt': 'mlt',
+        'my': 'mya',
+        'na': 'nau',
+        'nb': 'nob',
+        'nd': 'nde',
+        'ne': 'nep',
+        'ng': 'ndo',
+        'nl': 'nld',
+        'nn': 'nno',
+        'no': 'nor',
+        'nr': 'nbl',
+        'nv': 'nav',
+        'ny': 'nya',
+        'oc': 'oci',
+        'oj': 'oji',
+        'om': 'orm',
+        'or': 'ori',
+        'os': 'oss',
+        'pa': 'pan',
+        'pi': 'pli',
+        'pl': 'pol',
+        'ps': 'pus',
+        'pt': 'por',
+        'qu': 'que',
+        'rm': 'roh',
+        'rn': 'run',
+        'ro': 'ron',
+        'ru': 'rus',
+        'rw': 'kin',
+        'sa': 'san',
+        'sc': 'srd',
+        'sd': 'snd',
+        'se': 'sme',
+        'sg': 'sag',
+        'si': 'sin',
+        'sk': 'slk',
+        'sl': 'slv',
+        'sm': 'smo',
+        'sn': 'sna',
+        'so': 'som',
+        'sq': 'sqi',
+        'sr': 'srp',
+        'ss': 'ssw',
+        'st': 'sot',
+        'su': 'sun',
+        'sv': 'swe',
+        'sw': 'swa',
+        'ta': 'tam',
+        'te': 'tel',
+        'tg': 'tgk',
+        'th': 'tha',
+        'ti': 'tir',
+        'tk': 'tuk',
+        'tl': 'tgl',
+        'tn': 'tsn',
+        'to': 'ton',
+        'tr': 'tur',
+        'ts': 'tso',
+        'tt': 'tat',
+        'tw': 'twi',
+        'ty': 'tah',
+        'ug': 'uig',
+        'uk': 'ukr',
+        'ur': 'urd',
+        'uz': 'uzb',
+        've': 'ven',
+        'vi': 'vie',
+        'vo': 'vol',
+        'wa': 'wln',
+        'wo': 'wol',
+        'xh': 'xho',
+        'yi': 'yid',
+        'yo': 'yor',
+        'za': 'zha',
+        'zh': 'zho',
+        'zu': 'zul',
+    }
+
+    @classmethod
+    def short2long(cls, code):
+        """Convert language code from ISO 639-1 to ISO 639-2/T"""
+        return cls._lang_map.get(code[:2])
+
+    @classmethod
+    def long2short(cls, code):
+        """Convert language code from ISO 639-2/T to ISO 639-1"""
+        for short_name, long_name in cls._lang_map.items():
+            if long_name == code:
+                return short_name
+
+
 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
     def __init__(self, proxies=None):
         # Set default handlers
```
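A quick check that the extra `ttaf1` namespace wired into `dfxp2srt` above really matches BBC-style TTML; the caption snippet is fabricated, but the namespace URI and the xpath expansion are the ones from the diff:

```python
import xml.etree.ElementTree as ET

TTAF1 = 'http://www.w3.org/2006/10/ttaf1'

# Fabricated BBC-style caption document in the 2006 TTAF namespace.
snippet = (
    '<tt xmlns="%s"><body><div>'
    '<p begin="0:00:01" end="0:00:02">Hello <span>world</span></p>'
    '</div></body></tt>' % TTAF1)

root = ET.fromstring(snippet)
# What _x('.//ttaf1:p') expands to via xpath_with_ns:
paras = root.findall('.//{%s}p' % TTAF1)
print(len(paras), paras[0].get('begin'))  # -> 1 0:00:01
```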
