diff options
Diffstat (limited to 'youtube_dl/utils.py')
| -rw-r--r-- | youtube_dl/utils.py | 481 | 
1 files changed, 475 insertions, 6 deletions
| diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 52d198fa3..942f76d24 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -62,6 +62,8 @@ std_headers = {  } +NO_DEFAULT = object() +  ENGLISH_MONTH_NAMES = [      'January', 'February', 'March', 'April', 'May', 'June',      'July', 'August', 'September', 'October', 'November', 'December'] @@ -171,13 +173,15 @@ def xpath_with_ns(path, ns_map):      return '/'.join(replaced) -def xpath_text(node, xpath, name=None, fatal=False): +def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):      if sys.version_info < (2, 7):  # Crazy 2.6          xpath = xpath.encode('ascii')      n = node.find(xpath)      if n is None or n.text is None: -        if fatal: +        if default is not NO_DEFAULT: +            return default +        elif fatal:              name = xpath if name is None else name              raise ExtractorError('Could not find XML element %s' % name)          else: @@ -1841,7 +1845,10 @@ def srt_subtitles_timecode(seconds):  def dfxp2srt(dfxp_data): -    _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'}) +    _x = functools.partial(xpath_with_ns, ns_map={ +        'ttml': 'http://www.w3.org/ns/ttml', +        'ttaf1': 'http://www.w3.org/2006/10/ttaf1', +    })      def parse_node(node):          str_or_empty = functools.partial(str_or_none, default='') @@ -1849,9 +1856,9 @@ def dfxp2srt(dfxp_data):          out = str_or_empty(node.text)          for child in node: -            if child.tag in (_x('ttml:br'), 'br'): +            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):                  out += '\n' + str_or_empty(child.tail) -            elif child.tag in (_x('ttml:span'), 'span'): +            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):                  out += str_or_empty(parse_node(child))              else:                  out += str_or_empty(xml.etree.ElementTree.tostring(child)) @@ -1860,7 +1867,7 @@ def dfxp2srt(dfxp_data):      dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))      out = [] -    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p') +    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')      if not paras:          raise ValueError('Invalid dfxp/TTML subtitle') @@ -1879,6 +1886,468 @@ def dfxp2srt(dfxp_data):      return ''.join(out) +class ISO639Utils(object): +    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt +    _lang_map = { +        'aa': 'aar', +        'ab': 'abk', +        'ae': 'ave', +        'af': 'afr', +        'ak': 'aka', +        'am': 'amh', +        'an': 'arg', +        'ar': 'ara', +        'as': 'asm', +        'av': 'ava', +        'ay': 'aym', +        'az': 'aze', +        'ba': 'bak', +        'be': 'bel', +        'bg': 'bul', +        'bh': 'bih', +        'bi': 'bis', +        'bm': 'bam', +        'bn': 'ben', +        'bo': 'bod', +        'br': 'bre', +        'bs': 'bos', +        'ca': 'cat', +        'ce': 'che', +        'ch': 'cha', +        'co': 'cos', +        'cr': 'cre', +        'cs': 'ces', +        'cu': 'chu', +        'cv': 'chv', +        'cy': 'cym', +        'da': 'dan', +        'de': 'deu', +        'dv': 'div', +        'dz': 'dzo', +        'ee': 'ewe', +        'el': 'ell', +        'en': 'eng', +        'eo': 'epo', +        'es': 'spa', +        'et': 'est', +        'eu': 'eus', +        'fa': 'fas', +        'ff': 'ful', +        'fi': 'fin', +        'fj': 'fij', +        'fo': 'fao', +        'fr': 'fra', +        'fy': 'fry', +        'ga': 'gle', +        'gd': 'gla', +        'gl': 'glg', +        'gn': 'grn', +        'gu': 'guj', +        'gv': 'glv', +        'ha': 'hau', +        'he': 'heb', +        'hi': 'hin', +        'ho': 'hmo', +        'hr': 'hrv', +        'ht': 'hat', +        'hu': 'hun', +        'hy': 'hye', +        'hz': 'her', +        'ia': 'ina', +        'id': 'ind', +        'ie': 'ile', +        'ig': 'ibo', +        'ii': 'iii', +        'ik': 'ipk', +        'io': 'ido', +        'is': 'isl', +        'it': 'ita', +        'iu': 'iku', +        'ja': 'jpn', +        'jv': 'jav', +        'ka': 'kat', +        'kg': 'kon', +        'ki': 'kik', +        'kj': 'kua', +        'kk': 'kaz', +        'kl': 'kal', +        'km': 'khm', +        'kn': 'kan', +        'ko': 'kor', +        'kr': 'kau', +        'ks': 'kas', +        'ku': 'kur', +        'kv': 'kom', +        'kw': 'cor', +        'ky': 'kir', +        'la': 'lat', +        'lb': 'ltz', +        'lg': 'lug', +        'li': 'lim', +        'ln': 'lin', +        'lo': 'lao', +        'lt': 'lit', +        'lu': 'lub', +        'lv': 'lav', +        'mg': 'mlg', +        'mh': 'mah', +        'mi': 'mri', +        'mk': 'mkd', +        'ml': 'mal', +        'mn': 'mon', +        'mr': 'mar', +        'ms': 'msa', +        'mt': 'mlt', +        'my': 'mya', +        'na': 'nau', +        'nb': 'nob', +        'nd': 'nde', +        'ne': 'nep', +        'ng': 'ndo', +        'nl': 'nld', +        'nn': 'nno', +        'no': 'nor', +        'nr': 'nbl', +        'nv': 'nav', +        'ny': 'nya', +        'oc': 'oci', +        'oj': 'oji', +        'om': 'orm', +        'or': 'ori', +        'os': 'oss', +        'pa': 'pan', +        'pi': 'pli', +        'pl': 'pol', +        'ps': 'pus', +        'pt': 'por', +        'qu': 'que', +        'rm': 'roh', +        'rn': 'run', +        'ro': 'ron', +        'ru': 'rus', +        'rw': 'kin', +        'sa': 'san', +        'sc': 'srd', +        'sd': 'snd', +        'se': 'sme', +        'sg': 'sag', +        'si': 'sin', +        'sk': 'slk', +        'sl': 'slv', +        'sm': 'smo', +        'sn': 'sna', +        'so': 'som', +        'sq': 'sqi', +        'sr': 'srp', +        'ss': 'ssw', +        'st': 'sot', +        'su': 'sun', +        'sv': 'swe', +        'sw': 'swa', +        'ta': 'tam', +        'te': 'tel', +        'tg': 'tgk', +        'th': 'tha', +        'ti': 'tir', +        'tk': 'tuk', +        'tl': 'tgl', +        'tn': 'tsn', +        'to': 'ton', +        'tr': 'tur', +        'ts': 'tso', +        'tt': 'tat', +        'tw': 'twi', +        'ty': 'tah', +        'ug': 'uig', +        'uk': 'ukr', +        'ur': 'urd', +        'uz': 'uzb', +        've': 'ven', +        'vi': 'vie', +        'vo': 'vol', +        'wa': 'wln', +        'wo': 'wol', +        'xh': 'xho', +        'yi': 'yid', +        'yo': 'yor', +        'za': 'zha', +        'zh': 'zho', +        'zu': 'zul', +    } + +    @classmethod +    def short2long(cls, code): +        """Convert language code from ISO 639-1 to ISO 639-2/T""" +        return cls._lang_map.get(code[:2]) + +    @classmethod +    def long2short(cls, code): +        """Convert language code from ISO 639-2/T to ISO 639-1""" +        for short_name, long_name in cls._lang_map.items(): +            if long_name == code: +                return short_name + + +class ISO3166Utils(object): +    # From http://data.okfn.org/data/core/country-list +    _country_map = { +        'AF': 'Afghanistan', +        'AX': 'Åland Islands', +        'AL': 'Albania', +        'DZ': 'Algeria', +        'AS': 'American Samoa', +        'AD': 'Andorra', +        'AO': 'Angola', +        'AI': 'Anguilla', +        'AQ': 'Antarctica', +        'AG': 'Antigua and Barbuda', +        'AR': 'Argentina', +        'AM': 'Armenia', +        'AW': 'Aruba', +        'AU': 'Australia', +        'AT': 'Austria', +        'AZ': 'Azerbaijan', +        'BS': 'Bahamas', +        'BH': 'Bahrain', +        'BD': 'Bangladesh', +        'BB': 'Barbados', +        'BY': 'Belarus', +        'BE': 'Belgium', +        'BZ': 'Belize', +        'BJ': 'Benin', +        'BM': 'Bermuda', +        'BT': 'Bhutan', +        'BO': 'Bolivia, Plurinational State of', +        'BQ': 'Bonaire, Sint Eustatius and Saba', +        'BA': 'Bosnia and Herzegovina', +        'BW': 'Botswana', +        'BV': 'Bouvet Island', +        'BR': 'Brazil', +        'IO': 'British Indian Ocean Territory', +        'BN': 'Brunei Darussalam', +        'BG': 'Bulgaria', +        'BF': 'Burkina Faso', +        'BI': 'Burundi', +        'KH': 'Cambodia', +        'CM': 'Cameroon', +        'CA': 'Canada', +        'CV': 'Cape Verde', +        'KY': 'Cayman Islands', +        'CF': 'Central African Republic', +        'TD': 'Chad', +        'CL': 'Chile', +        'CN': 'China', +        'CX': 'Christmas Island', +        'CC': 'Cocos (Keeling) Islands', +        'CO': 'Colombia', +        'KM': 'Comoros', +        'CG': 'Congo', +        'CD': 'Congo, the Democratic Republic of the', +        'CK': 'Cook Islands', +        'CR': 'Costa Rica', +        'CI': 'Côte d\'Ivoire', +        'HR': 'Croatia', +        'CU': 'Cuba', +        'CW': 'Curaçao', +        'CY': 'Cyprus', +        'CZ': 'Czech Republic', +        'DK': 'Denmark', +        'DJ': 'Djibouti', +        'DM': 'Dominica', +        'DO': 'Dominican Republic', +        'EC': 'Ecuador', +        'EG': 'Egypt', +        'SV': 'El Salvador', +        'GQ': 'Equatorial Guinea', +        'ER': 'Eritrea', +        'EE': 'Estonia', +        'ET': 'Ethiopia', +        'FK': 'Falkland Islands (Malvinas)', +        'FO': 'Faroe Islands', +        'FJ': 'Fiji', +        'FI': 'Finland', +        'FR': 'France', +        'GF': 'French Guiana', +        'PF': 'French Polynesia', +        'TF': 'French Southern Territories', +        'GA': 'Gabon', +        'GM': 'Gambia', +        'GE': 'Georgia', +        'DE': 'Germany', +        'GH': 'Ghana', +        'GI': 'Gibraltar', +        'GR': 'Greece', +        'GL': 'Greenland', +        'GD': 'Grenada', +        'GP': 'Guadeloupe', +        'GU': 'Guam', +        'GT': 'Guatemala', +        'GG': 'Guernsey', +        'GN': 'Guinea', +        'GW': 'Guinea-Bissau', +        'GY': 'Guyana', +        'HT': 'Haiti', +        'HM': 'Heard Island and McDonald Islands', +        'VA': 'Holy See (Vatican City State)', +        'HN': 'Honduras', +        'HK': 'Hong Kong', +        'HU': 'Hungary', +        'IS': 'Iceland', +        'IN': 'India', +        'ID': 'Indonesia', +        'IR': 'Iran, Islamic Republic of', +        'IQ': 'Iraq', +        'IE': 'Ireland', +        'IM': 'Isle of Man', +        'IL': 'Israel', +        'IT': 'Italy', +        'JM': 'Jamaica', +        'JP': 'Japan', +        'JE': 'Jersey', +        'JO': 'Jordan', +        'KZ': 'Kazakhstan', +        'KE': 'Kenya', +        'KI': 'Kiribati', +        'KP': 'Korea, Democratic People\'s Republic of', +        'KR': 'Korea, Republic of', +        'KW': 'Kuwait', +        'KG': 'Kyrgyzstan', +        'LA': 'Lao People\'s Democratic Republic', +        'LV': 'Latvia', +        'LB': 'Lebanon', +        'LS': 'Lesotho', +        'LR': 'Liberia', +        'LY': 'Libya', +        'LI': 'Liechtenstein', +        'LT': 'Lithuania', +        'LU': 'Luxembourg', +        'MO': 'Macao', +        'MK': 'Macedonia, the Former Yugoslav Republic of', +        'MG': 'Madagascar', +        'MW': 'Malawi', +        'MY': 'Malaysia', +        'MV': 'Maldives', +        'ML': 'Mali', +        'MT': 'Malta', +        'MH': 'Marshall Islands', +        'MQ': 'Martinique', +        'MR': 'Mauritania', +        'MU': 'Mauritius', +        'YT': 'Mayotte', +        'MX': 'Mexico', +        'FM': 'Micronesia, Federated States of', +        'MD': 'Moldova, Republic of', +        'MC': 'Monaco', +        'MN': 'Mongolia', +        'ME': 'Montenegro', +        'MS': 'Montserrat', +        'MA': 'Morocco', +        'MZ': 'Mozambique', +        'MM': 'Myanmar', +        'NA': 'Namibia', +        'NR': 'Nauru', +        'NP': 'Nepal', +        'NL': 'Netherlands', +        'NC': 'New Caledonia', +        'NZ': 'New Zealand', +        'NI': 'Nicaragua', +        'NE': 'Niger', +        'NG': 'Nigeria', +        'NU': 'Niue', +        'NF': 'Norfolk Island', +        'MP': 'Northern Mariana Islands', +        'NO': 'Norway', +        'OM': 'Oman', +        'PK': 'Pakistan', +        'PW': 'Palau', +        'PS': 'Palestine, State of', +        'PA': 'Panama', +        'PG': 'Papua New Guinea', +        'PY': 'Paraguay', +        'PE': 'Peru', +        'PH': 'Philippines', +        'PN': 'Pitcairn', +        'PL': 'Poland', +        'PT': 'Portugal', +        'PR': 'Puerto Rico', +        'QA': 'Qatar', +        'RE': 'Réunion', +        'RO': 'Romania', +        'RU': 'Russian Federation', +        'RW': 'Rwanda', +        'BL': 'Saint Barthélemy', +        'SH': 'Saint Helena, Ascension and Tristan da Cunha', +        'KN': 'Saint Kitts and Nevis', +        'LC': 'Saint Lucia', +        'MF': 'Saint Martin (French part)', +        'PM': 'Saint Pierre and Miquelon', +        'VC': 'Saint Vincent and the Grenadines', +        'WS': 'Samoa', +        'SM': 'San Marino', +        'ST': 'Sao Tome and Principe', +        'SA': 'Saudi Arabia', +        'SN': 'Senegal', +        'RS': 'Serbia', +        'SC': 'Seychelles', +        'SL': 'Sierra Leone', +        'SG': 'Singapore', +        'SX': 'Sint Maarten (Dutch part)', +        'SK': 'Slovakia', +        'SI': 'Slovenia', +        'SB': 'Solomon Islands', +        'SO': 'Somalia', +        'ZA': 'South Africa', +        'GS': 'South Georgia and the South Sandwich Islands', +        'SS': 'South Sudan', +        'ES': 'Spain', +        'LK': 'Sri Lanka', +        'SD': 'Sudan', +        'SR': 'Suriname', +        'SJ': 'Svalbard and Jan Mayen', +        'SZ': 'Swaziland', +        'SE': 'Sweden', +        'CH': 'Switzerland', +        'SY': 'Syrian Arab Republic', +        'TW': 'Taiwan, Province of China', +        'TJ': 'Tajikistan', +        'TZ': 'Tanzania, United Republic of', +        'TH': 'Thailand', +        'TL': 'Timor-Leste', +        'TG': 'Togo', +        'TK': 'Tokelau', +        'TO': 'Tonga', +        'TT': 'Trinidad and Tobago', +        'TN': 'Tunisia', +        'TR': 'Turkey', +        'TM': 'Turkmenistan', +        'TC': 'Turks and Caicos Islands', +        'TV': 'Tuvalu', +        'UG': 'Uganda', +        'UA': 'Ukraine', +        'AE': 'United Arab Emirates', +        'GB': 'United Kingdom', +        'US': 'United States', +        'UM': 'United States Minor Outlying Islands', +        'UY': 'Uruguay', +        'UZ': 'Uzbekistan', +        'VU': 'Vanuatu', +        'VE': 'Venezuela, Bolivarian Republic of', +        'VN': 'Viet Nam', +        'VG': 'Virgin Islands, British', +        'VI': 'Virgin Islands, U.S.', +        'WF': 'Wallis and Futuna', +        'EH': 'Western Sahara', +        'YE': 'Yemen', +        'ZM': 'Zambia', +        'ZW': 'Zimbabwe', +    } + +    @classmethod +    def short2full(cls, code): +        """Convert an ISO 3166-2 country code to the corresponding full name""" +        return cls._country_map.get(code.upper()) + +  class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):      def __init__(self, proxies=None):          # Set default handlers | 
