diff options
| author | Sergey M․ <dstftw@gmail.com> | 2016-06-25 22:30:35 +0700 | 
|---|---|---|
| committer | Sergey M․ <dstftw@gmail.com> | 2016-06-25 23:19:18 +0700 | 
| commit | 46f59e89ea1e75bf2bd1657f0863a3e5e81f91ea (patch) | |
| tree | 6abbf9608ee0d0eb7b3a94939f744209c19765df | |
| parent | b4241e308e9b2d38d564833cb6c43c9fcc0fd280 (diff) | |
[utils] Add unified_timestamp
| -rw-r--r-- | test/test_utils.py | 21 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 150 | 
2 files changed, 113 insertions, 58 deletions
def test_unified_timestamps(self):
    """Check unified_timestamp against known (input, keyword args, result) triples."""
    cases = (
        ('December 21, 2010', {}, 1292889600),
        ('8/7/2009', {}, 1247011200),
        ('Dec 14, 2012', {}, 1355443200),
        ('2012/10/11 01:56:38 +0000', {}, 1349920598),
        ('1968 12 10', {}, -33436800),
        ('1968-12-10', {}, -33436800),
        ('28/01/2014 21:00:00 +0100', {}, 1390939200),
        ('11/26/2014 11:30:00 AM PST', {'day_first': False}, 1417001400),
        ('2/2/2015 6:47:40 PM', {'day_first': False}, 1422902860),
        ('Feb 14th 2016 5:45PM', {}, 1455471900),
        ('25-09-2014', {}, 1411603200),
        ('27.02.2016 17:30', {}, 1456594200),
        # Unparseable input must yield None rather than raise.
        ('UNKNOWN DATE FORMAT', {}, None),
    )
    for date_str, kwargs, expected in cases:
        self.assertEqual(unified_timestamp(date_str, **kwargs), expected)
# strptime() patterns that are unambiguous regardless of whether the day or
# the month comes first.  They are tried in order by the date parsing helpers.
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%b %d %Y',
    # English ordinal suffixes.  These use %I without %p because the callers
    # strip the AM/PM marker from the string before trying the formats.
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',  # "3rd" / "23rd" were previously unparseable
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
)

# Common formats plus the numeric ones read as day/month/year.
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# Common formats plus the numeric ones read as month/day/year.
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
def extract_timezone(date_str):
    """Split a trailing UTC designator or numeric UTC offset off date_str.

    A final 'Z', or a final '+HHMM' / '-HH:MM' style offset optionally
    preceded by one space, is recognized only when at least 8 other
    characters come before it, so that a short date such as '25-09-2014'
    is not misread as ending in an offset.

    Returns a (timezone, date_str) tuple: timezone is the offset as a
    datetime.timedelta (zero when none is found or for 'Z') and date_str
    is the input with the recognized suffix removed.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        return datetime.timedelta(), date_str

    # Cut at the start of the match instead of slicing len(tz) characters
    # off the end: '$' also matches just before a trailing newline, in which
    # case the suffix is not the very last thing in the string.
    date_str = date_str[:m.start('tz')]
    if not m.group('sign'):
        return datetime.timedelta(), date_str

    sign = 1 if m.group('sign') == '+' else -1
    timezone = datetime.timedelta(
        hours=sign * int(m.group('hours')),
        minutes=sign * int(m.group('minutes')))
    return timezone, date_str


def date_formats(day_first=True):
    """Return the strptime() formats to try, day-first or month-first."""
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string.

    date_str is matched against the formats from date_formats(day_first);
    if none fits, email.utils.parsedate_tz is tried as a last resort.
    A trailing numeric UTC offset is honoured, an AM/PM marker is applied to
    the parsed hour, and times without an offset are taken as UTC.

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    date_str = date_str.replace(',', ' ')

    # Detect the meridiem before it is stripped from the string below.
    is_pm = re.search(r'(?i)PM', date_str) is not None
    is_am = not is_pm and re.search(r'(?i)[0-9]\s*AM\b', date_str) is not None
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    def _to_timestamp(local_dt, utc_offset):
        # 12-hour clock: 12 PM is noon (no shift) and 12 AM is midnight.
        # An hour that is already past noon is left alone even if "PM" is
        # present.
        half_day = datetime.timedelta(hours=12)
        if is_pm and local_dt.hour < 12:
            local_dt += half_day
        elif is_am and local_dt.hour == 12:
            local_dt -= half_day
        return calendar.timegm((local_dt - utc_offset).timetuple())

    for expression in date_formats(day_first):
        try:
            parsed = datetime.datetime.strptime(date_str, expression)
        except ValueError:
            continue
        return _to_timestamp(parsed, timezone)

    # parsedate_tz returns a plain 10-tuple (or None), not a datetime:
    # items 0-5 are the date/time fields, item 9 the UTC offset in seconds.
    timetuple = email.utils.parsedate_tz(date_str)
    if not timetuple:
        return None
    try:
        parsed = datetime.datetime(*timetuple[:6])
    except ValueError:
        return None
    named_offset = datetime.timedelta(seconds=timetuple[9] or 0)
    return _to_timestamp(parsed, timezone + named_offset)
