diff options
| author | Philipp Hagemeister <phihag@phihag.de> | 2015-02-10 03:32:21 +0100 | 
|---|---|---|
| committer | Philipp Hagemeister <phihag@phihag.de> | 2015-02-10 03:32:24 +0100 | 
| commit | 347de4931cb3e496fb7b1dfb0314c213f44cce6b (patch) | |
| tree | 778a6d8a67b4ba6e1ecff2d68ee02d44cefb105a | |
| parent | 88296505131f4b91ff91eaa0af34318664d892c9 (diff) | |
[YoutubeDL] Add generic video filtering (Fixes #4916)
This functionality is intended to eventually encompass the current format filtering.
| -rw-r--r-- | test/test_utils.py | 32 | ||||
| -rwxr-xr-x | youtube_dl/YoutubeDL.py | 14 | ||||
| -rw-r--r-- | youtube_dl/__init__.py | 7 | ||||
| -rw-r--r-- | youtube_dl/options.py | 19 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 77 | 
5 files changed, 147 insertions, 2 deletions
diff --git a/test/test_utils.py b/test/test_utils.py index 80c765bc4..1c29d0889 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -53,6 +53,7 @@ from youtube_dl.utils import (      version_tuple,      xpath_with_ns,      render_table, +    match_str,  ) @@ -459,6 +460,37 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4')              '123  4\n'              '9999 51') +    def test_match_str(self): +        self.assertRaises(ValueError, match_str, 'xy>foobar', {}) +        self.assertFalse(match_str('xy', {'x': 1200})) +        self.assertTrue(match_str('!xy', {'x': 1200})) +        self.assertTrue(match_str('x', {'x': 1200})) +        self.assertFalse(match_str('!x', {'x': 1200})) +        self.assertTrue(match_str('x', {'x': 0})) +        self.assertFalse(match_str('x>0', {'x': 0})) +        self.assertFalse(match_str('x>0', {})) +        self.assertTrue(match_str('x>?0', {})) +        self.assertTrue(match_str('x>1K', {'x': 1200})) +        self.assertFalse(match_str('x>2K', {'x': 1200})) +        self.assertTrue(match_str('x>=1200 & x < 1300', {'x': 1200})) +        self.assertFalse(match_str('x>=1100 & x < 1200', {'x': 1200})) +        self.assertFalse(match_str('y=a212', {'y': 'foobar42'})) +        self.assertTrue(match_str('y=foobar42', {'y': 'foobar42'})) +        self.assertFalse(match_str('y!=foobar42', {'y': 'foobar42'})) +        self.assertTrue(match_str('y!=foobar2', {'y': 'foobar42'})) +        self.assertFalse(match_str( +            'like_count > 100 & dislike_count <? 50 & description', +            {'like_count': 90, 'description': 'foo'})) +        self.assertTrue(match_str( +            'like_count > 100 & dislike_count <? 50 & description', +            {'like_count': 190, 'description': 'foo'})) +        self.assertFalse(match_str( +            'like_count > 100 & dislike_count <? 50 & description', +            {'like_count': 190, 'dislike_count': 60, 'description': 'foo'})) +        self.assertFalse(match_str( +            'like_count > 100 & dislike_count <? 50 & description', +            {'like_count': 190, 'dislike_count': 10})) +  if __name__ == '__main__':      unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 3975ae0bc..dda222fee 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -228,6 +228,11 @@ class YoutubeDL(object):      external_downloader:  Executable of the external downloader to call.      listformats:       Print an overview of available video formats and exit.      list_thumbnails:   Print a table of all thumbnails and exit. +    match_filter:      A function that gets called with the info_dict of +                       every video. +                       If it returns a message, the video is ignored. +                       If it returns None, the video is downloaded. +                       match_filter_func in utils.py is one example for this.      The following parameters are not used by YoutubeDL itself, they are used by @@ -583,9 +588,16 @@ class YoutubeDL(object):              if max_views is not None and view_count > max_views:                  return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)          if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')): -            return 'Skipping "%s" because it is age restricted' % title +            return 'Skipping "%s" because it is age restricted' % video_title          if self.in_download_archive(info_dict):              return '%s has already been recorded in archive' % video_title + +        match_filter = self.params.get('match_filter') +        if match_filter is not None: +            ret = match_filter(info_dict) +            if ret is not None: +                return ret +          return None      @staticmethod diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index e90679ff9..eefca0fe4 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -23,9 +23,10 @@ from .compat import (  )  from .utils import (      DateRange, -    DEFAULT_OUTTMPL,      decodeOption, +    DEFAULT_OUTTMPL,      DownloadError, +    match_filter_func,      MaxDownloadsReached,      preferredencoding,      read_batch_urls, @@ -247,6 +248,9 @@ def _real_main(argv=None):              xattr  # Confuse flake8          except ImportError:              parser.error('setting filesize xattr requested but python-xattr is not available') +    match_filter = ( +        None if opts.match_filter is None +        else match_filter_func(opts.match_filter))      ydl_opts = {          'usenetrc': opts.usenetrc, @@ -344,6 +348,7 @@ def _real_main(argv=None):          'list_thumbnails': opts.list_thumbnails,          'playlist_items': opts.playlist_items,          'xattr_set_filesize': opts.xattr_set_filesize, +        'match_filter': match_filter,      }      with YoutubeDL(ydl_opts) as ydl: diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 9d9195b60..f64aa5b85 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -245,6 +245,25 @@ def parseOpts(overrideArguments=None):          metavar='COUNT', dest='max_views', default=None, type=int,          help='Do not download any videos with more than COUNT views')      selection.add_option( +        '--match-filter', +        metavar='FILTER', dest='match_filter', default=None, +        help=( +            '(Experimental) Generic video filter. ' +            'Specify any key (see help for -o for a list of available keys) to' +            ' match if the key is present, ' +            '!key to check if the key is not present,' +            'key > NUMBER (like "comment_count > 12", also works with ' +            '>=, <, <=, !=, =) to compare against a number, and ' +            '& to require multiple matches. ' +            'Values which are not known are excluded unless you' +            ' put a question mark (?) after the operator.' +            'For example, to only match videos that have been liked more than ' +            '100 times and disliked less than 50 times (or the dislike ' +            'functionality is not available at the given service), but who ' +            'also have a description, use  --match-filter ' +            '"like_count > 100 & dislike_count <? 50 & description" .' +        )) +    selection.add_option(          '--no-playlist',          action='store_true', dest='noplaylist', default=False,          help='If the URL refers to a video and a playlist, download only the video.') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 8f5463f1c..03566d223 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -17,6 +17,7 @@ import io  import json  import locale  import math +import operator  import os  import pipes  import platform @@ -1678,3 +1679,79 @@ def render_table(header_row, data):      max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]      format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'      return '\n'.join(format_str % tuple(row) for row in table) + + +def _match_one(filter_part, dct): +    COMPARISON_OPERATORS = { +        '<': operator.lt, +        '<=': operator.le, +        '>': operator.gt, +        '>=': operator.ge, +        '=': operator.eq, +        '!=': operator.ne, +    } +    operator_rex = re.compile(r'''(?x)\s* +        (?P<key>[a-z_]+) +        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s* +        (?: +            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)| +            (?P<strval>(?![0-9.])[a-z0-9A-Z]*) +        ) +        \s*$ +        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys()))) +    m = operator_rex.search(filter_part) +    if m: +        op = COMPARISON_OPERATORS[m.group('op')] +        if m.group('strval') is not None: +            if m.group('op') not in ('=', '!='): +                raise ValueError( +                    'Operator %s does not support string values!' % m.group('op')) +            comparison_value = m.group('strval') +        else: +            try: +                comparison_value = int(m.group('intval')) +            except ValueError: +                comparison_value = parse_filesize(m.group('intval')) +                if comparison_value is None: +                    comparison_value = parse_filesize(m.group('intval') + 'B') +                if comparison_value is None: +                    raise ValueError( +                        'Invalid integer value %r in filter part %r' % ( +                            m.group('intval'), filter_part)) +        actual_value = dct.get(m.group('key')) +        if actual_value is None: +            return m.group('none_inclusive') +        return op(actual_value, comparison_value) + +    UNARY_OPERATORS = { +        '': lambda v: v is not None, +        '!': lambda v: v is None, +    } +    operator_rex = re.compile(r'''(?x)\s* +        (?P<op>%s)\s*(?P<key>[a-z_]+) +        \s*$ +        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys()))) +    m = operator_rex.search(filter_part) +    if m: +        op = UNARY_OPERATORS[m.group('op')] +        actual_value = dct.get(m.group('key')) +        return op(actual_value) + +    raise ValueError('Invalid filter part %r' % filter_part) + + +def match_str(filter_str, dct): +    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """ + +    return all( +        _match_one(filter_part, dct) for filter_part in filter_str.split('&')) + + +def match_filter_func(filter_str): +    def _match_func(info_dict): +        if match_str(filter_str, info_dict): +            return None +        else: +            video_title = info_dict.get('title', info_dict.get('id', 'video')) +            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str) +    return _match_func  | 
