1#!/usr/bin/env python
2# Copyright 2016 Google Inc.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#      http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15#
16################################################################################
17
18from __future__ import print_function
19import logging
20import os
21import re
22import sys
23import zipfile
24
25
# Configure the root logger to emit records of level INFO and above. The
# format string hard-codes the 'INFO: ' prefix, so it is printed in front of
# every message regardless of the record's actual level.
logging.basicConfig(level=logging.INFO, format='INFO: %(message)s')
# Captures (non-greedily) the text between 'codec_id_' and the first following
# '_fuzzer' in a fuzzer name; get_fuzzer_tags() applies it to the lower-cased
# name.
CODEC_NAME_REGEXP = re.compile(r'codec_id_(.+?)_fuzzer')
28
29
def get_fuzzer_tags(fuzzer_name):
  """Derive the sample-filtering tags for a fuzzer from its name.

  The name is lower-cased first. A 'sub' tag is added when the name contains
  'subtitle'. If CODEC_NAME_REGEXP matches, the captured codec name is split
  on '_' and each piece is cut at the first occurrence of 'video', 'audio',
  'subtitle' and 'text' (in that order); non-empty remainders are truncated to
  at most three characters and appended as tags.

  Args:
    fuzzer_name: name of the fuzzer binary.

  Returns:
    A list of lower-case tag strings (possibly empty, possibly with repeats).
  """
  lowered_name = fuzzer_name.lower()

  # Subtitle samples are stored in a 'sub' directory, so that tag has to be
  # added by hand.
  tags = ['sub'] if 'subtitle' in lowered_name else []

  match = CODEC_NAME_REGEXP.search(lowered_name)
  if not match:
    return tags

  # Compound codec names are handled piece by piece.
  for piece in match.group(1).split('_'):
    # Drop generic words found in names like 'mpeg1video' or 'msvideo1',
    # together with whatever follows them.
    for generic_word in ('video', 'audio', 'subtitle', 'text'):
      piece = piece.split(generic_word)[0]
    if not piece:
      continue
    # Codec names may carry trailing characters ('VP6F', 'FLV1', 'JPEGLS'),
    # so only the first three characters are kept; shorter pieces are
    # unaffected by the slice.
    tags.append(piece[:3])

  return tags
57
58
def parse_corpus(corpus_directory):
  """Collect the paths of all corpus files under the given directory.

  The directory tree is walked recursively with os.walk(). Files whose name
  contains 'md5sum' are checksum files and are left out.

  Args:
    corpus_directory: root directory of the seed corpus.

  Returns:
    A list of file paths, each built by joining the containing directory
    (as reported by os.walk) with the file name.
  """
  corpus_paths = [
      os.path.join(parent, name)
      for parent, _, names in os.walk(corpus_directory)
      for name in names
      # Checksum files are useless as corpus samples.
      if 'md5sum' not in name
  ]

  logging.info('Parsed %d corpus files from %s', len(corpus_paths),
               corpus_directory)
  return corpus_paths
73
74
def parse_fuzzers(fuzzers_directory):
  """List the ffmpeg fuzzers located directly in the given directory.

  Only the immediate entries of the directory are inspected (os.listdir is
  not recursive). An entry is kept when its name starts with 'ffmpeg_' and
  ends with '_fuzzer'.

  Args:
    fuzzers_directory: directory containing the fuzzer binaries.

  Returns:
    A list of paths, each the directory joined with a matching entry name.
  """
  fuzzer_paths = [
      os.path.join(fuzzers_directory, entry)
      for entry in os.listdir(fuzzers_directory)
      # Keep only entries that look like ffmpeg fuzzers.
      if entry.startswith('ffmpeg_') and entry.endswith('_fuzzer')
  ]

  logging.info('Parsed %d fuzzers from %s', len(fuzzer_paths),
               fuzzers_directory)
  return fuzzer_paths
88
89
def _find_files_with_tags(sanitized_corpus, tags):
  """Return the set of original paths whose sanitized form contains any tag."""
  return set(
      path for path, sanitized in sanitized_corpus
      if any(tag in sanitized for tag in tags))


def zip_relevant_corpus(corpus_files, fuzzers):
  """Find relevant corpus files and archive them for every fuzzer given.

  A corpus file is relevant for a fuzzer when any tag returned by
  get_fuzzer_tags() occurs in the file's lower-cased path with 'ffmpeg'
  removed. If no file at all matches the full tags, the search is repeated
  with the last character of every tag dropped. The matching files are
  written to '<fuzzer>_seed_corpus.zip' next to the fuzzer; no archive is
  created for a fuzzer without relevant files.

  Args:
    corpus_files: iterable of corpus file paths.
    fuzzers: iterable of fuzzer paths.
  """
  # The sanitized form does not depend on the fuzzer, so compute it once.
  # Lower-case BEFORE removing 'ffmpeg': otherwise a differently-cased
  # spelling such as 'FFmpeg' survives and every path containing it matches
  # the 'mpe' tag of the MPEG codecs.
  sanitized_corpus = [
      (path, path.lower().replace('ffmpeg', '')) for path in corpus_files
  ]

  for fuzzer in fuzzers:
    fuzzer_name = os.path.basename(fuzzer)
    fuzzer_tags = get_fuzzer_tags(fuzzer_name)

    relevant_corpus_files = _find_files_with_tags(sanitized_corpus,
                                                  fuzzer_tags)

    if not relevant_corpus_files:
      # Strip last symbol from tags only after the whole corpus has been
      # searched without success, so the result does not depend on the order
      # of corpus_files. It helps for such codecs as 'RV40' ('RV4' -> 'RV')
      # or 'PCX' (-> 'PC'). One-character tags are skipped: they would shrink
      # to '', which is a substring of every path.
      shortened_tags = [tag[:-1] for tag in fuzzer_tags if len(tag) > 1]
      relevant_corpus_files = _find_files_with_tags(sanitized_corpus,
                                                    shortened_tags)

    logging.info('Found %d relevant samples for %s',
                 len(relevant_corpus_files), fuzzer_name)

    if not relevant_corpus_files:
      continue

    zip_archive_name = fuzzer + "_seed_corpus.zip"
    with zipfile.ZipFile(zip_archive_name, 'w') as archive:
      # Sorted so that the member order of the archive is reproducible.
      for filename in sorted(relevant_corpus_files):
        archive.write(filename)
122
123
def main():
  """Group the seed corpus by fuzzer using the two command-line arguments.

  Expects the seed corpus directory and the fuzzers directory as the first
  and second argument. With fewer than two arguments a usage line is printed
  and the process exits with status 1.
  """
  arguments = sys.argv[1:]
  if len(arguments) < 2:
    print('Usage: %s <seed_corpus_directory> <fuzzers_directory>' % __file__)
    sys.exit(1)

  # Any additional arguments are ignored.
  seed_corpus_directory, fuzzers_directory = arguments[:2]

  # Arguments are evaluated left to right: corpus first, then fuzzers.
  zip_relevant_corpus(parse_corpus(seed_corpus_directory),
                      parse_fuzzers(fuzzers_directory))
135
136
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
  # main() has no return statement, so a run that reaches this point passes
  # None to sys.exit(), i.e. exit status 0.
  sys.exit(main())
139