#!/usr/bin/env python
#
# Copyright (C) 2018 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Merge multiple CSV files, possibly with different columns.

Inputs are either a list of CSV file paths or a single ZIP archive containing
CSV files (not both). Unless --header is given, the output header is the
sorted union of all input column names. Fields use ',' as the delimiter and
'|' as the quote character, for both reading and writing.
"""

import argparse
import contextlib
import csv
import io

from zipfile import ZipFile

args_parser = argparse.ArgumentParser(description='Merge given CSV files into a single one.')
args_parser.add_argument('--header', help='Comma separated field names; '
                                          'if missing determines the header from input files.')
args_parser.add_argument('--zip_input', help='ZIP archive with all CSV files to merge.')
args_parser.add_argument('--output', help='Output file for merged CSV.',
                         default='-', type=argparse.FileType('w'))
args_parser.add_argument('files', nargs=argparse.REMAINDER)
args = args_parser.parse_args()

# ZIP entries with one of these suffixes are merged. '.csv' matches what the
# --zip_input help text promises; '.uau' is kept so that archives accepted by
# the previous filter keep working.
ZIP_ENTRY_SUFFIXES = ('.csv', '.uau')


def dict_reader(input):
    """Return a csv.DictReader over `input` using ',' delimiter and '|' quotes.

    Args:
        input: An iterable of text lines, typically an open text file. Per the
            csv module documentation it should be opened with newline=''.

    Returns:
        A csv.DictReader whose field names come from the first row of `input`.
    """
    return csv.DictReader(input, delimiter=',', quotechar='|')


if args.zip_input and args.files:
    raise ValueError('Expecting either a single ZIP with CSV files'
                     ' or a list of CSV files as input; not both.')

# Every input stream is registered on the stack so that it stays open while
# the (lazy) readers are consumed below and is closed once merging is done,
# even if an error occurs part-way through.
with contextlib.ExitStack() as open_inputs:
    csv_readers = []
    if args.files:
        for path in args.files:
            source = open_inputs.enter_context(open(path, 'r', newline=''))
            csv_readers.append(dict_reader(source))
    elif args.zip_input:
        archive = open_inputs.enter_context(ZipFile(args.zip_input))
        for entry in archive.namelist():
            if entry.endswith(ZIP_ENTRY_SUFFIXES):
                source = open_inputs.enter_context(
                    io.TextIOWrapper(archive.open(entry, 'r'), newline=''))
                csv_readers.append(dict_reader(source))

    headers = set()
    if args.header:
        fieldnames = args.header.split(',')
    else:
        # Build union of all columns from source files. An empty input has no
        # header row, so its reader reports fieldnames as None; skip those.
        for reader in csv_readers:
            headers.update(reader.fieldnames or ())
        fieldnames = sorted(headers)

    # Concatenate all files to output. Columns missing from a row are written
    # as empty fields; a row with a column not listed in `fieldnames` makes
    # DictWriter raise ValueError (its default extrasaction).
    writer = csv.DictWriter(args.output, delimiter=',', quotechar='|',
                            quoting=csv.QUOTE_MINIMAL, dialect='unix',
                            fieldnames=fieldnames)
    writer.writeheader()
    for reader in csv_readers:
        writer.writerows(reader)