#!/usr/bin/env python
# Copyright (C) 2017 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

""" Mirrors a Gerrit repo into GitHub, turning CLs into individual branches.

This script does a bit of git black magic. It does mainly two things:
1) Mirrors all the branches (refs/heads/foo) from Gerrit to GitHub as-is, taking
   care of propagating also deletions.
2) Rewrites Gerrit CLs (refs/changes/NN/cl_number/patchset_number) as
   GitHub branches (refs/heads/changes/cl_number) recreating a linear chain of
   commits for each patchset in any given CL.

(2) is the trickier part. The problem is that Gerrit stores each patchset of
each CL as an independent ref, e.g.:
  $ git ls-remote origin
  94df12f950462b55a2257b89d1fad6fac24353f9  refs/changes/10/496410/1
  4472fadddf8def74fd76a66ff373ca1245c71bcc  refs/changes/10/496410/2
  90b8535da0653d8f072e86cef9891a664f4e9ed7  refs/changes/10/496410/3
  2149c215fa9969bb454f23ce355459f28604c545  refs/changes/10/496410/meta

  53db7261268802648d7f6125ae6242db17e7a60d  refs/changes/20/494620/1
  d25e56930486363e0637b0a9debe3ae3ec805207  refs/changes/20/494620/2

Where each ref is based on top of the master branch (or whatever the dev chose).
On GitHub, instead, we want to recreate something similar to the pull-request
model, ending up with one branch per CL, and one commit per patchset.
Also we want to make them non-hidden branch heads (i.e. in the refs/heads/
namespace), because Travis CI does not hook hidden refs.
In conclusion we want to transform the above into:

refs/heads/changes/496410
  * commit: [CL 496410, Patchset 3] (parent: [CL 496410, Patchset 2])
  * commit: [CL 496410, Patchset 2] (parent: [CL 496410, Patchset 1])
  * commit: [CL 496410, Patchset 1] (parent: [master])
refs/heads/changes/494620
  * commit: [CL 494620, Patchset 2] (parent: [CL 494620, Patchset 1])
  * commit: [CL 494620, Patchset 1] (parent: [master])

"""

import collections
import logging
import os
import re
import shutil
import subprocess
import sys
import time
import traceback

from multiprocessing.pool import ThreadPool

CUR_DIR = os.path.dirname(os.path.abspath(__file__))
GIT_UPSTREAM = 'https://android.googlesource.com/platform/external/perfetto/'
GIT_MIRROR = 'git@github.com:catapult-project/perfetto.git'
WORKDIR = os.path.join(CUR_DIR, 'repo')

# Ignores CLs that have a cumulative tree size greater than this. GitHub rightly
# refuses to accept commits that have files that are too big, suggesting to use
# LFS instead.
MAX_TREE_SIZE_MB = 50

# Ignores all CL numbers < this. 913796 roughly maps to end of Feb 2019.
MIN_CL_NUM = 913796

# Max number of concurrent git subprocesses that can be run while generating
# per-CL branches.
GIT_SUBPROCESS_CONCURRENCY = 10

# Min delay (in seconds) between two consecutive git poll cycles. This is to
# avoid hitting gerrit API quota limits.
POLL_PERIOD_SEC = 60

# The actual deploy_key is stored into the internal team drive, under /infra/.
# NOTE: ENV is handed to every git subprocess as its *complete* environment
# (see GitCmd), so nothing else is inherited from the parent process.
ENV = {'GIT_SSH_COMMAND': 'ssh -i ' + os.path.join(CUR_DIR, 'deploy_key')}


# Runs `git <args>` with WORKDIR as working directory and ENV as environment.
# The only keyword argument that is honoured is |stdin|: data piped into git.
# Returns whatever git wrote to stdout as a str; git's stderr is forwarded to
# sys.stderr. Raises AssertionError if git exits with a non-zero status.
def GitCmd(*args, **kwargs):
  cmd = ['git'] + list(args)
  stdin_data = kwargs.get('stdin')
  # Pipes carry bytes. On Python 2 str already is bytes, so this only kicks in
  # for unicode input there, and for every str on Python 3.
  if stdin_data is not None and not isinstance(stdin_data, bytes):
    stdin_data = stdin_data.encode('utf-8')
  p = subprocess.Popen(
      cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=sys.stderr,
      cwd=WORKDIR, env=ENV)
  out = p.communicate(stdin_data)[0]
  # Raise explicitly instead of using `assert`: asserts are stripped under
  # `python -O` and a failed git command must never go unnoticed.
  if p.returncode != 0:
    raise AssertionError('FAIL: ' + ' '.join(cmd))
  # Callers run text regexes / split() on the output, hand them a str on
  # Python 3 as well (on Python 2 |out| is already a str).
  if not isinstance(out, str):
    out = out.decode('utf-8')
  return out


# Create a git repo that mirrors both the upstream and the mirror repos.
# Any pre-existing WORKDIR is wiped first, so every run starts from scratch.
def Setup():
  if os.path.exists(WORKDIR):
    shutil.rmtree(WORKDIR)
  os.makedirs(WORKDIR)
  GitCmd('init', '--bare', '--quiet')
  GitCmd('remote', 'add', 'upstream', GIT_UPSTREAM)
  # Upstream refs (heads *and* changes) land under refs/remotes/upstream/ so
  # that they cannot clash with the mirror's own refs.
  GitCmd('config', 'remote.upstream.fetch', '+refs/*:refs/remotes/upstream/*')
  GitCmd('remote', 'add', 'mirror', GIT_MIRROR, '--mirror=fetch')


# Returns the SUM(file.size) for file in the given git tree.
# Entries for which `git ls-tree --long` reports no numeric size ('-', as for
# submodule gitlinks) are counted as 0 bytes instead of aborting the sync.
def GetTreeSize(tree_sha1):
  raw = GitCmd('ls-tree', '-r', '--long', tree_sha1)
  total = 0
  for line in raw.splitlines():
    # Line format: <mode> SP <type> SP <object> SP <size> TAB <path>
    fields = line.split(None, 4)
    if len(fields) > 3 and fields[3].isdigit():
      total += int(fields[3])
  return total


# Parses the raw commit object |commit_sha1| and returns a dict with the keys
# 'tree', 'parent', 'author', 'committer' and 'message'.
# 'parent' is the first parent, or None for a parent-less (root) commit.
# 'message' is everything after the first blank line ('' if there is none).
# Raises ValueError if a mandatory header (tree/author/committer) is missing.
def GetCommit(commit_sha1):
  raw = GitCmd('cat-file', 'commit', commit_sha1)
  # Only look for headers in the header section: the message body could
  # legitimately contain lines that look like "parent <sha1>".
  headers, _, message = raw.partition('\n\n')

  def Header(regex, optional=False):
    match = re.search(regex, headers, re.M)
    if match is not None:
      return match.group(1)
    if optional:
      return None
    raise ValueError(
        'Malformed commit %s: no match for %r' % (commit_sha1, regex))

  return {
      'tree': Header(r'^tree\s(\w+)$'),
      'parent': Header(r'^parent\s(\w+)$', optional=True),
      'author': Header(r'^author\s(.+)$'),
      'committer': Header(r'^committer\s(.+)$'),
      'message': message,
  }


# Writes a new commit object made of the given fields into the object database
# and returns its sha1. The `parent` header is omitted when |parent| is empty
# or None, which yields a root commit.
def ForgeCommit(tree, parent, author, committer, message):
  header_lines = ['tree %s' % tree]
  if parent:
    header_lines.append('parent %s' % parent)
  header_lines.append('author %s' % author)
  header_lines.append('committer %s' % committer)
  raw = '\n'.join(header_lines) + '\n\n' + message
  out = GitCmd('hash-object', '-w', '-t', 'commit', '--stdin', stdin=raw)
  return out.strip()


# Translates a CL, identified by a (Gerrit) CL number and a list of patchsets
# into a git branch, where all patchsets look like subsequent commits.
# This function must be stateless and idempotent, it's invoked by ThreadPool.
def TranslateClIntoBranch(packed_args):
  # |packed_args| is a (cl_number, {patchset_number: commit_sha1}) tuple.
  # Returns a (branch_ref, head_sha1) tuple, or None if the CL is skipped
  # (older than MIN_CL_NUM, larger than MAX_TREE_SIZE_MB or without patchsets).
  cl_num, patchsets = packed_args
  if cl_num < MIN_CL_NUM:
    return None
  parent_sha1 = None
  forged_sha1 = None
  for patchset_num, commit_sha1 in sorted(patchsets.items()):
    patchset_data = GetCommit(commit_sha1)
    # Skip Cls that are too big as they would be rejected by GitHub.
    tree_size_bytes = GetTreeSize(patchset_data['tree'])
    if tree_size_bytes > MAX_TREE_SIZE_MB * (1 << 20):
      logging.warning('Skipping CL %s because its too big (%d bytes)',
                      cl_num, tree_size_bytes)
      return None
    # The first patchset keeps its real parent, each following one is chained
    # on top of the previously forged commit.
    parent_sha1 = parent_sha1 or patchset_data['parent']
    forged_sha1 = ForgeCommit(
        tree=patchset_data['tree'],
        parent=parent_sha1,
        author=patchset_data['author'],
        committer=patchset_data['committer'],
        message='[Patchset %d] %s' % (patchset_num, patchset_data['message']))
    parent_sha1 = forged_sha1
  if forged_sha1 is None:
    return None  # No patchsets, nothing to publish.
  return 'refs/heads/changes/%d' % cl_num, forged_sha1


# Splits the output of `git show-ref` into:
# 1. |current_heads|: branch heads refnames -> sha1 of the (github) mirror.
# 2. |future_heads|: upstream (AOSP) branch heads, renamed to refs/heads/*.
#    Note: this includes only pure branches and NOT CLs. CLs and their
#    patchsets are stored in a hidden ref (refs/changes) which is NOT under
#    refs/heads.
# 3. |changes|: upstream (AOSP) CLs from the refs/changes namespace, as
#    changes[cl_number][patchset_number] = sha1.
def _ClassifyRefs(all_refs):
  mirror_heads = 'refs/heads/'
  upstream_heads = 'refs/remotes/upstream/heads/'
  upstream_changes = 'refs/remotes/upstream/changes/'
  current_heads = {}
  future_heads = {}
  changes = collections.defaultdict(dict)
  for line in all_refs.splitlines():
    ref_sha1, ref = line.split()
    if ref.startswith(mirror_heads):
      current_heads[ref] = ref_sha1
    elif ref.startswith(upstream_heads):
      future_heads['refs/heads/' + ref[len(upstream_heads):]] = ref_sha1
    elif ref.startswith(upstream_changes):
      # Expected shape: NN/cl_number/patchset_number. Anything else (e.g. the
      # non-numeric NN/cl_number/meta ref) is not a patchset and is ignored.
      parts = ref[len(upstream_changes):].split('/')
      if len(parts) != 3:
        continue
      _, cl_num, patchset = parts
      if cl_num.isdigit() and patchset.isdigit():
        changes[int(cl_num)][int(patchset)] = ref_sha1
  return current_heads, future_heads, changes


# Builds the script for `git update-ref --stdin` which deletes every ref in
# |deleted_heads| and points every ref of |future_heads| that is missing or
# stale in |current_heads| to its new sha1.
def _BuildUpdateRefCmd(current_heads, future_heads, deleted_heads):
  commands = ['delete %s\n' % ref for ref in sorted(deleted_heads)]
  for ref, ref_sha1 in sorted(future_heads.items()):
    if current_heads.get(ref) != ref_sha1:
      commands.append('update %s %s\n' % (ref, ref_sha1))
  return ''.join(commands)


# Runs one mirroring cycle: fetches both remotes, forges the per-CL branches,
# updates the local refs accordingly and force-pushes them to the mirror.
def Sync():
  logging.info('Fetching git remotes')
  GitCmd('fetch', '--all', '--quiet')
  current_heads, future_heads, changes = _ClassifyRefs(GitCmd('show-ref'))

  # Now iterate over the upstream (AOSP) CLS and forge a chain of commits,
  # creating one branch refs/heads/changes/cl_number for each set of patchsets.
  # Forging commits is mostly fork() + exec() and I/O bound, parallelism helps
  # significantly to hide those latencies.
  logging.info('Forging per-CL branches')
  pool = ThreadPool(processes=GIT_SUBPROCESS_CONCURRENCY)
  try:
    for res in pool.imap_unordered(TranslateClIntoBranch, changes.items()):
      if res is None:
        continue
      branch_ref, forged_sha1 = res
      future_heads[branch_ref] = forged_sha1
  finally:
    # Sync() runs once per poll cycle: always dispose of the pool, also when a
    # worker raised, so that its threads do not pile up.
    pool.terminate()
    pool.join()

  # Now compute:
  # 1. The set of branches in the mirror (github) that have been deleted on the
  #    upstream (AOSP) repo. These will be deleted also from the mirror.
  # 2. The set of rewritten branches to be updated.
  deleted_heads = set(current_heads) - set(future_heads)
  logging.info('current_heads: %d, future_heads: %d, deleted_heads: %d',
               len(current_heads), len(future_heads), len(deleted_heads))
  update_ref_cmd = _BuildUpdateRefCmd(
      current_heads, future_heads, deleted_heads)
  print(update_ref_cmd)

  logging.info('Pushing updates')
  # Update objects and push.
  GitCmd('update-ref', '--stdin', stdin=update_ref_cmd)
  GitCmd('push', 'mirror', '--all', '--prune', '--force')
  GitCmd('gc', '--prune=all', '--aggressive', '--quiet')


# Sets up the work repo once and then syncs forever, sleeping POLL_PERIOD_SEC
# between cycles. Never returns; any exception raised by a cycle propagates.
def Main():
  logging.info('Setting up git repo one-off')
  Setup()
  while True:
    logging.info('------- BEGINNING OF SYNC CYCLE -------')
    Sync()
    logging.info('------- END OF SYNC CYCLE -------')
    time.sleep(POLL_PERIOD_SEC)


if __name__ == '__main__':
  logging.basicConfig(
      format='%(asctime)s %(levelname)-8s %(message)s',
      level=logging.INFO,
      datefmt='%Y-%m-%d %H:%M:%S')
  sys.exit(Main())