1#!/usr/bin/env python
2# Copyright (C) 2017 The Android Open Source Project
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#      http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15
16""" Mirrors a Gerrit repo into GitHub, turning CLs into individual branches.
17
18This script does a bit of git black magic. It does mainly two things:
191) Mirrors all the branches (refs/heads/foo) from Gerrit to Github as-is, taking
20   care of propagating also deletions.
2) Rewrites Gerrit CLs (refs/changes/NN/cl_number/patchset_number) as
   GitHub branches (refs/heads/changes/cl_number), recreating a linear chain of
   commits, one for each patchset in any given CL.
24
252. Is the trickier part. The problem is that Gerrit stores each patchset of
26each CL as an independent ref, e.g.:
27  $ git ls-remote origin
28  94df12f950462b55a2257b89d1fad6fac24353f9	refs/changes/10/496410/1
29  4472fadddf8def74fd76a66ff373ca1245c71bcc	refs/changes/10/496410/2
30  90b8535da0653d8f072e86cef9891a664f4e9ed7	refs/changes/10/496410/3
31  2149c215fa9969bb454f23ce355459f28604c545	refs/changes/10/496410/meta
32
33  53db7261268802648d7f6125ae6242db17e7a60d	refs/changes/20/494620/1
34  d25e56930486363e0637b0a9debe3ae3ec805207	refs/changes/20/494620/2
35
Where each ref is based on top of the master branch (or whatever the dev chose).
On GitHub, instead, we want to recreate something similar to the pull-request
model, ending up with one branch per CL, and one commit per patchset.
Also we want to make them non-hidden branch heads (i.e. in the refs/heads/
namespace), because Travis CI does not hook hidden branches.
41In conclusion we want to transform the above into:
42
refs/heads/changes/496410
  * commit: [CL 496410, Patchset 3] (parent: [CL 496410, Patchset 2])
  * commit: [CL 496410, Patchset 2] (parent: [CL 496410, Patchset 1])
  * commit: [CL 496410, Patchset 1] (parent: [master])
refs/heads/changes/494620
  * commit: [CL 494620, Patchset 2] (parent: [CL 494620, Patchset 1])
  * commit: [CL 494620, Patchset 1] (parent: [master])
50
51"""
52
53import collections
54import logging
55import os
56import re
57import shutil
58import subprocess
59import sys
60import time
61import traceback
62
63from multiprocessing.pool import ThreadPool
64
# Directory containing this script; the work repo and the deploy key are
# resolved relative to it.
CUR_DIR = os.path.dirname(os.path.abspath(__file__))
# Source (Gerrit) repository, registered as the 'upstream' remote by Setup().
GIT_UPSTREAM = 'https://android.googlesource.com/platform/external/perfetto/'
# Destination (GitHub) repository, registered as the 'mirror' remote by Setup().
GIT_MIRROR = 'git@github.com:catapult-project/perfetto.git'
# Local bare repository used as scratch space. Setup() wipes and recreates it.
WORKDIR = os.path.join(CUR_DIR, 'repo')

# Ignores CLs that have a cumulative tree size greater than this. GitHub rightly
# refuses to accept commits that have files that are too big, suggesting to use
# LFS instead.
MAX_TREE_SIZE_MB = 50

# Ignores all CL numbers < this. 913796 roughly maps to end of Feb 2019.
MIN_CL_NUM = 913796

# Max number of concurrent git subprocesses that can be run while generating
# per-CL branches.
GIT_SUBPROCESS_CONCURRENCY = 10

# Min delay (in seconds) between two consecutive git poll cycles. This is to
# avoid hitting gerrit API quota limits.
POLL_PERIOD_SEC = 60

# The actual deploy_key is stored into the internal team drive, under /infra/.
# This dict is handed to every git subprocess as its complete environment (see
# GitCmd), so git's ssh transport always uses the deploy key.
ENV = {'GIT_SSH_COMMAND': 'ssh -i ' + os.path.join(CUR_DIR, 'deploy_key')}
88
89
# Runs `git <args...>` inside WORKDIR with ENV as its environment and returns
# whatever git wrote to stdout. git's stderr is forwarded to this process'
# stderr.
#
# The only keyword argument honored is |stdin|: if given, it is piped to git's
# standard input (used for `hash-object --stdin` and `update-ref --stdin`).
#
# Raises AssertionError if git exits with a non-zero status.
def GitCmd(*args, **kwargs):
  cmd = ['git'] + list(args)
  p = subprocess.Popen(
      cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=sys.stderr,
      cwd=WORKDIR, env=ENV)
  out = p.communicate(kwargs.get('stdin'))[0]
  # Raise explicitly instead of using an `assert` statement: asserts are
  # stripped when Python runs with -O, which would make git failures go
  # unnoticed. The exception type and message are kept as they were.
  if p.returncode != 0:
    raise AssertionError('FAIL: ' + ' '.join(cmd))
  return out
98
99
# Creates, from scratch, a bare git repo in WORKDIR that tracks both the
# upstream repo and the mirror repo. Any pre-existing WORKDIR is discarded.
def Setup():
  if os.path.exists(WORKDIR):
    shutil.rmtree(WORKDIR)
  os.makedirs(WORKDIR)

  # One-off git invocations, executed in order.
  bootstrap_cmds = (
      ('init', '--bare', '--quiet'),
      ('remote', 'add', 'upstream', GIT_UPSTREAM),
      # Fetch every upstream ref (not just branches) under
      # refs/remotes/upstream/, so that refs/changes/* is visible too.
      ('config', 'remote.upstream.fetch', '+refs/*:refs/remotes/upstream/*'),
      ('remote', 'add', 'mirror', GIT_MIRROR, '--mirror=fetch'),
  )
  for git_args in bootstrap_cmds:
    GitCmd(*git_args)
109
110
# Returns the SUM(file.size) for file in the given git tree (recursively).
#
# Each line printed by `git ls-tree -r --long` looks like:
#   <mode> SP <type> SP <object> SP <object size> TAB <path>
# The size column is numeric only for blobs. For entries without a size (e.g.
# gitlinks / submodules) git prints '-', which is skipped here rather than
# crashing the int() conversion.
def GetTreeSize(tree_sha1):
  raw = GitCmd('ls-tree', '-r', '--long', tree_sha1)
  total_bytes = 0
  for line in raw.splitlines():
    size_field = line.split()[3]
    if size_field.isdigit():
      total_bytes += int(size_field)
  return total_bytes
115
116
# Parses the raw commit object |commit_sha1| into a dict with the keys 'tree',
# 'parent', 'author', 'committer' and 'message'.
#
# Only the first 'parent' line is captured. If any of the fields cannot be
# found (e.g. a commit with no parent line), the failed regex lookup raises
# AttributeError.
def GetCommit(commit_sha1):
  raw = GitCmd('cat-file', 'commit', commit_sha1)
  # (dict key, regex with one capture group, regex flags), looked up in order.
  field_patterns = (
      ('tree', r'^tree\s(\w+)$', re.M),
      ('parent', r'^parent\s(\w+)$', re.M),
      ('author', r'^author\s(.+)$', re.M),
      ('committer', r'^committer\s(.+)$', re.M),
      # Everything after the first blank line is the commit message.
      ('message', r'\n\n(.+)', re.M | re.DOTALL),
  )
  commit = {}
  for key, pattern, flags in field_patterns:
    commit[key] = re.search(pattern, raw, flags).group(1)
  return commit
126
127
# Writes a new commit object, assembled from the given fields, into the work
# repo's object database and returns the output of `git hash-object` (the new
# object's id) with surrounding whitespace stripped.
def ForgeCommit(tree, parent, author, committer, message):
  # Header lines, an empty separator line, then the message.
  template = '\n'.join(
      ['tree %s', 'parent %s', 'author %s', 'committer %s', '', '%s'])
  commit_body = template % (tree, parent, author, committer, message)
  new_sha1 = GitCmd(
      'hash-object', '-w', '-t', 'commit', '--stdin', stdin=commit_body)
  return new_sha1.strip()
133
134
# Translates a CL, identified by a (Gerrit) CL number and a list of patchsets
# into a git branch, where all patchsets look like subsequent commits.
# This function must be stateless and idempotent, it's invoked by ThreadPool.
#
# |packed_args| is a (cl_num, patchsets) tuple, where |patchsets| maps
# patchset number -> sha1 of the corresponding Gerrit commit.
#
# Returns a (branch_ref, tip_sha1) tuple, where |tip_sha1| is the forged commit
# for the LAST patchset (whose ancestors are the forged commits of the previous
# patchsets), or None if the CL is skipped (too old, too big, or no patchsets).
def TranslateClIntoBranch(packed_args):
  cl_num, patchsets = packed_args
  if cl_num < MIN_CL_NUM:
    return None
  tip_sha1 = None
  # Keys are unique, so sorting the (number, sha1) pairs orders by patchset
  # number.
  for patchset_num, commit_sha1 in sorted(patchsets.items()):
    patchset_data = GetCommit(commit_sha1)
    # Skip CLs that are too big as they would be rejected by GitHub.
    tree_size_bytes = GetTreeSize(patchset_data['tree'])
    if tree_size_bytes > MAX_TREE_SIZE_MB * (1 << 20):
      logging.warning('Skipping CL %s because its too big (%d bytes)',
                      cl_num, tree_size_bytes)
      return None
    # The first patchset keeps its original parent; every following patchset is
    # re-parented on top of the commit forged for the previous one.
    tip_sha1 = ForgeCommit(
        tree=patchset_data['tree'],
        parent=tip_sha1 or patchset_data['parent'],
        author=patchset_data['author'],
        committer=patchset_data['committer'],
        message='[Patchset %d] %s' % (patchset_num, patchset_data['message']))
  # The return must happen only after ALL patchsets have been chained, so it
  # lives outside the loop.
  if tip_sha1 is None:
    return None
  return 'refs/heads/changes/%d' % cl_num, tip_sha1
160
161
# Runs one mirroring cycle:
# 1. Fetches all remotes into the work repo.
# 2. Computes the set of branch heads the mirror should have: the upstream
#    branches as-is, plus one forged refs/heads/changes/N branch per CL.
# 3. Applies the difference to the local refs and force-pushes them (with
#    pruning) to the mirror, then garbage-collects the work repo.
def Sync():
  logging.info('Fetching git remotes')
  GitCmd('fetch', '--all', '--quiet')
  all_refs = GitCmd('show-ref')
  future_heads = {}
  current_heads = {}
  changes = collections.defaultdict(dict)

  # List all refs from both repos and:
  # 1. Keep track of all branch heads refnames and sha1s from the (github)
  #    mirror into |current_heads|.
  # 2. Keep track of all upstream (AOSP) branch heads into |future_heads|. Note:
  #    this includes only pure branches and NOT CLs. CLs and their patchsets are
  #    stored in a hidden ref (refs/changes) which is NOT under refs/heads.
  # 3. Keep track of all upstream (AOSP) CLs from the refs/changes namespace
  #    into changes[cl_number][patchset_number].
  MIRROR_HEADS = 'refs/heads/'
  UPSTREAM_HEADS = 'refs/remotes/upstream/heads/'
  UPSTREAM_CHANGES = 'refs/remotes/upstream/changes/'
  for line in all_refs.splitlines():
    ref_sha1, ref = line.split()

    if ref.startswith(MIRROR_HEADS):
      current_heads[ref] = ref_sha1
      continue

    if ref.startswith(UPSTREAM_HEADS):
      future_heads['refs/heads/' + ref[len(UPSTREAM_HEADS):]] = ref_sha1
      continue

    if ref.startswith(UPSTREAM_CHANGES):
      # Expected shape: NN/cl_number/patchset_number. Anything with a different
      # depth, or with non-numeric components (e.g. .../meta), is ignored
      # instead of aborting the whole cycle.
      parts = ref[len(UPSTREAM_CHANGES):].split('/')
      if len(parts) != 3:
        continue
      _, cl_num, patchset = parts
      if not cl_num.isdigit() or not patchset.isdigit():
        continue
      changes[int(cl_num)][int(patchset)] = ref_sha1

  # Now iterate over the upstream (AOSP) CLS and forge a chain of commits,
  # creating one branch refs/heads/changes/cl_number for each set of patchsets.
  # Forging commits is mostly fork() + exec() and I/O bound, parallelism helps
  # significantly to hide those latencies.
  logging.info('Forging per-CL branches')
  pool = ThreadPool(processes=GIT_SUBPROCESS_CONCURRENCY)
  for res in pool.imap_unordered(TranslateClIntoBranch, changes.items()):
    if res is None:
      continue
    branch_ref, forged_sha1 = res
    future_heads[branch_ref] = forged_sha1
  # Every result has been consumed at this point; join() just waits for the
  # pool's threads to finish before moving on.
  pool.close()
  pool.join()

  deleted_heads = set(current_heads) - set(future_heads)
  logging.info('current_heads: %d, future_heads: %d, deleted_heads: %d',
               len(current_heads), len(future_heads), len(deleted_heads))

  # Now compute:
  # 1. The set of branches in the mirror (github) that have been deleted on the
  #    upstream (AOSP) repo. These will be deleted also from the mirror.
  # 2. The set of rewritten branches to be updated.
  update_lines = ['delete %s\n' % stale_ref for stale_ref in deleted_heads]
  update_lines.extend(
      'update %s %s\n' % (head_ref, head_sha1)
      for head_ref, head_sha1 in future_heads.items()
      if current_heads.get(head_ref) != head_sha1)
  update_ref_cmd = ''.join(update_lines)
  print(update_ref_cmd)

  logging.info('Pushing updates')
  # Update objects and push.
  GitCmd('update-ref', '--stdin', stdin=update_ref_cmd)
  GitCmd('push', 'mirror', '--all', '--prune', '--force')
  GitCmd('gc', '--prune=all', '--aggressive', '--quiet')
235
236
# Entry point: sets up the work repo once, then runs Sync() forever, sleeping
# POLL_PERIOD_SEC between cycles. Never returns normally.
def Main():
  logging.info('Setting up git repo one-off')
  Setup()

  cycle_begin_msg = '------- BEGINNING OF SYNC CYCLE -------'
  cycle_end_msg = '------- END OF SYNC CYCLE -------'
  while True:
    logging.info(cycle_begin_msg)
    Sync()
    logging.info(cycle_end_msg)
    # Throttle the polling between consecutive cycles.
    time.sleep(POLL_PERIOD_SEC)
245
246
if __name__ == '__main__':
  # Timestamped, level-tagged log lines at INFO verbosity.
  log_config = dict(
    level=logging.INFO,
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S')
  logging.basicConfig(**log_config)
  sys.exit(Main())
253