1#!/usr/bin/env python3
2# Copyright 2020 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""
7Utility to disconnect history of files from a branch, and reconnect with base on
8a different branch.
9"""
10
11import argparse
12import collections
13import subprocess
14import sys
15
16import filtered_utils
17import lazytree
18import utils
19
20
class CommitMetadataFactory(dict):
    """Dictionary of commit metadata that fills itself in on demand.

    Looking up a key that is not stored yet fetches its metadata through
    filtered_utils.get_metadata and keeps the result for later lookups.
    """

    def __missing__(self, key):
        """Fetches, stores and returns the metadata for an unseen key."""
        # The key is known to be absent here, so setdefault always stores the
        # freshly fetched value and hands it back.
        return self.setdefault(key, filtered_utils.get_metadata(key))
29
30
def disconnect(source_commit, ref_commit):
    """Commits a copy of source_commit stripped of the files in ref_commit.

    Every path listed for ref_commit is dropped from the file list of
    source_commit; the remaining files form the tree of a new commit whose
    only parent is source_commit.

    Args:
        source_commit: commit hash (str) to disconnect from.
        ref_commit: commit hash whose file list names the paths to remove.

    Returns:
        The value returned by utils.git_commit for the new commit.
    """
    source_entries = utils.get_file_list(source_commit)
    paths_in_ref = {entry.path for entry in utils.get_file_list(ref_commit)}
    remaining_entries = [
        entry for entry in source_entries if entry.path not in paths_in_ref
    ]
    stripped_tree = utils.git_mktree(remaining_entries)
    commit_message = b'Disconnect history from %s' % (
        source_commit.encode('ascii'))
    return utils.git_commit(
        stripped_tree, [source_commit], message=commit_message)
48
49
def connect_base(current_commit, base_commit):
    """Commits a merge of current_commit and base_commit.

    The tree of the new commit is made from the file list of current_commit
    followed by the file list of base_commit, and both commits become its
    parents. The intent is the same as running git merge base_commit while
    on current_commit.

    Args:
        current_commit: commit hash to commit on top of.
        base_commit: commit hash (str) that carries the file histories.

    Returns:
        The value returned by utils.git_commit for the merge commit.
    """
    merged_entries = utils.get_file_list(current_commit)
    merged_entries = merged_entries + utils.get_file_list(base_commit)
    merged_tree = utils.git_mktree(merged_entries)
    commit_message = b'Connect history with base %s' % (
        base_commit.encode('ascii'))
    return utils.git_commit(
        merged_tree, [current_commit, base_commit], message=commit_message)
65
66
def blame_files(commithash, files):
    """Blames each of the given files on the given commithash.

    Args:
        commithash: commit to run the blame on.
        files: iterable of file paths to blame.

    Returns:
        A dict from file path to the result of utils.git_blame for that path.
    """
    return {path: utils.git_blame(commithash, path) for path in files}
73
74
def search_blame_line(blames, amend_commits, target_commit_hash):
    """Collects the lines blamed on target_commit_hash with their amend blames.

    Each file in blames is blamed again on every commit in amend_commits, and
    the blames are walked in lockstep, line by line. Only the positions whose
    line in blames is attributed to target_commit_hash are kept.

    Returns a map from file path to a list of tuples. Each tuple has
    len(amend_commits) + 1 elements: the 0-th element is the line from blames,
    and the 1st to n-th elements are the corresponding lines from the blames
    on amend_commits.

    Args:
        blames: a dict from path to list of GitBlameLine, for files blamed on
            target_commit_hash.
        amend_commits: a list of commit hashes to provide actual history.
        target_commit_hash: commit hash that blames are blamed on.
    """
    matches_by_path = {}
    for path, goal_blame in blames.items():
        amend_blames = [
            utils.git_blame(amend_commit, path)
            for amend_commit in amend_commits
        ]
        matched_rows = []
        # zip stops at the shortest blame, pairing lines by position.
        for row in zip(goal_blame, *amend_blames):
            if row[0].commit == target_commit_hash:
                matched_rows.append(row)
        matches_by_path[path] = matched_rows
    return matches_by_path
98
99
def get_track_from_blames(blames_combined, virtual_goal_commit, amend_commits,
                          commit_choice_cache, commit_msg_cache):
    """Decides, line by line, which amend commit each diffed line belongs to.

    For every line the candidate commits are the commits blamed for it in each
    amend blame. A combination of candidates that has not been seen before is
    listed on stdout and the user is asked to pick one by index; the answer is
    stored in commit_choice_cache so the same combination is not asked again.

    Returns a tuple containing:
     - a set of the chosen commit hashes;
     - a map from file path to a list of (line, chosen commit) tuples, where
       line is the 0-th element of the corresponding blames_combined tuple.

    Args:
        blames_combined: a map from path to a list of tuples. Each tuple
            reflects one line and has len(amend_commits)+1 elements. See more
            details in search_blame_line.
        virtual_goal_commit: a commit that contains no useful history for
            diffs. Not read by this function.
        amend_commits: list of HEAD commit hashes that refer to trees that can
            amend the diffs. Not read by this function.
        commit_choice_cache: dict caching the user's choice, keyed by the tuple
            of candidate commits. Updated in place.
        commit_msg_cache: mapping from commit hash to commit metadata; only
            the title is read, to label the choices.
    """
    tracked_commits = set()
    choices_by_path = {}

    for path, combined_rows in blames_combined.items():
        resolved_lines = []
        choices_by_path[path] = resolved_lines
        for row in combined_rows:
            candidates = tuple(amend_line.commit for amend_line in row[1:])
            picked = commit_choice_cache.get(candidates)
            if picked is None:
                for number, candidate in enumerate(candidates):
                    print('%d: %s' % (number,
                                      commit_msg_cache[candidate].title))
                # The answer is deliberately not validated: the tool is run by
                # a trusted developer who can simply rerun it after a typo.
                answer = int(input('Choose patch: '))
                picked = candidates[answer]
                commit_choice_cache[candidates] = picked
            tracked_commits.add(picked)
            resolved_lines.append((row[0], picked))

    return tracked_commits, choices_by_path
142
143
def reconstruct_file(blame_goal, blame_base, lines_to_reconstruct,
                     virtual_goal_commit):
    """Reconstructs a file to reflect changes in lines_to_reconstruct.

    Walks blame_base and blame_goal side by side and builds a file that is
    blame_base plus only the goal lines listed in lines_to_reconstruct. Base
    lines that directly follow an inserted goal line, up to the next line
    common to both sides, are treated as replaced by it and are dropped. All
    other base lines are kept, and goal lines attributed to
    virtual_goal_commit that are not listed are left out.

    Returns the new file content as bytes, each line terminated by a newline.

    Args:
        blame_goal: a list of utils.GitBlameLine blaming the file on
            virtual_goal_commit.
        blame_base: a list of utils.GitBlameLine blaming the file on the last
            committed commit.
        lines_to_reconstruct: only these lines are reconstructed, instead of
            everything in blame_goal. It is a list of GitBlameLine taken from
            blame_goal, in the same order.
        virtual_goal_commit: commit hash that blame_goal is based on.

    Raises:
        ValueError: if blame_goal contains a line that is not attributed to
            virtual_goal_commit and has no counterpart left in blame_base,
            which means the two blames are inconsistent.
    """
    idx_base, idx_goal = 0, 0
    reconstructed_file = []
    # True right after a goal line was emitted: the base lines that follow are
    # then considered replaced by it and are dropped. It starts as False so
    # that a base-only line at the very top of the file is preserved.
    should_skip_base = False

    print('Changed lines are', [line.data for line in lines_to_reconstruct])
    line_iter = iter(lines_to_reconstruct)
    line = next(line_iter, None)
    while idx_base < len(blame_base) or idx_goal < len(blame_goal):
        # Either side may run out before the other; None marks the exhausted
        # side so that no branch indexes past the end of its list.
        base_line = blame_base[idx_base] if idx_base < len(blame_base) else None
        goal_line = blame_goal[idx_goal] if idx_goal < len(blame_goal) else None

        # Whole blame lines of base and goal are not compared directly: they
        # are blamed on different commits, so their line numbers may differ.
        if (base_line is not None and goal_line is not None and
                base_line.data == goal_line.data and
                base_line.commit == goal_line.commit):
            # Both sides are identical here, keep the line.
            reconstructed_file.append(base_line.data)
            idx_base += 1
            idx_goal += 1
            should_skip_base = False
        elif (goal_line is not None and line is not None and
              goal_line == line):
            # The goal line is the next line we are interested in, take it.
            reconstructed_file.append(line.data)
            line = next(line_iter, None)
            idx_goal += 1
            should_skip_base = True
        elif (goal_line is not None and
              goal_line.commit == virtual_goal_commit):
            # The goal line is a change that does not belong to the lines we
            # are interested in, so it is not reflected.
            idx_goal += 1
        elif base_line is not None:
            # A base line without a counterpart in goal. If goal lines were
            # just emitted this is the old version of a modified line and is
            # dropped; otherwise the line was removed by a change we are not
            # interested in, so it is preserved.
            if not should_skip_base:
                reconstructed_file.append(base_line.data)
            idx_base += 1
        else:
            # Base is exhausted, yet the goal line claims to come from older
            # history. Nothing can advance, so fail instead of looping.
            raise ValueError(
                'goal line %r has no counterpart in base' % (goal_line.data,))

    return b''.join(data + b'\n' for data in reconstructed_file)
203
204
def reconstruct_files(track_commit, blame_untracked_lines, blames,
                      current_base_commit, virtual_goal_commit):
    """Reconstructs every file that has lines assigned to track_commit.

    Returns a map from file path to file content for the reconstructed files.
    Files without any line assigned to track_commit are not included.

    Args:
        track_commit: commit hash to track, and reconstruct from.
        blame_untracked_lines: a map from file path to a list of
            (line, chosen commit) tuples. See get_track_from_blames for more.
        blames: a map from filename to list of utils.GitBlameLine.
        current_base_commit: commit hash for HEAD of base that contains base
            history + already committed amend history.
        virtual_goal_commit: commit hash for one giant commit that has no
            history. virtual_goal_commit is one commit ahead of
            current_base_commit.
    """
    # Group the lines chosen for track_commit by file, keeping their order.
    wanted_by_path = {}
    for path, line_choices in blame_untracked_lines.items():
        for blame_line, chosen_commit in line_choices:
            if chosen_commit == track_commit:
                wanted_by_path.setdefault(path, []).append(blame_line)

    rebuilt = {}
    for path, wanted_lines in wanted_by_path.items():
        print('Reconstructing', path, 'for', track_commit)
        base_blame = utils.git_blame(current_base_commit, path)
        rebuilt[path] = reconstruct_file(
            blames[path], base_blame, wanted_lines, virtual_goal_commit)
    return rebuilt
235
236
def _parse_args(argv):
    """Parses the disconnect_from, base_commit and amend_commits arguments."""
    parser = argparse.ArgumentParser(description='Reconnect git history')
    parser.add_argument(
        'disconnect_from',
        metavar='disconnect_from',
        type=str,
        nargs=1,
        help='disconnect history from this commit')
    parser.add_argument(
        'base_commit',
        metavar='base_commit',
        type=str,
        nargs=1,
        help='base commit to use the history')
    parser.add_argument(
        'amend_commits',
        metavar='amend_commits',
        type=str,
        nargs='+',
        help='commits to amend histories from base_commit')
    return parser.parse_args(argv)


def _write_blob(filedata):
    """Stores filedata (bytes) as a git blob and returns the stripped output.

    The content is passed with --stdin rather than the /dev/stdin path: it
    does not depend on the platform providing /dev/stdin, and git hashes
    standard input as is instead of treating it as a file that content
    filters could apply to.
    """
    return subprocess.check_output(
        ['git', 'hash-object', '-w', '--stdin'], input=filedata).strip()


def main():
    """Reconnects the history of disconnect_from onto base_commit.

    Reads the commits from the command line, creates a commit that removes
    the files of base_commit from disconnect_from, merges base_commit into
    it, and then repeatedly creates one commit per amend commit chosen for
    the remaining differences. A final commit carrying the tree of
    disconnect_from is created at the end and its identifier is printed.
    """
    arg = _parse_args(sys.argv[1:])
    # nargs=1 stores single-element lists.
    disconnect_from = arg.disconnect_from[0]
    base_commit = arg.base_commit[0]

    empty_commit = disconnect(disconnect_from, base_commit)
    connected_base = connect_base(empty_commit, base_commit)

    commit_msg_cache = CommitMetadataFactory()
    commit_choice_cache = {}
    last_commit = connected_base
    # Each iteration of the loop:
    #  - re-creates the goal commit (base + committed history + one giant
    #    commit holding the still uncommitted history);
    #  - blames on the new goal commit and on the tips of the amend commits,
    #    mapping the uncommitted lines to past history line by line;
    #  - chooses one of the past commits, reconstructs files to reflect the
    #    changes of that commit, and creates a new commit from them.
    # last_commit, commit_msg_cache and commit_choice_cache persist across
    # iterations. One commit is processed per iteration.
    while True:
        # Create virtual target commit, and its diff.
        virtual_goal = utils.git_commit(disconnect_from + '^{tree}',
                                        [last_commit])
        diffs = utils.git_difftree(None, virtual_goal)
        if not diffs:
            print('No diffs are found between %s and goal.' %
                  (last_commit.decode('ascii'),))
            break

        blames = blame_files(virtual_goal,
                             [diff.file.path for diff in diffs])
        blames_combined = search_blame_line(blames, arg.amend_commits,
                                            virtual_goal)

        commits_to_track, blame_untracked_lines = get_track_from_blames(
            blames_combined, virtual_goal, arg.amend_commits,
            commit_choice_cache, commit_msg_cache)
        if not commits_to_track:
            print('no commits to track, stopping')
            break

        # Pick one commit in a stable way (the smallest hash), so reruns
        # process the commits in the same order, and reconstruct it.
        track_commit = min(commits_to_track)
        print('Reconstructing commit %s: %s' %
              (track_commit, commit_msg_cache[track_commit].title))
        constructed_files = reconstruct_files(track_commit,
                                              blame_untracked_lines, blames,
                                              last_commit, virtual_goal)

        # Build the tree of last_commit with the reconstructed files swapped
        # in, keeping each file's existing mode.
        tree = lazytree.LazyTree(filtered_utils.get_metadata(last_commit).tree)
        for filename, filedata in constructed_files.items():
            blob = _write_blob(filedata)
            tree[filename] = utils.GitFile(filename, tree[filename].mode, blob)

        # Commit with the message and authorship of the tracked commit.
        meta = commit_msg_cache[track_commit]
        last_commit = utils.git_commit(
            tree.hash(), [last_commit],
            (meta.message + b'\n(Reconstructed from ' + track_commit + b')\n'),
            dict(
                GIT_AUTHOR_NAME=meta.authorship.name,
                GIT_AUTHOR_EMAIL=meta.authorship.email,
                GIT_AUTHOR_DATE=b' '.join(
                    [meta.authorship.time, meta.authorship.timezone])))
        print('Reconstructed as', last_commit)

    # Make the last commit for history reconstruction: it takes the tree of
    # disconnect_from, so whatever was not reconstructed above lands here.
    print(
        utils.git_commit(
            filtered_utils.get_metadata(disconnect_from).tree,
            [last_commit],
            b'Finished history reconstruction\n\nRemoving unnecessary lines\n'))
331
# Run the tool only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
334