1#!/usr/bin/env python
2
3import argparse
4import email.mime.multipart
5import email.mime.text
6import logging
7import os.path
8import pickle
9import re
10import smtplib
11import subprocess
12import sys
13from datetime import datetime, timedelta
14from phabricator import Phabricator
15
16# Setting up a virtualenv to run this script can be done by running the
17# following commands:
18# $ virtualenv venv
19# $ . ./venv/bin/activate
20# $ pip install Phabricator
21
# Git repositories used by this script, as (name, clone URL) pairs.
GIT_REPO_METADATA = (
    ("llvm", "https://llvm.org/git/llvm.git"),
)
23
24# The below PhabXXX classes represent objects as modelled by Phabricator.
25# The classes can be serialized to disk, to try and make sure that we don't
26# needlessly have to re-fetch lots of data from Phabricator, as that would
27# make this script unusably slow.
28
29
class PhabObject:
    """Base class for the objects this script mirrors from Phabricator."""

    # Short name of the kind of object; subclasses override it. The cache
    # class uses it to build cache and pickle file names.
    OBJECT_KIND = None

    def __init__(self, id):
        # Identifier under which the object is stored in its cache.
        self.id = id
35
36
class PhabObjectCache:
    """Cache of Phabricator objects of a single kind, persisted with pickle.

    Objects are created on demand by get() and kept in a dictionary keyed by
    their id. most_recent_info and oldest_info are time stamps maintained by
    the code that fills the cache; they are None until that code sets them.
    """

    DEFAULT_DIRECTORY = "PhabObjectCache"

    def __init__(self, PhabObjectClass):
        # Class used to create objects that are not in the cache yet; it must
        # provide OBJECT_KIND and accept an id as sole constructor argument.
        self.PhabObjectClass = PhabObjectClass
        self.most_recent_info = None
        self.oldest_info = None
        self.id2PhabObjects = {}

    def get_name(self):
        """Return a human-readable name for this cache, e.g. "ReviewsCache"."""
        return self.PhabObjectClass.OBJECT_KIND + "sCache"

    def get(self, id):
        """Return the cached object for id, creating an empty one if needed."""
        if id not in self.id2PhabObjects:
            self.id2PhabObjects[id] = self.PhabObjectClass(id)
        return self.id2PhabObjects[id]

    def get_ids_in_cache(self):
        """Return the ids of all objects currently in the cache."""
        return self.id2PhabObjects.keys()

    def get_objects(self):
        """Return all objects currently in the cache."""
        return self.id2PhabObjects.values()

    def _get_pickle_name(self, directory):
        """Return the path of this cache's pickle file inside directory."""
        file_name = "Phab" + self.PhabObjectClass.OBJECT_KIND + "s.pickle"
        return os.path.join(directory, file_name)

    def populate_cache_from_disk(self, directory=DEFAULT_DIRECTORY):
        """
        Load the state written by write_cache_to_disk, if there is any.

        A missing or unreadable cache file, as well as a truncated or
        otherwise corrupt one, is reported on stdout and then ignored: the
        cache keeps its current content in that case.

        FIXME: consider if serializing to JSON would bring interoperability
        advantages over serializing to pickle.
        """
        try:
            f = open(self._get_pickle_name(directory), "rb")
        except IOError as err:
            print("Could not find cache. Error message: {0}. Continuing..."
                  .format(err))
            return
        with f:
            try:
                # NOTE: unpickling executes code chosen by whoever wrote the
                # file, so the cache directory must only contain trusted
                # files.
                d = pickle.load(f)
            except (EOFError, pickle.UnpicklingError) as err:
                # EOFError covers truncated files, UnpicklingError covers
                # files with garbled content.
                print("Cache seems to be corrupt. " +
                      "Not using cache. Error message: {0}".format(err))
                return
        if not isinstance(d, dict):
            # write_cache_to_disk always stores a dictionary; anything else
            # cannot be a valid cache.
            print("Cache seems to be corrupt. " +
                  "Not using cache. Error message: {0}".format(
                      "unexpected content of type " + type(d).__name__))
            return
        self.__dict__.update(d)

    def write_cache_to_disk(self, directory=DEFAULT_DIRECTORY):
        """Pickle the complete state of this cache into directory.

        The directory is created when it does not exist yet.
        """
        if not os.path.exists(directory):
            os.makedirs(directory)
        with open(self._get_pickle_name(directory), "wb") as f:
            pickle.dump(self.__dict__, f)
        print("wrote cache to disk, most_recent_info= {0}".format(
            datetime.fromtimestamp(self.most_recent_info)
            if self.most_recent_info is not None else None))
91
92
class PhabReview(PhabObject):
    """A review, as cached by this script.

    The constructor is inherited from PhabObject and only stores the id; the
    other attributes appear once update() and setPhabDiffs() are called.
    """

    OBJECT_KIND = "Review"

    def update(self, title, dateCreated, dateModified, author):
        """Store the title, creation/modification time stamps and author."""
        self.title, self.author = title, author
        self.dateCreated, self.dateModified = dateCreated, dateModified

    def setPhabDiffs(self, phabDiffs):
        """Store the diffs that belong to this review."""
        self.phabDiffs = phabDiffs
107
108
class PhabUser(PhabObject):
    """A user, as cached by this script.

    The constructor is inherited from PhabObject and only stores the id; phid
    and realName appear once update() is called.
    """

    OBJECT_KIND = "User"

    def update(self, phid, realName):
        """Store the user's PHID and real name."""
        self.phid, self.realName = phid, realName
118
119
class PhabHunk:
    """One hunk of a change, reduced to the old-file line ranges it touches.

    rest_api_hunk is a mapping with the keys "oldOffset", "oldLength" and
    "corpus". The corpus is split into lines; a line starting with "+" is a
    line introduced by the patch, a line starting with "-" is a line changed
    or removed from the older version, anything else is context.

    After construction, actual_lines_changed_offset holds a list of
    (start, end) line-number pairs in the old file, each covering a run of
    changed lines plus up to three lines of context on either side, with
    overlapping and touching ranges merged.
    """

    def __init__(self, rest_api_hunk):
        self.oldOffset = int(rest_api_hunk["oldOffset"])
        self.oldLength = int(rest_api_hunk["oldLength"])
        context_lines = 3
        ranges = []
        # Start of the run of changed lines being scanned, None outside one.
        run_start = None
        # Line number in the old file that the current corpus line maps to.
        old_line = self.oldOffset
        for text in rest_api_hunk["corpus"].split("\n"):
            added = text.startswith("+")
            if added or text.startswith("-"):
                if run_start is None:
                    # Never start before the first line of the hunk.
                    run_start = max(self.oldOffset, old_line - context_lines)
            elif run_start is not None:
                # A context line ends the current run.
                ranges.append((run_start, old_line + context_lines))
                run_start = None
            if not added:
                # Added lines do not exist in the old file; every other line
                # advances the old-file position.
                old_line += 1
        if run_start is not None:
            ranges.append((run_start, old_line + context_lines))

        # Runs that are close together produce overlapping or touching
        # ranges; fold each such range into its predecessor.
        merged = []
        for start, end in ranges:
            if merged and merged[-1][1] >= start:
                merged[-1] = (merged[-1][0], end)
            else:
                merged.append((start, end))
        self.actual_lines_changed_offset = merged
173
174
class PhabChange:
    """The change made to one file: its old path and its hunks."""

    def __init__(self, rest_api_change):
        # rest_api_change is a mapping with the keys "oldPath" and "hunks".
        self.oldPath = rest_api_change["oldPath"]
        parsed_hunks = []
        for raw_hunk in rest_api_change["hunks"]:
            parsed_hunks.append(PhabHunk(raw_hunk))
        self.hunks = parsed_hunks
179
180
class PhabDiff(PhabObject):
    """A diff, as cached by this script.

    The constructor is inherited from PhabObject and only stores the id; the
    other attributes appear once update() is called.
    """

    OBJECT_KIND = "Diff"

    def update(self, rest_api_results):
        """Copy the fields this script needs out of rest_api_results.

        rest_api_results is a mapping with the keys "revisionID",
        "dateModified", "dateCreated" and "changes".
        """
        info = rest_api_results
        self.revisionID = info["revisionID"]
        self.dateModified = int(info["dateModified"])
        self.dateCreated = int(info["dateCreated"])
        changes = []
        for raw_change in info["changes"]:
            changes.append(PhabChange(raw_change))
        self.changes = changes
192
193
class ReviewsCache(PhabObjectCache):
    """Cache that creates and holds PhabReview objects."""

    def __init__(self):
        PhabObjectCache.__init__(self, PhabReview)
197
198
class UsersCache(PhabObjectCache):
    """Cache that creates and holds PhabUser objects."""

    def __init__(self):
        PhabObjectCache.__init__(self, PhabUser)
202
203
# Module-wide cache instances, shared by the fetching code and by the
# reporting code further down in this file.
reviews_cache = ReviewsCache()
users_cache = UsersCache()
206
207
def init_phab_connection():
    """Create a Phabricator client, refresh its interfaces and return it.

    NOTE(review): where the client takes its host and credentials from is
    decided by the phabricator package, not by this script - confirm against
    that package's documentation.
    """
    connection = Phabricator()
    connection.update_interfaces()
    return connection
212
213
def _timestamp_to_datetime(timestamp):
    """Return the local datetime for an epoch time stamp, None for None."""
    return datetime.fromtimestamp(timestamp) if timestamp is not None else None


def update_cached_info(phab, cache, phab_query, order, record_results,
                       max_nr_entries_per_fetch, max_nr_days_to_cache):
    """Fetch pages of records from Phabricator and record them into cache.

    Arguments:
      phab: object on which the query is looked up.
      cache: cache to fill; it is written to disk after every page.
      phab_query: sequence of attribute names that, looked up one after the
        other starting from phab, yields the callable performing the query.
      order: passed through to the query as its "order" argument.
      record_results: callable(cache, results, phab) that stores one page of
        results and returns (most_recent_info, oldest_info) time stamps for
        that page, or (None, None) when the page had nothing to record.
      max_nr_entries_per_fetch: page size, passed to the query as "limit".
      max_nr_days_to_cache: how far back, counted from the most recent record
        on the first page, records are fetched.

    Fetching stops when there are no more pages, when the records get older
    than the requested number of days, or when the cache's own time stamps
    show that it already covers both the older and the newer data.
    """
    q = phab
    for query_step in phab_query:
        q = getattr(q, query_step)
    limit = max_nr_entries_per_fetch
    results = q(order=order, limit=limit)
    most_recent_info, oldest_info = record_results(cache, results, phab)
    if most_recent_info is None or oldest_info is None:
        # Nothing was recorded, so there is no time stamp to compute the
        # cut-off date from. Leave the cache and its time stamps alone.
        print("No records to cache were returned. Continuing...")
        return
    oldest_info_to_fetch = datetime.fromtimestamp(most_recent_info) - \
        timedelta(days=max_nr_days_to_cache)
    most_recent_info_overall = most_recent_info
    cache.write_cache_to_disk()
    after = results["cursor"]["after"]
    print("after: {0!r}".format(after))
    print("most_recent_info: {0}".format(
        datetime.fromtimestamp(most_recent_info)))
    while (after is not None
           and datetime.fromtimestamp(oldest_info) > oldest_info_to_fetch):
        need_more_older_data = \
            (cache.oldest_info is None or
             datetime.fromtimestamp(cache.oldest_info) > oldest_info_to_fetch)
        print(("need_more_older_data={0} cache.oldest_info={1} " +
               "oldest_info_to_fetch={2}").format(
                   need_more_older_data,
                   _timestamp_to_datetime(cache.oldest_info),
                   oldest_info_to_fetch))
        need_more_newer_data = \
            (cache.most_recent_info is None or
             cache.most_recent_info < most_recent_info)
        print(("need_more_newer_data={0} cache.most_recent_info={1} " +
               "most_recent_info={2}")
              .format(need_more_newer_data, cache.most_recent_info,
                      most_recent_info))
        if not need_more_older_data and not need_more_newer_data:
            break
        results = q(order=order, after=after, limit=limit)
        page_most_recent, page_oldest = record_results(cache, results, phab)
        # A page without any recordable entry carries no time stamps; keep
        # the ones from the previous page in that case.
        if page_most_recent is not None:
            most_recent_info = page_most_recent
        if page_oldest is not None:
            oldest_info = page_oldest
        after = results["cursor"]["after"]
        print("after: {0!r}".format(after))
        print("most_recent_info: {0}".format(
            datetime.fromtimestamp(most_recent_info)))
        cache.write_cache_to_disk()
    cache.most_recent_info = most_recent_info_overall
    if after is None:
        # We did fetch all records. Mark the cache to contain all info since
        # the start of time.
        oldest_info = 0
    cache.oldest_info = oldest_info
    cache.write_cache_to_disk()
264
265
def record_reviews(cache, reviews, phab):
    """Store one page of review search results into cache.

    Entries whose "type" is not "DREV" are skipped. For a review that is new
    to the cache, or that was modified after the cached copy, the diffs of
    the review are fetched through phab.differential.querydiffs and the
    cached review is refreshed.

    Returns a (most_recent_info, oldest_info) pair with the largest and the
    smallest dateModified seen on this page, or (None, None) when no entry
    was considered.
    """
    newest = None
    oldest = None
    for entry in reviews["data"]:
        if entry["type"] != "DREV":
            continue
        review_id = entry["id"]
        fields = entry["fields"]
        modified = int(fields["dateModified"])
        created = int(fields["dateCreated"])
        title = fields["title"]
        author = fields["authorPHID"]
        review = cache.get(review_id)
        # A review freshly created by the cache has no dateModified yet.
        is_stale = ("dateModified" not in review.__dict__
                    or modified > review.dateModified)
        if is_stale:
            raw_diffs = phab.differential.querydiffs(revisionIDs=[review_id])
            diffs = []
            for diff_id in sorted(raw_diffs.keys()):
                diff = PhabDiff(diff_id)
                diff.update(raw_diffs[diff_id])
                diffs.append(diff)
            review.update(title, created, modified, author)
            review.setPhabDiffs(diffs)
            print("Updated D{0} modified on {1} ({2} diffs)".format(
                review_id, datetime.fromtimestamp(modified), len(diffs)))

        if newest is None or newest < modified:
            newest = modified
        if oldest is None or oldest > modified:
            oldest = modified
    return newest, oldest
304
305
def record_users(cache, users, phab):
    """Store one page of user search results into cache.

    Entries whose "type" is not "USER" are skipped; every other entry
    refreshes the PHID and real name of the corresponding cached user. The
    phab argument is not used.

    Returns a (most_recent_info, oldest_info) pair with the largest and the
    smallest dateModified seen on this page, or (None, None) when no entry
    was considered.
    """
    newest = None
    oldest = None
    for entry in users["data"]:
        if entry["type"] != "USER":
            continue
        user_id = entry["id"]
        phid = entry["phid"]
        fields = entry["fields"]
        modified = int(fields["dateModified"])
        real_name = fields["realName"]
        cache.get(user_id).update(phid, real_name)
        if newest is None or newest < modified:
            newest = modified
        if oldest is None or oldest > modified:
            oldest = modified
    return newest, oldest
328
329
# One entry per cache that this script maintains. Each entry is a tuple of:
#   - the cache object to fill,
#   - the attribute names leading from the Phabricator connection to the
#     query to call,
#   - the value passed to the query as "order",
#   - the function recording a page of query results into the cache,
#   - the maximum number of entries to request per fetch,
#   - the maximum number of days of history to cache.
PHABCACHESINFO = ((reviews_cache, ("differential", "revision", "search"),
                   "updated", record_reviews, 5, 7),
                  (users_cache, ("user", "search"), "newest", record_users,
                   100, 1000))
334
335
def load_cache():
    """Load every cache listed in PHABCACHESINFO from disk and report on it."""
    for cache, _, _, _, _, _ in PHABCACHESINFO:
        cache.populate_cache_from_disk()
        name = cache.get_name()
        nr_entries = len(cache.get_ids_in_cache())
        print("Loaded {0} nr entries: {1}".format(name, nr_entries))
        if cache.most_recent_info is None:
            most_recent = None
        else:
            most_recent = datetime.fromtimestamp(cache.most_recent_info)
        print("Loaded {0} has most recent info: {1}".format(
            name, most_recent))
345
346
def update_cache(phab):
    """Load all caches from disk, bring them up to date and save them again.

    phab is the Phabricator connection the updates are fetched through.
    """
    load_cache()
    for cache_info in PHABCACHESINFO:
        (cache, phab_query, order, record_results,
         max_nr_entries_per_fetch, max_nr_days_to_cache) = cache_info
        update_cached_info(phab, cache, phab_query, order, record_results,
                           max_nr_entries_per_fetch, max_nr_days_to_cache)
        nr_objects = len(cache.get_ids_in_cache())
        print("{0} objects in {1}".format(nr_objects, cache.get_name()))
        cache.write_cache_to_disk()
356
357
def get_most_recent_reviews(days):
    """Return the cached reviews modified in the last `days` days.

    The window is measured back from the most recently modified review in the
    cache, not from the current time. The result is ordered from most to
    least recently modified.
    """
    def newest_first(review):
        return -review.dateModified

    ordered = sorted(reviews_cache.get_objects(), key=newest_first)
    if not ordered:
        return ordered
    cut_off_date = \
        datetime.fromtimestamp(ordered[0].dateModified) - timedelta(days=days)
    recent = []
    for review in ordered:
        if datetime.fromtimestamp(review.dateModified) < cut_off_date:
            # Everything from here on is older still.
            break
        recent.append(review)
    return recent
372
373
374# All of the above code is about fetching data from Phabricator and caching it
375# on local disk. The below code contains the actual "business logic" for this
376# script.
377
# Lazily built mapping from user PHID to real name; None until first use.
_userphid2realname = None


def get_real_name_from_author(user_phid):
    """Return the real name of the user with the given PHID.

    The lookup table is built from users_cache on the first call and reused
    afterwards. "unknown" is returned for a PHID that is not in the table.
    """
    global _userphid2realname
    if _userphid2realname is None:
        _userphid2realname = {}
        _userphid2realname.update(
            (user.phid, user.realName) for user in users_cache.get_objects())
    return _userphid2realname.get(user_phid, "unknown")
388
389
def print_most_recent_reviews(phab, days, filter_reviewers):
    """Print and return a report matching recent reviews to reviewers.

    Arguments:
      phab: not used by this function.
      days: size, in days, of the window of recently modified reviews to
        analyze.
      filter_reviewers: callable that takes a list of (reviewer, scores)
        pairs and returns the pairs to report on.

    The report has one section listing the reviews with their potential
    reviewers and one section summarizing the reviews per reviewer. Every
    line is printed as it is produced; the complete report is returned as a
    single string.
    """
    msgs = []

    def add_msg(msg):
        msgs.append(msg)
        print(msg)

    newest_reviews = get_most_recent_reviews(days)
    add_msg(u"These are the reviews that look interesting to be reviewed. " +
            u"The report below has 2 sections. The first " +
            u"section is organized per review; the second section is organized "
            + u"per potential reviewer.\n")
    oldest_review = newest_reviews[-1] if len(newest_reviews) > 0 else None
    oldest_datetime = \
        datetime.fromtimestamp(oldest_review.dateModified) \
        if oldest_review else None
    add_msg((u"The report below is based on analyzing the reviews that got " +
             u"touched in the past {0} days (since {1}). " +
             u"The script found {2} such reviews.\n").format(
                 days, oldest_datetime, len(newest_reviews)))
    reviewer2reviews_and_scores = {}
    for i, review in enumerate(newest_reviews):
        # The lookup may yield nothing at all for a review without diffs;
        # treat that the same as "no potential reviewers" instead of handing
        # a non-list to the filter.
        matched_reviewers = find_reviewers_for_review(review) or []
        matched_reviewers = filter_reviewers(matched_reviewers)
        if not matched_reviewers:
            continue
        add_msg((u"{0:>3}. https://reviews.llvm.org/D{1} by {2}\n     {3}\n" +
                 u"     Last updated on {4}").format(
                     i, review.id,
                     get_real_name_from_author(review.author), review.title,
                     datetime.fromtimestamp(review.dateModified)))
        for reviewer, scores in matched_reviewers:
            add_msg(u"    potential reviewer {0}, score {1}".format(
                reviewer,
                "(" + "/".join(["{0:.1f}%".format(s) for s in scores]) + ")"))
            reviewer2reviews_and_scores.setdefault(reviewer, []).append(
                (review, scores))

    # Print out a summary per reviewer.
    for reviewer in sorted(reviewer2reviews_and_scores.keys()):
        reviews_and_scores = reviewer2reviews_and_scores[reviewer]
        # Best matching reviews first.
        reviews_and_scores.sort(key=lambda rs: rs[1], reverse=True)
        add_msg(u"\n\nSUMMARY FOR {0} (found {1} reviews):".format(
            reviewer, len(reviews_and_scores)))
        for review, scores in reviews_and_scores:
            add_msg(u"[{0}] https://reviews.llvm.org/D{1} '{2}' by {3}".format(
                "/".join(["{0:.1f}%".format(s) for s in scores]), review.id,
                review.title, get_real_name_from_author(review.author)))
    return "\n".join(msgs)
440
441
def get_git_cmd_output(cmd):
    """Run cmd through the shell and return its output as text.

    Standard error is folded into the output. Bytes that are not valid UTF-8
    are dropped while decoding. None is returned when the command exits with
    a non-zero status; the failure is only logged at debug level.
    """
    logging.debug(cmd)
    try:
        raw_output = subprocess.check_output(
            cmd, shell=True, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        logging.debug(str(e))
        return None
    if raw_output is None:
        return None
    return raw_output.decode("utf-8", errors='ignore')
453
454
# Matches the "author-mail <address>" lines of the blame output and captures
# the address between the angle brackets.
reAuthorMail = re.compile("^author-mail <([^>]*)>.*$")


def parse_blame_output_line_porcelain(blame_output):
    """Count how often each author email address occurs in blame_output.

    blame_output is the text to scan, or None. Returns a dictionary mapping
    each address found on an "author-mail" line to its number of
    occurrences; the dictionary is empty for None or when nothing matches.
    """
    counts = {}
    if blame_output is None:
        return counts
    for line in blame_output.split('\n'):
        found = reAuthorMail.match(line)
        if found is None:
            continue
        address = found.group(1)
        counts[address] = counts.get(address, 0) + 1
    return counts
471
472
def find_reviewers_for_diff_heuristic(diff):
    """Return potential reviewers for diff, with their match scores.

    Heuristic 1: assume good reviewers are the ones that touched the same
    lines before as this patch is touching.
    Heuristic 2: assume good reviewers are the ones that touched the same
    files before as this patch is touching.

    Both are computed with git blame on the local checkout in
    git_repos/llvm. The result is a list of
    (email address, (lines score, files score)) tuples, both scores being
    percentages, sorted from best to worst match. The list is empty when the
    revision to blame against cannot be determined.
    """
    # The quoting helper lives in shlex on Python 3 and in pipes on Python 2.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    reviewers2nr_lines_touched = {}
    reviewers2nr_files_touched = {}
    # Assume last revision before diff was modified is the revision the diff
    # applies to.
    git_repo = "git_repos/llvm"
    # %S is the seconds field; lower-case %s is not a portable directive.
    cmd = 'git -C {0} rev-list -n 1 --before="{1}" master'.format(
        git_repo,
        datetime.fromtimestamp(
            diff.dateModified).strftime("%Y-%m-%d %H:%M:%S"))
    rev_list_output = get_git_cmd_output(cmd)
    if rev_list_output is None:
        # The git command failed, so there is nothing to blame against.
        logging.debug("Could not determine base revision")
        return []
    base_revision = rev_list_output.strip()
    logging.debug("Base revision={0}".format(base_revision))
    for change in diff.changes:
        path = change.oldPath
        if not path:
            # There is no old file to blame; formatting the command anyway
            # would only hand a meaningless path to git.
            continue
        # The path comes from the diff, not from this script: quote it so
        # that the shell cannot interpret any character in it.
        quoted_path = quote(path)
        # Compute heuristic 1: look at context of patch lines.
        for hunk in change.hunks:
            for start_line, end_line in hunk.actual_lines_changed_offset:
                # Collect git blame results for authors in those ranges.
                cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e " +
                       "-w --line-porcelain -L {1},{2} {3} -- {4}").format(
                           git_repo, start_line, end_line, base_revision,
                           quoted_path)
                blame_output = get_git_cmd_output(cmd)
                for reviewer, nr_occurences in \
                        parse_blame_output_line_porcelain(blame_output).items():
                    reviewers2nr_lines_touched[reviewer] = \
                        reviewers2nr_lines_touched.get(reviewer, 0) + \
                        nr_occurences
        # Compute heuristic 2: don't look at context, just at files touched.
        # Collect git blame results for all authors of the file.
        cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e -w " +
               "--line-porcelain {1} -- {2}").format(git_repo, base_revision,
                                                     quoted_path)
        blame_output = get_git_cmd_output(cmd)
        # Every author counts once per file, however many lines they wrote.
        for reviewer in parse_blame_output_line_porcelain(blame_output):
            reviewers2nr_files_touched[reviewer] = \
                reviewers2nr_files_touched.get(reviewer, 0) + 1

    # Compute "match scores"
    total_nr_lines = sum(reviewers2nr_lines_touched.values())
    total_nr_files = len(diff.changes)
    reviewers_matchscores = []
    for reviewer, nr_files in reviewers2nr_files_touched.items():
        lines_score = \
            (reviewers2nr_lines_touched.get(reviewer, 0)*100.0/total_nr_lines
             if total_nr_lines != 0 else 0)
        files_score = \
            (nr_files*100.0/total_nr_files
             if total_nr_files != 0 else 0)
        reviewers_matchscores.append((reviewer, (lines_score, files_score)))
    reviewers_matchscores.sort(key=lambda i: i[1], reverse=True)
    return reviewers_matchscores
529
530
def find_reviewers_for_review(review):
    """Return the potential reviewers for the newest diff of review.

    The result is the list produced by find_reviewers_for_diff_heuristic for
    the most recently modified diff. A review without diffs yields an empty
    list, so that callers can always iterate over the result.
    """
    if len(review.phabDiffs) == 0:
        return []
    # Only the newest diff is analyzed; on equal time stamps the first one in
    # the list wins.
    diff = max(review.phabDiffs, key=lambda d: d.dateModified)
    matched_reviewers = find_reviewers_for_diff_heuristic(diff)
    # Show progress, as this is a slow operation:
    sys.stdout.write('.')
    sys.stdout.flush()
    logging.debug(u"matched_reviewers: {0}".format(matched_reviewers))
    return matched_reviewers
544
545
def update_git_repos():
    """Clone the repositories in GIT_REPO_METADATA if needed, then pull them.

    Every repository lives in a directory named after it below "git_repos".
    The output of the git commands is discarded.
    """
    base_directory = "git_repos"
    for name, url in GIT_REPO_METADATA:
        checkout = os.path.join(base_directory, name)
        if not os.path.exists(checkout):
            get_git_cmd_output("git clone {0} {1}".format(url, checkout))
        get_git_cmd_output("git -C {0} pull --rebase".format(checkout))
555
556
def send_emails(email_addresses, sender, msg):
    """Email msg to each of email_addresses, as a separate message each.

    Arguments:
      email_addresses: iterable of recipient addresses.
      sender: address to put in the 'From' header.
      msg: text of the report; it is sent as UTF-8 plain text.

    The connection is made with smtplib's defaults (see the smtplib
    documentation for SMTP.connect) and is closed again even when sending
    fails.
    """
    # MIMEText wants native text on Python 3; on Python 2, where the report
    # is a unicode object rather than a str, it has to be encoded first.
    body = msg if isinstance(msg, str) else msg.encode('utf-8')
    s = smtplib.SMTP()
    s.connect()
    try:
        for email_address in email_addresses:
            email_msg = email.mime.multipart.MIMEMultipart()
            email_msg['From'] = sender
            email_msg['To'] = email_address
            email_msg['Subject'] = 'LLVM patches you may be able to review.'
            # Declare the charset, so that the receiving side decodes
            # non-ASCII names and titles correctly.
            email_msg.attach(
                email.mime.text.MIMEText(body, 'plain', 'utf-8'))
            # python 3.x: s.send_message(email_msg)
            s.sendmail(email_msg['From'], email_msg['To'],
                       email_msg.as_string())
    finally:
        s.quit()
569
570
def filter_reviewers_to_report_for(people_to_look_for):
    """Return a filter keeping only the reviewers in people_to_look_for.

    The returned callable takes a list of entries whose first element is the
    reviewer and returns a new list holding the entries whose reviewer is in
    people_to_look_for, in their original order.

    This is just an example filter, to only report potential reviews to do
    for the people that will receive the report email.
    """
    def keep_wanted(potential_reviewers):
        wanted = []
        for entry in potential_reviewers:
            if entry[0] in people_to_look_for:
                wanted.append(entry)
        return wanted

    return keep_wanted
576
577
def main():
    """Parse the command line, refresh the data and produce the report.

    Unless --no-update-cache is given, the cached Phabricator objects are
    updated first. The git checkouts are then updated, the report is printed
    and, when --email-report lists any address, emailed to those addresses.
    """
    parser = argparse.ArgumentParser(
        description='Match open reviews to potential reviewers.')
    parser.add_argument(
        '--no-update-cache',
        dest='update_cache',
        action='store_false',
        default=True,
        help='Do not update cached Phabricator objects')
    # The list options default to an empty list, so that leaving them out
    # behaves the same as passing them without any address.
    parser.add_argument(
        '--email-report',
        dest='email_report',
        nargs='*',
        default=[],
        help="A email addresses to send the report to.")
    parser.add_argument(
        '--sender',
        dest='sender',
        default="",
        help="The email address to use in 'From' on messages emailed out.")
    parser.add_argument(
        '--email-addresses',
        dest='email_addresses',
        nargs='*',
        default=[],
        help="The email addresses (as known by LLVM git) of " +
        "the people to look for reviews for.")
    # Without a default, the count is None when -v is not given, and None
    # cannot be compared to a number on Python 3.
    parser.add_argument('--verbose', '-v', action='count', default=0)

    args = parser.parse_args()

    if args.verbose >= 1:
        logging.basicConfig(level=logging.DEBUG)

    # Command line arguments are bytes on Python 2 and text on Python 3;
    # only the former need decoding.
    people_to_look_for = [
        e.decode('utf-8') if isinstance(e, bytes) else e
        for e in args.email_addresses]
    logging.debug("Will look for reviews that following contributors could " +
                  "review: {}".format(people_to_look_for))
    logging.debug("Will email a report to: {}".format(args.email_report))

    phab = init_phab_connection()

    if args.update_cache:
        update_cache(phab)

    load_cache()
    update_git_repos()
    msg = print_most_recent_reviews(
        phab,
        days=1,
        filter_reviewers=filter_reviewers_to_report_for(people_to_look_for))

    # Only talk to the mail server when there is somebody to send to.
    if args.email_report:
        send_emails(args.email_report, args.sender, msg)


if __name__ == "__main__":
    main()
634