poky/scripts/contrib/patchreview.py

#! /usr/bin/env python3
#
# Copyright OpenEmbedded Contributors
#
# SPDX-License-Identifier: GPL-2.0-only
#

import argparse
import collections
import json
import os
import os.path
import pathlib
import re
import subprocess

import sys
sys.path.append(os.path.join(sys.path[0], '../../meta/lib'))
import oe.qa

# TODO
# - option to just list all broken files
# - test suite
# - validate signed-off-by

status_values = ("accepted", "pending", "inappropriate", "backport", "submitted", "denied", "inactive-upstream")

class Result:
    # Whether the patch has an Upstream-Status or not
    missing_upstream_status = False
    # If the Upstream-Status tag is malformed in some way (string for bad bit)
    malformed_upstream_status = None
    # If the Upstream-Status value is unknown (boolean)
    unknown_upstream_status = False
    # The upstream status value (Pending, etc)
    upstream_status = None
    # Whether the patch has a Signed-off-by or not
    missing_sob = False
    # Whether the Signed-off-by tag is malformed in some way
    malformed_sob = False
    # The Signed-off-by tag value
    sob = None
    # Whether a patch looks like a CVE but doesn't have a CVE tag
    missing_cve = False

def blame_patch(patch):
    """
    From a patch filename, return a list of "commit summary (author name <author
    email>)" strings representing the history.
    """
    return subprocess.check_output(("git", "log",
                                    "--follow", "--find-renames", "--diff-filter=A",
                                    "--format=%s (%aN <%aE>)",
                                    "--", patch), cwd=os.path.dirname(patch)).decode("utf-8").splitlines()

def patchreview(patches):

    # General pattern: start of line, optional whitespace, tag with optional
    # hyphen or spaces, maybe a colon, some whitespace, then the value, all case
    # insensitive.
    sob_re = re.compile(r"^[\t ]*(Signed[-_ ]off[-_ ]by:?)[\t ]*(.+)", re.IGNORECASE | re.MULTILINE)
    status_re = re.compile(r"^[\t ]*(Upstream[-_ ]Status:?)[\t ]*([\w-]*)", re.IGNORECASE | re.MULTILINE)
    cve_tag_re = re.compile(r"^[\t ]*(CVE:)[\t ]*(.*)", re.IGNORECASE | re.MULTILINE)
    cve_re = re.compile(r"cve-[0-9]{4}-[0-9]{4,6}", re.IGNORECASE)

    results = {}

    for patch in patches:

        result = Result()
        results[patch] = result

        content = open(patch, encoding='ascii', errors='ignore').read()

        # Find the Signed-off-by tag
        match = sob_re.search(content)
        if match:
            value = match.group(1)
            if value != "Signed-off-by:":
                result.malformed_sob = value
            result.sob = match.group(2)
        else:
            result.missing_sob = True

        # Find the Upstream-Status tag
        match = status_re.search(content)
        if match:
            value = oe.qa.check_upstream_status(patch)
            if value:
                result.malformed_upstream_status = value

            value = match.group(2).lower()
            # TODO: check case
            if value not in status_values:
                result.unknown_upstream_status = True
            result.upstream_status = value
        else:
            result.missing_upstream_status = True

        # Check that patches which looks like CVEs have CVE tags
        if cve_re.search(patch) or cve_re.search(content):
            if not cve_tag_re.search(content):
                result.missing_cve = True
        # TODO: extract CVE list

    return results


def analyse(results, want_blame=False, verbose=True):
    """
    want_blame: display blame data for each malformed patch
    verbose: display per-file results instead of just summary
    """

    # want_blame requires verbose, so disable blame if we're not verbose
    if want_blame and not verbose:
        want_blame = False

    total_patches = 0
    missing_sob = 0
    malformed_sob = 0
    missing_status = 0
    malformed_status = 0
    missing_cve = 0
    pending_patches = 0

    for patch in sorted(results):
        r = results[patch]
        total_patches += 1
        need_blame = False

        # Build statistics
        if r.missing_sob:
            missing_sob += 1
        if r.malformed_sob:
            malformed_sob += 1
        if r.missing_upstream_status:
            missing_status += 1
        if r.malformed_upstream_status or r.unknown_upstream_status:
            malformed_status += 1
            # Count patches with no status as pending
            pending_patches +=1
        if r.missing_cve:
            missing_cve += 1
        if r.upstream_status == "pending":
            pending_patches += 1

        # Output warnings
        if r.missing_sob:
            need_blame = True
            if verbose:
                print("Missing Signed-off-by tag (%s)" % patch)
        if r.malformed_sob:
            need_blame = True
            if verbose:
                print("Malformed Signed-off-by '%s' (%s)" % (r.malformed_sob, patch))
        if r.missing_cve:
            need_blame = True
            if verbose:
                print("Missing CVE tag (%s)" % patch)
        if r.missing_upstream_status:
            need_blame = True
            if verbose:
                print("Missing Upstream-Status tag (%s)" % patch)
        if r.malformed_upstream_status:
            need_blame = True
            if verbose:
                print("Malformed Upstream-Status '%s' (%s)" % (r.malformed_upstream_status, patch))
        if r.unknown_upstream_status:
            need_blame = True
            if verbose:
                print("Unknown Upstream-Status value '%s' (%s)" % (r.upstream_status, patch))

        if want_blame and need_blame:
            print("\n".join(blame_patch(patch)) + "\n")

    def percent(num):
        try:
            return "%d (%d%%)" % (num, round(num * 100.0 / total_patches))
        except ZeroDivisionError:
            return "N/A"

    if verbose:
        print()

    print("""Total patches found: %d
Patches missing Signed-off-by: %s
Patches with malformed Signed-off-by: %s
Patches missing CVE: %s
Patches missing Upstream-Status: %s
Patches with malformed Upstream-Status: %s
Patches in Pending state: %s""" % (total_patches,
                                   percent(missing_sob),
                                   percent(malformed_sob),
                                   percent(missing_cve),
                                   percent(missing_status),
                                   percent(malformed_status),
                                   percent(pending_patches)))


def histogram(results):
    from toolz import recipes, dicttoolz
    import math

    counts = recipes.countby(lambda r: r.upstream_status, results.values())
    bars = dicttoolz.valmap(lambda v: "#" * int(math.ceil(float(v) / len(results) * 100)), counts)
    for k in bars:
        print("%-20s %s (%d)" % (k.capitalize() if k else "No status", bars[k], counts[k]))

def find_layers(candidate):
    # candidate can either be the path to a layer directly (eg meta-intel), or a
    # repository that contains other layers (meta-arm). We can determine what by
    # looking for a conf/layer.conf file. If that file exists then it's a layer,
    # otherwise its a repository of layers and we can assume they're called
    # meta-*.

    if (candidate / "conf" / "layer.conf").exists():
        return [candidate.absolute()]
    else:
        return [d.absolute() for d in candidate.iterdir() if d.is_dir() and (d.name == "meta" or d.name.startswith("meta-"))]

# TODO these don't actually handle dynamic-layers/

def gather_patches(layers):
    patches = []
    for directory in layers:
        filenames = subprocess.check_output(("git", "-C", directory, "ls-files", "recipes-*/**/*.patch", "recipes-*/**/*.diff"), universal_newlines=True).split()
        patches += [os.path.join(directory, f) for f in filenames]
    return patches

def count_recipes(layers):
    count = 0
    for directory in layers:
        output = subprocess.check_output(["git", "-C", directory, "ls-files", "recipes-*/**/*.bb"], universal_newlines=True)
        count += len(output.splitlines())
    return count

if __name__ == "__main__":
    args = argparse.ArgumentParser(description="Patch Review Tool")
    args.add_argument("-b", "--blame", action="store_true", help="show blame for malformed patches")
    args.add_argument("-v", "--verbose", action="store_true", help="show per-patch results")
    args.add_argument("-g", "--histogram", action="store_true", help="show patch histogram")
    args.add_argument("-j", "--json", help="update JSON")
    args.add_argument("directory", type=pathlib.Path, metavar="DIRECTORY", help="directory to scan (layer, or repository of layers)")
    args = args.parse_args()

    layers = find_layers(args.directory)
    print(f"Found layers {' '.join((d.name for d in layers))}")
    patches = gather_patches(layers)
    results = patchreview(patches)
    analyse(results, want_blame=args.blame, verbose=args.verbose)

    if args.json:
        if os.path.isfile(args.json):
            data = json.load(open(args.json))
        else:
            data = []

        row = collections.Counter()
        row["total"] = len(results)
        row["date"] = subprocess.check_output(["git", "-C", args.directory, "show", "-s", "--pretty=format:%cd", "--date=format:%s"], universal_newlines=True).strip()
        row["commit"] = subprocess.check_output(["git", "-C", args.directory, "rev-parse", "HEAD"], universal_newlines=True).strip()
        row['commit_count'] = subprocess.check_output(["git", "-C", args.directory, "rev-list", "--count", "HEAD"], universal_newlines=True).strip()
        row['recipe_count'] = count_recipes(layers)

        for r in results.values():
            if r.upstream_status in status_values:
                row[r.upstream_status] += 1
            if r.malformed_upstream_status or r.missing_upstream_status:
                row['malformed-upstream-status'] += 1
            if r.malformed_sob or r.missing_sob:
                row['malformed-sob'] += 1

        data.append(row)
        json.dump(data, open(args.json, "w"), sort_keys=True, indent="\t")

    if args.histogram:
        print()
        histogram(results)