326 lines
14 KiB
Perl
Executable file
326 lines
14 KiB
Perl
Executable file
#!/usr/bin/perl -w
|
|
# extract-code-added-in-commits.plx -*- Perl -*-
|
|
#
|
|
# Copyright (C) 2016 Bradley M. Kuhn <bkuhn@ebb.org>
|
|
#
|
|
# This software's license gives you freedom; you can copy, convey,
|
|
# propogate, redistribute and/or modify this program under the terms of
|
|
# the GNU General Public License (GPL) as published by the Free
|
|
# Software Foundation (FSF), either version 3 of the License, or (at your
|
|
# option) any later version of the GPL published by the FSF.
|
|
#
|
|
# This program is distributed in the hope that it will be useful, but
|
|
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
# General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program in a file in the toplevel directory called
|
|
# "GPLv3". If not, see <http://www.gnu.org/licenses/>.
|
|
#
|
|
# Motivation for this script:
|
|
|
|
# This script takes as standard input a list of commit ids. This is called
|
|
# the "whitelisted commits" for the process.
|
|
|
|
# The output is a series of directories for each COMMIT_ID (all placed under
|
|
# the directory specified in $ARGV[1]). Under each COMMIT_ID directory,
|
|
# there is a redacted copy of the files specifically changed or added by the
|
|
# operations perfomed in COMMIT_ID. The redcated copy will contain only
|
|
# lines that were added or changed in that file by any operation in the
|
|
# "whitelisted commits".
|
|
|
|
# Motivation for this process:
|
|
|
|
# The idea is to create a corpus of code that we know received
|
|
# contributions from the whitelisted commits. Note that across the various
|
|
# COMMIT_ID directories, there will be substantial duplication. However,
|
|
# the full corpus requires building but in some cases, where code has been
|
|
# rewritten.
|
|
|
|
# That is the "comprehensive mode" of this script. There is also the
|
|
# "central commit" mode. In the central commit mode, to speed up, *one*
|
|
# specific commit is favored for the blame data gathering.
|
|
|
|
# Ultimately, this is input to a process that will compare the output to
|
|
# another codebase to see if material from these commits appear in the other
|
|
# codebase. Use the comprehensive mode if you don't know when the other
|
|
# codebase forked from the one studied here, and use the "central commit"
|
|
# mode if you're already sure where they forked.
|
|
|
|
# Clear Flaw in this process:
|
|
|
|
# In "Estimating the Total Cost of a Linux Distribution", found at
|
|
# https://www.linuxfoundation.org/sites/main/files/publications/estimatinglinux.html,
|
|
# McPherson, Proffitt, and Hale-Evans write:
|
|
|
|
# The biggest weakness in SLOC analysis is its focus on net additions to
|
|
# software projects. Anyone who is familiar with kernel development, for
|
|
# instance, realizes that the highest man-power cost in its development is
|
|
# when code is deleted and modified. The amount of effort that goes into
|
|
# deleting and changing code, not just adding to it, is not reflected in
|
|
# the values associated with this estimate. Because in a collaborative
|
|
# development model, code is developed and then changed and deleted, the
|
|
# true value is far greater than the existing code base. Just think about
|
|
# the process: when a few lines of code are added to the kernel, for
|
|
# instance, many more have to be modified to be compatible with that
|
|
# change. The work that goes into understanding the dependencies and
|
|
# outcomes and then changing that code is not well represented in this
|
|
# study.
|
|
|
|
# Therefore, this process, which ignores lines that are *deleted*, thus
|
|
# streamlining and improving code, ignore a fundamental tenant of software
|
|
# development: that making code smaller, more expressive, and more concise
|
|
# yeilds better designed software. While the process herein *can* produce a
|
|
# clear list of code whose known introduction is directly attributable to the
|
|
# whitelisted commits, the analysis produced by this process does not do
|
|
# justice to the full weight of the contributions made in those whitelisted
|
|
# commits, since removed code is outright ignored in this process.
|
|
|
|
# In other words, this process measures only quantity of code written and
|
|
# fails to examine the quality of the code.
|
|
|
|
use strict;
|
|
use warnings;
|
|
|
|
use Git::Repository 'Log';
|
|
use POSIX ":sys_wait_h";
|
|
use File::Spec;
|
|
use File::Path qw(make_path remove_tree);
|
|
use autodie qw(:all);
|
|
use POSIX qw(strftime);
|
|
use Getopt::Long;
|
|
use Pod::Usage;
|
|
|
|
my($GIT_REPOSITORY_PATH, $OUTPUT_DIRECTORY, $CENTRAL_COMMIT, $FORK_LIMIT, $PROGRESS, $VERBOSE, @ADDITIONAL_BLAME_OPTS);
|
|
$PROGRESS = $VERBOSE = 0;
|
|
$FORK_LIMIT = 1;
|
|
|
|
my $usage = "usage: $0 --repository=PATH --output-dir=DIR [--central-commit=COMMIT-ID] [--fork-limit=NUMBER [--progress] [--verbose[=LEVEL]]\n";
|
|
unless (GetOptions("repository=s" => \$GIT_REPOSITORY_PATH,
|
|
"output-dir=s" => \$OUTPUT_DIRECTORY,
|
|
"verbose:+" => \$VERBOSE,
|
|
"progress" => \$PROGRESS,
|
|
"--blame-opts=s" => \@ADDITIONAL_BLAME_OPTS,
|
|
"central-commit:s" => \$CENTRAL_COMMIT,
|
|
"fork-limit:i" => \$FORK_LIMIT)) {
|
|
print STDERR $usage;
|
|
exit 1;
|
|
}
|
|
|
|
if (not defined $GIT_REPOSITORY_PATH) {
|
|
print STDERR "--repository is a required command line argument.\n";
|
|
print STDERR $usage;
|
|
exit 1;
|
|
}
|
|
if (not defined $OUTPUT_DIRECTORY) {
|
|
print STDERR "--output-dir is a required command line argument.\n";
|
|
print STDERR $usage;
|
|
exit 1;
|
|
}
|
|
|
|
my $LOG_DIR = File::Spec->catfile($OUTPUT_DIRECTORY, ".logs");
|
|
remove_tree($LOG_DIR) if -d $LOG_DIR;
|
|
make_path($LOG_DIR, {mode => 0750});
|
|
die "The directory, $OUTPUT_DIRECTORY, must be a writeable directory" unless -d $OUTPUT_DIRECTORY and -w $OUTPUT_DIRECTORY;
|
|
die "The log directory, $LOG_DIR, must be a writeable directory" unless -d $LOG_DIR and -w $LOG_DIR;
|
|
|
|
##############################################################################
|
|
# Read STDIN for whitelist commits
|
|
|
|
my %WHITELIST_COMMIT_IDS;
|
|
|
|
# Snarf in data
|
|
|
|
while (my $line = <STDIN>) {
|
|
chomp $line;
|
|
die "badly formatted commit ID: $line" unless $line =~ /^[a-z0-9]{40,40}$/;
|
|
$WHITELIST_COMMIT_IDS{$line} = $line;
|
|
}
|
|
|
|
##############################################################################
|
|
# Set up child processing and signal commits.
|
|
|
|
my %childProcesses;
|
|
my %finishedOperations;
|
|
|
|
my $HANDLE_BLAME_WORKERS_SUB = sub {
|
|
# don't change $! and $? outside handler
|
|
local ($!, $?);
|
|
while ( (my $pid = waitpid(-1, WNOHANG)) > 0 ) {
|
|
my($errCode, $errString) = ($?, $!);
|
|
my $commitId = $childProcesses{$pid};
|
|
my $now = strftime("%Y-%m-%d %H:%M:%S", localtime);
|
|
print STDERR "Finished operation $commitId in $pid ($errCode, \"$errString\") at $now\n" if $VERBOSE;
|
|
$finishedOperations{$commitId} = { pid => $pid, time => $now, errCode => $errCode, errString => $errString };
|
|
delete $childProcesses{$pid};
|
|
}
|
|
};
|
|
##############################################################################
|
|
sub StartChildLog($$) {
|
|
my($operation, $pid) = @_;
|
|
my $logFile = File::Spec->catfile($LOG_DIR, "${operation}.${pid}.log");
|
|
open(my $fh, ">", $logFile);
|
|
my $now = strftime("%Y-%m-%d %H:%M:%S", localtime);
|
|
print $fh "Started $operation in $pid at $now\n";
|
|
return $fh;
|
|
}
|
|
##############################################################################
|
|
sub EndChildLog($$$) {
|
|
my($fh, $operation, $pid) = @_;
|
|
my $now = strftime("%Y-%m-%d %H:%M:%S", localtime);
|
|
print $fh "Finished $operation in $pid at $now\n";
|
|
close $fh;
|
|
}
|
|
##############################################################################
|
|
# ProcessCommit is the primary function that processes a commit to generate
|
|
# the blame data. If $fileListRef is defined, it should be a list reference,
|
|
# where the list contains a list of pathnames to run git blame on. If it is
|
|
# undefined, then the file list will be chosen from the commit id
|
|
|
|
sub ProcessCommit($$;$) {
|
|
my($commitId, $pid, $fileListRef) = @_;
|
|
my $fh = StartChildLog($commitId, $pid);
|
|
sleep 5;
|
|
EndChildLog($fh, $commitId, $pid);
|
|
}
|
|
##############################################################################
|
|
sub GitBlameDataToFile($$) {
|
|
my($filename, $dataListRef) = @_;
|
|
print STDERR "Writing git blame data to $filename\n" if $VERBOSE >= 6;
|
|
|
|
open(SPARSE_FILE, ">", $filename) or die "Unable to open $filename: $!";
|
|
|
|
foreach my $line (@$dataListRef) {
|
|
die "invalid line: $line in blame output" unless ($line =~
|
|
/^\s*(\S+)\s+\S+\s+\d+\s+\((.+)\s+(\d{4,4}\-\d{2,2}\-\d{2,2}\s+\d{2,2}:\d{2,2}:\d{2,2})\s+([\+\-\d]+)\s+(\d+)\s*\)\s+(.*)$/);
|
|
my($commitID, $name, $date, $tz, $curLineNumber, $actualCurrentLine) = ($1, $2, $3, $4, $5, $6);
|
|
if (defined $WHITELIST_COMMIT_IDS{$commitID}) {
|
|
print SPARSE_FILE "$actualCurrentLine\n";
|
|
} else {
|
|
print SPARSE_FILE "\n";
|
|
}
|
|
}
|
|
close SPARSE_FILE;
|
|
}
|
|
##############################################################################
|
|
sub RunCentralCommitMode($) {
|
|
my($centralCommitId) = @_;
|
|
|
|
my $centralOutputDir = File::Spec->catfile($OUTPUT_DIRECTORY, $centralCommitId);
|
|
make_path($centralOutputDir, {mode => 0750});
|
|
|
|
print "Creating Repository object with args $GIT_REPOSITORY_PATH\n" if ($VERBOSE >= 6);
|
|
my $gitRepository = Git::Repository->new(git_dir => $GIT_REPOSITORY_PATH);
|
|
|
|
my %files;
|
|
foreach my $commitId (keys %WHITELIST_COMMIT_IDS) {
|
|
my(@commitFiles) = $gitRepository->run('show', '--pretty=format:', '--name-only', $commitId);
|
|
foreach my $file (@commitFiles) {
|
|
$files{$file} = $commitId if not defined $files{$file};
|
|
}
|
|
}
|
|
$SIG{CHLD} = $HANDLE_BLAME_WORKERS_SUB;
|
|
my $cnt = scalar(keys %files);
|
|
print STDERR "Git blame operations to complete: $cnt\n" if $PROGRESS;
|
|
foreach my $file (keys %files) {
|
|
if ($PROGRESS) {
|
|
print STDERR "."; $cnt--; if ($cnt % 100 == 0) { print "\nOperations Remaining: $cnt\n"; }
|
|
}
|
|
my($vv, $path, $filename) = File::Spec->splitpath($file);
|
|
$path = File::Spec->catfile($centralOutputDir, $path);
|
|
make_path($path, 0750);
|
|
my $remainingCount = scalar(keys %childProcesses);
|
|
while ($remainingCount >= $FORK_LIMIT) {
|
|
print STDERR "Sleep a bit while $remainingCount children going for these commits ",
|
|
join(", ", sort values %childProcesses), "\n" if $VERBOSE;
|
|
sleep 10;
|
|
$remainingCount = scalar(keys %childProcesses);
|
|
}
|
|
my $forkCount = scalar(keys %childProcesses) + 1;
|
|
my $pid = fork();
|
|
die "cannot fork: $!" unless defined $pid;
|
|
if ($pid == 0) { # The new child process is here
|
|
$SIG{CHLD} = 'DEFAULT';
|
|
my $logFH = StartChildLog($filename, $$);
|
|
$0 = "$path/$filename git blame subprocess";
|
|
my(@blameData);
|
|
print $logFH "running: git",
|
|
'blame', '-w', '-f', '-n', '-l', @ADDITIONAL_BLAME_OPTS, $centralCommitId, '--', $file, "\n";
|
|
eval {
|
|
@blameData = $gitRepository->run('blame', '-w', '-f', '-n', '-l', @ADDITIONAL_BLAME_OPTS,
|
|
$centralCommitId, '--', $file);
|
|
};
|
|
my $err = $@;
|
|
if (defined $err and $err =~ /fatal.*no\s+such\s+path/) {
|
|
# ignore this file; it isn't present anymore in the central commit.
|
|
} elsif (defined $err and $err !~ /^\s*$/) {
|
|
print $logFH "unrecoverable git blame error: $err";
|
|
} else {
|
|
my $f = File::Spec->catfile($path, $filename);
|
|
GitBlameDataToFile($f, \@blameData);
|
|
}
|
|
EndChildLog($logFH, $filename, $$);
|
|
exit 0;
|
|
} else { # The parent is here
|
|
$childProcesses{$pid} = $file;
|
|
print STDERR "Launched $forkCount child to handle $filename\n" if $VERBOSE;
|
|
}
|
|
}
|
|
}
|
|
##############################################################################
|
|
sub RunComprehensiveMode() {
|
|
foreach my $commitId (keys %WHITELIST_COMMIT_IDS) {
|
|
my $remainingCount = scalar(keys %childProcesses);
|
|
while ($remainingCount >= $FORK_LIMIT) {
|
|
print STDERR "Sleep a bit while $remainingCount children going for these commits ",
|
|
join(", ", sort values %childProcesses), "\n" if $VERBOSE;
|
|
sleep 10;
|
|
$remainingCount = scalar(keys %childProcesses);
|
|
}
|
|
my $forkCount = scalar(keys %childProcesses) + 1;
|
|
my $pid = fork();
|
|
die "cannot fork: $!" unless defined $pid;
|
|
if ($pid == 0) { # The new child process is here
|
|
$0 = "$commitId git blame subprocess";
|
|
ProcessCommit($commitId, $$);
|
|
exit 0;
|
|
} else { # The parent is here
|
|
print STDERR "Launched $forkCount child to handle $commitId\n" if $VERBOSE;
|
|
$childProcesses{$pid} = $commitId;
|
|
}
|
|
}
|
|
|
|
while (scalar(keys %childProcesses) > 0) {
|
|
if ($VERBOSE) {
|
|
print STDERR "Sleep a bit because these are still running ";
|
|
foreach my $pid (keys %childProcesses) { print STDERR " $pid for $childProcesses{$pid}\n"; }
|
|
}
|
|
sleep 10;
|
|
}
|
|
|
|
|
|
##############################################################################
|
|
# Main line of script
|
|
|
|
if (defined $CENTRAL_COMMIT) {
|
|
RunCentralCommitMode($CENTRAL_COMMIT);
|
|
}
|
|
|
|
exit 0;
|
|
my $startCnt = scalar(keys %WHITELIST_COMMIT_IDS);
|
|
my $doneCnt = scalar(keys %finishedOperations);
|
|
print STDERR "ERROR: all children completed but ", $doneCnt - $startCnt, " not completed\n";
|
|
|
|
foreach my $commitId (keys %finishedOperations) {
|
|
print STDERR "Completed $commitId at $finishedOperations{$commitId}{time} in $finishedOperations{$commitId}{pid}\n" if $VERBOSE;
|
|
print STDERR "ERROR: $commitId had non-zero exit status of $finishedOperations{$commitId}{errCode} ",
|
|
"with message \"$finishedOperations{$commitId}{errString}\"",
|
|
" at $finishedOperations{$commitId}{now} in $finishedOperations{$commitId}{pid}\n"
|
|
unless $finishedOperations{$commitId}{errCode} == 0;
|
|
}
|
|
###############################################################################
|
|
#
|
|
# Local variables:
|
|
# compile-command: "perl -c extract-code-added-in-commits.plx"
|
|
# End:
|