#!/usr/bin/gawk -f
# Copyright (C) 2013 Ryan Kavanagh <rak@debian.org>
# Given a series of lines in the format
# Copyright (c) NNNN, MMMM-MMMM, ..., NNNN John Smith <jsmith@example.org>
# group years and emails by person.
# Main rule: parse one copyright line and accumulate, per person, the years
# listed on it and any e-mail address that follows the name.
{
    # Length of the "...Copyright (c) " prefix; the year list starts right
    # after it.  Lines without that prefix are not copyright lines, so skip
    # them instead of recording a junk name and junk years.
    if (!match($0, /.*Copyright +\([cC]\) +/)) {
        next;
    }
    DATE_START = RLENGTH + 1;

    # Length of everything up to and including the last digit that is
    # followed by an optional comma and blanks (normally the end of the year
    # list); the name starts right after it.
    if (!match($0, /.*Copyright.*[0-9][,]? +/)) {
        next;
    }
    DATE_LENGTH = RLENGTH;

    if (match($0, /<.*>/)) {
        EMAIL_START = RSTART;
        # The name ends one blank before the opening "<"
        NAME = substr($0, DATE_LENGTH + 1, EMAIL_START - DATE_LENGTH - 2);
        EMAIL = substr($0, EMAIL_START);
    } else {
        # No email on this line: clear EMAIL so the address found on an
        # earlier line is not attributed to this person.
        NAME = substr($0, DATE_LENGTH + 1);
        EMAIL = "";
    }

    # Turn the comma separated year list into a blank separated one
    YEARS = substr($0, DATE_START, DATE_LENGTH - DATE_START);
    gsub(/, */, " ", YEARS);
    people_years[NAME] = people_years[NAME] " " YEARS;

    if (EMAIL != "") {
        if (!(NAME in people_emails)) {
            people_emails[NAME] = EMAIL;
        } else if (!index(people_emails[NAME], EMAIL)) {
            # index() compares literally; using the address as a regular
            # expression would misread characters such as "+" and ".".
            people_emails[NAME] = people_emails[NAME] "," EMAIL;
        }
    }
}

# END rule: build one "Copyright (C) YEARS<TAB>NAME EMAILS" line per person and
# print the lines ordered by most recent year (descending), then by number of
# distinct years (descending), then by name (ascending, as sorted by asorti).
END {
    for (person in people_years) {
        n_years = collect_years(people_years[person], years_array);
        if (n_years == 0) {
            # No years were recorded for this person, so there is no
            # "most recent year" to file the line under.
            continue;
        }
        year_string = format_years(years_array, n_years);
        final_line[years_array[n_years]][n_years][person] = \
            "Copyright (C) " year_string "\t" person " " people_emails[person];
    }

    # We can't sort the year indices with asorti because we want a numerical,
    # not lexicographic, sort of the indices.
    n_years_entries = sorted_numeric_keys(final_line, years_sorted);
    # Output the lines with the most recent contributor first
    for (y = n_years_entries; y >= 1; y--) {
        # Among people whose most recent year is years_sorted[y], those with
        # the most distinct years come first.
        n_contrib_entries = sorted_numeric_keys(final_line[years_sorted[y]],
                                                contributions_sorted);
        for (c = n_contrib_entries; c >= 1; c--) {
            # Finally, sort by contributor name
            n_people = asorti(final_line[years_sorted[y]][contributions_sorted[c]],
                              by_person);
            for (n = 1; n <= n_people; n++) {
                print final_line[years_sorted[y]][contributions_sorted[c]][by_person[n]];
            }
        }
    }
}

# Expand the blank separated tokens of `spec` (single years or FIRST-LAST
# ranges) into years[1..n], sorted ascending with duplicates removed, and
# return n.  A range whose LAST is not greater than FIRST contributes only
# FIRST.
function collect_years(spec, years,    n, n_tokens, bounds, t, y, i, m) {
    n = split(spec, years);
    n_tokens = n;
    # Walk the original tokens by position; expanded years are appended past
    # n_tokens and therefore never re-examined.
    for (t = 1; t <= n_tokens; t++) {
        if (years[t] ~ /[0-9]+-[0-9]+/) {
            split(years[t], bounds, /-/);
            years[t] = bounds[1];
            for (y = 1; y <= bounds[2] - bounds[1]; y++) {
                years[++n] = bounds[1] + y;
            }
        }
    }

    n = asort(years);

    # Compact the sorted array in place so that every value is kept once
    m = 0;
    for (i = 1; i <= n; i++) {
        if (m == 0 || years[m] != years[i]) {
            years[++m] = years[i];
        }
    }
    for (i = m + 1; i <= n; i++) {
        delete years[i];
    }
    return m;
}

# Render years[1..n] (ascending, no duplicates) as a comma separated list in
# which each run of consecutive years is collapsed to FIRST-LAST,
# e.g. "2010, 2012-2014".
function format_years(years, n,    i, run_start, out, sep) {
    out = "";
    sep = "";
    i = 1;
    while (i <= n) {
        run_start = i;
        # Extend the run while the next year immediately follows this one
        while (i < n && years[i + 1] == years[i] + 1) {
            i++;
        }
        out = out sep years[run_start];
        if (i > run_start) {
            out = out "-" years[i];
        }
        sep = ", ";
        i++;
    }
    return out;
}

# Store the indices of `arr`, converted to numbers, in out[1..n] in ascending
# numeric order and return n.
function sorted_numeric_keys(arr, out,    key, n) {
    delete out;
    n = 0;
    for (key in arr) {
        out[++n] = key + 0;
    }
    return asort(out);
}