#!/bin/bash
###
### AUTHORSTATS - Get author statistics from Internet Drafts and RFCs
###
### Version 4.0.1
###
### Written in 2005-2008 by Jari Arkko
### Donated to the public domain.
###
### 2.5.0 Cleaned up HTML
### 2.5.1 Added support for authors moving, fixed country and company capitalization
### 2.5.2 Fixed monthly publication graph style
### 2.8.0 Fixed a bug in chair data retrieval
### 2.9.0 Added niger
### 2.9.2 Fixed bug with dashboard link for people with multi-part names
### 3.0.0 Integrated H-index calculation from Lars Eggert
### 4.0.0 New, improved graphics from Lars Eggert
### 4.0.1 Added a special case for christmas island population
###
### Usage:
###
### authorstats [--option value ...] draft-foo-00.txt ...
###
###
### Initialize
###
# Defaults for everything the command-line options can override.
# The value "none" marks an input that was not supplied; the sections
# below then create or download it themselves.
rfcfilesdir=.
debug=0
doctype=draft
authordata=none
hindexdata=none
wgdata=none
oldwgdata=none
rfcdata=none
chairdata=none
iabdata=none
iaocdata=none
popdata=none
topic="active I-Ds"
# Prefix for the per-run temporary files ($$ is this shell's process id).
tmpbase=/tmp/$$-tmp
# Options shared by every wget call; "-O -" sends the document to stdout.
# The variable is expanded unquoted later so that it splits into words.
wargs="--quiet --tries=1 --timeout=30 --no-cache --no-check-certificate -O -"
# Directory of this script; authorstats.awk is loaded from it at the end.
# Quoted so that an installation path containing spaces still works.
progdir=$(dirname -- "$0")
###
### Process options
###
###
### authorstats_parse_options ARG...
###
### Consumes the leading options of ARG... and stores their values in the
### corresponding global variables (rfcfilesdir, doctype, debug,
### authordata, hindexdata, wgdata, oldwgdata, rfcdata, chairdata,
### iabdata, iaocdata, popdata, topic).  Parsing stops at the first word
### that does not start with "-"; that word and everything after it are
### left for the caller.
###
### Sets authorstats_optcount to the number of words consumed, because a
### function cannot shift the positional parameters of its caller.
### Exits with status 1 on an unknown option or on an option whose
### argument is missing (an empty value would otherwise slip through
### the later "is there any data" checks).
###
authorstats_parse_options() {
  authorstats_optcount=0
  while [ $# -gt 0 ]; do
    case "$1" in
      --debug)
        # The only option without an argument.
        debug=1
        shift
        authorstats_optcount=$((authorstats_optcount + 1))
        continue
        ;;
      --rfcfilesdir | --doctype | --data | --hindex | --wgs | --oldwgs | \
        --rfcs | --chairs | --iab | --iaoc | --population | --topic)
        if [ $# -lt 2 ]; then
          echo "authorstats: option $1 requires an argument -- exit" >&2
          exit 1
        fi
        ;;
      -*)
        echo 'authorstats: Unrecognized option -- exit' >&2
        exit 1
        ;;
      *)
        # First non-option word: the document list starts here.
        break
        ;;
    esac
    case "$1" in
      --rfcfilesdir) rfcfilesdir=$2 ;;
      --doctype) doctype=$2 ;;
      --data) authordata=$2 ;;
      --hindex) hindexdata=$2 ;;
      --wgs) wgdata=$2 ;;
      --oldwgs) oldwgdata=$2 ;;
      --rfcs) rfcdata=$2 ;;
      --chairs) chairdata=$2 ;;
      --iab) iabdata=$2 ;;
      --iaoc) iaocdata=$2 ;;
      --population) popdata=$2 ;;
      --topic) topic=$2 ;;
    esac
    shift 2
    authorstats_optcount=$((authorstats_optcount + 2))
  done
}
authorstats_parse_options "$@"
# Drop the parsed options so that only the document names remain.
shift "$authorstats_optcount"
###
### from Henrik's idnits
###
###
### lookfor DEFAULT CANDIDATE...
###
### Prints the path of the first CANDIDATE that resolves to an executable
### on PATH (a sibling "<path>.exe" is accepted too), or DEFAULT when
### none does.
###
### Uses the bash builtin "type -P" instead of the external "which", so
### the lookup also works where "which" is not installed, and keeps all
### working variables local to the function.
###
lookfor() {
  local fallback=$1 candidate path
  shift
  for candidate in "$@"; do
    # Errors (e.g. a candidate that looks like an option) just mean
    # "not found"; the candidate is then skipped.
    path=$(type -P "$candidate" 2>/dev/null)
    if [[ -n $path && ( -x $path || -x $path.exe ) ]]; then
      printf '%s\n' "$path"
      return
    fi
  done
  printf '%s\n' "$fallback"
}
# Choose the sed and awk commands used by the rest of the script,
# preferring faster implementations.  A value already present in $sed or
# $awk is tried first; both are expanded unquoted so that an unset
# variable contributes no candidate at all.
sed=$(lookfor gsed $sed gsed sed)
awk=$(lookfor gawk $awk gawk nawk awk)
case $awk in
  *gawk*)
    # too noisy, but flags many nits: awk="$awk -O --lint --lint-old"
    # awk="$awk -O" # cannot be used, will complain on some systems
    # Placeholder assignment; no gawk-specific flags are enabled.
    nop=nop
    ;;
esac
###
### Get author data from the drafts
###
# Without --data, build the author data from the documents that remain
# on the command line.  getauthors is not defined in this script
# (presumably a companion program on PATH -- confirm).
# "$@" hands every file name over as one word, so names containing
# spaces or glob characters are not re-split.
if [ "x$authordata" = xnone ]; then
  authordata=${tmpbase}-a
  getauthors "$@" > "$authordata"
fi
###
### Get citation data from the documents
###
###
### authorstats_cited_rfcs FILE RFCNR
###
### Prints, one per line (sorted, unique, leading zeros kept), the RFC
### numbers mentioned in FILE as "RFC nnnn", "RFC-nnnn" or "RFCnnnn" in
### any letter case, leaving out RFCNR itself and all-zero numbers.
###
### The self reference is matched against the whole number.  Matching it
### as a substring would also discard unrelated citations, e.g. every
### number containing a "2" while processing RFC 2.
###
authorstats_cited_rfcs() {
  tr '[:lower:]' '[:upper:]' < "$1" | tr -d '\n\r' |
    grep -Eoa 'RFC[[:space:]-]*[[:digit:]]{1,4}' |
    tr -d 'RFC -' |
    grep -Ev "^0*$2\$|^0+\$" |
    grep -Ev '(65125)|(10336)|(8401)|(44520)|(8222)' | # special erroneous cases
    sort | uniq
}

# Without --hindex, derive the citation data from the RFC files listed in
# the author data.  Two kinds of lines are appended to $hindexdata:
#   0-cites:RFC:LIST   comma-separated RFC numbers cited by RFC
#   0-citations:N:CNT  number of citing documents counted for RFC N
# The work runs in a subshell so that the directory change, the unset
# LANG and the extglob setting do not affect the rest of the script.
if [ "x$hindexdata" = xnone ]; then
  hindexdata=${tmpbase}-h
  (
    cd "$rfcfilesdir" || exit 1
    unset LANG
    maxrfc=0
    shopt -s extglob
    # find citations in documents
    for f in $(grep -E "^rfc[[:digit:]]" "$authordata" | cut -f1 -d: | sort -r | uniq); do
      # which RFCs are cited in document f (other than itself)?
      frfcnr=$(echo "$f" | $sed -E 's/rfc([[:digit:]]+).*/\1/')
      # Several files may belong to the same RFC; handle each RFC once.
      [ -n "${seen[$frfcnr]}" ] && continue
      cited=$(authorstats_cited_rfcs "$f" "$frfcnr")
      # $cited is expanded unquoted on purpose: the newline-separated
      # list becomes space-separated and is then joined with commas.
      echo "0-cites:$frfcnr:$(echo $cited | tr ' ' ,)" >> "$hindexdata"
      seen[$frfcnr]=1
      # for all of those, increase their citation count
      for rfc in $cited; do
        origrfc=$rfc
        rfc=${rfc##+(0)} # remove leading zeros
        if [ -z "$rfc" ]; then
          echo "authorstats: problem with $origrfc while doing $f, cited = $cited" >&2
          exit 1
        fi
        citcount[$rfc]=$((${citcount[$rfc]:-0} + 1))
        if (( frfcnr > maxrfc )); then
          maxrfc=$frfcnr
          # Progress trace, shown only with --debug like the other traces.
          [ "$debug" = 1 ] && echo "setting maxrfc to $maxrfc"
        fi
      done
    done
    # dump the citation counts
    for (( n = 1; n < maxrfc; n++ )); do
      echo "0-citations:$n:${citcount[$n]:-0}" >> "$hindexdata"
    done
  ) || exit 1 # an "exit 1" inside the subshell must stop the whole script
fi
###
### Test that we have author data
###
# Stop unless the author data file exists and is non-empty.
# The expansion is quoted: with an empty $authordata the unquoted form
# collapses to "[ -s ]", which is true, and the check would pass.
if [ ! -s "$authordata" ]; then
  echo 'There is no author data in file '"$authordata"' -- exit' >&2
  exit 1
fi
###
### Get WG data from the IETF site
###
# No --wgs file given: download the working group charter page to
# /tmp/wg-dir.html, replacing any copy left by an earlier run.
[ x$wgdata = xnone ] && {
  wgdata=/tmp/wg-dir.html
  rm -f $wgdata
  wget $wargs http://www.ietf.org/dyn/wg/charter.html > $wgdata
}
###
### Get old WG data from the IETF site
###
# No --oldwgs file given: download the index of old working groups to
# /tmp/oldindex.html, replacing any copy left by an earlier run.
[ x$oldwgdata = xnone ] && {
  oldwgdata=/tmp/oldindex.html
  rm -f $oldwgdata
  wget $wargs http://www.ietf.org/html.charters/OLD/index.html > $oldwgdata
}
###
### Get RFC data from the IETF tools site
###
# No --rfcs file given: download rfc_id.txt to /tmp/rfc_id.txt, replacing
# any copy left by an earlier run.
# NOTE(review): nothing later in this script reads $rfcdata -- confirm
# whether the download is still needed.
[ x$rfcdata = xnone ] && {
  rfcdata=/tmp/rfc_id.txt
  rm -f $rfcdata
  wget $wargs http://tools.ietf.org/id/rfc_id.txt > $rfcdata
}
###
### Get chair/AD data from the IETF site
###
# No --chairs file given: download the working group summary to
# /tmp/1wg-summary.txt, replacing any copy left by an earlier run.
[ x$chairdata = xnone ] && {
  chairdata=/tmp/1wg-summary.txt
  rm -f $chairdata
  wget $wargs http://www.ietf.org/ietf-ftp/1wg-summary.txt > $chairdata
}
###
### Get IAB data from the IAB site
###
# No --iab file given: download the IAB members page to
# /tmp/members.html, replacing any copy left by an earlier run.
[ x$iabdata = xnone ] && {
  iabdata=/tmp/members.html
  rm -f $iabdata
  wget $wargs http://www.iab.org/about/members.html > $iabdata
}
###
### Get IAOC data from the temporary IAOC site
###
# No --iaoc file given: download the IAOC front page to /tmp/index.html,
# replacing any copy left by an earlier run.
[ x$iaocdata = xnone ] && {
  iaocdata=/tmp/index.html
  rm -f $iaocdata
  wget $wargs http://iaoc.ietf.org/ > $iaocdata
}
###
### Get country population data from the wikipedia site
###
# No --population file given: download the Wikipedia population list to
# /tmp/List_of_countries_by_population, replacing any earlier copy.
[ x$popdata = xnone ] && {
  popdata=/tmp/List_of_countries_by_population
  rm -f $popdata
  wget $wargs http://en.wikipedia.org/wiki/List_of_countries_by_population > $popdata
}
###
### Process WG data
###
# Convert the working group page in $wgdata into /tmp/wgdata.txt ($wgs).
# The awk program below prints
#   0-areadefinition:AREA         for each line that ends in an
#                                 "... Area" h2/h3 heading
#   0-wgareadefinition:AREA:WG    for each later line that contains
#                                 "-charter.html" or ends in a td close
#                                 tag, once an area has been seen
# For "-charter.html" lines WG is the text between "html.charters/" (or
# "/charter/") and "-charter.html"; for td lines it is the path
# component following "/wg/".
# NOTE(review): the first searchstring literal spans a line break and the
# second one is empty.  This looks like markup that was lost from the
# literals; confirm against a known-good copy of the script.
wgs=/tmp/wgdata.txt
rm -f $wgs
if [ $debug = 1 ]
then
echo authorstats: parsing wg data
fi
$awk < $wgdata '
BEGIN {
area = "";
}
/.* Area<.h[23]>(<.a>)*$/ {
#
# Process this area
#
searchstring = "
";
i1 = index($0,searchstring);
if (i1 == 0) {
searchstring = "";
i1 = index($0,searchstring);
}
if (i1 == 0) {
searchstring = ">";
i1 = index($0,searchstring);
}
area = substr($0,i1+length(searchstring));
i2 = index(area," Area<");
area = substr(area,1,i2 - 1);
printf("0-areadefinition:%s\n", area);
next;
}
/-charter.html/ {
#
# Process this WG
#
if (area != "") {
i1string = "html.charters/";
i1 = index($0,i1string);
if (i1 == 0) {
i1string = "/charter/";
i1 = index($0,i1string);
}
wg = substr($0,i1 + length(i1string));
i2 = index(wg,"-charter.html");
wg = substr(wg,1,i2 - 1);
printf("0-wgareadefinition:%s:%s\n", area, wg);
}
}
/^.*<.td>$/ {
#
# Process this WG
#
if (area != "") {
i1string = "/wg/";
i1 = index($0,i1string);
wg = substr($0,i1 + length(i1string));
i2 = index(wg,"/");
wg = substr(wg,1,i2 - 1);
printf("0-wgareadefinition:%s:%s\n", area, wg);
}
}
/.*/ {
next;
}
END {
}
' > $wgs
###
### Process old WG data
###
# Convert the old working group index in $oldwgdata into
# /tmp/oldindex.txt ($oldwgs).  For every line containing "-charter.html"
# that follows an "... Area" h2 heading, the awk program prints
#   0-oldwgareadefinition:AREA:WG
# where WG is the text between the character after HREF= and
# "-charter.html".
# NOTE(review): index($0,"") searches for an empty string while the next
# line skips 4 characters; the literal has probably lost a 4-character
# tag -- confirm against a known-good copy of the script.
# NOTE(review): $oldwgs is not among the files concatenated into the
# database further down -- confirm whether that is intended.
oldwgs=/tmp/oldindex.txt
rm -f $oldwgs
if [ $debug = 1 ]
then
echo authorstats: parsing old wg data
fi
$awk < $oldwgdata '
BEGIN {
area = "";
}
/.* Area<.[hH]2>/ {
i1 = index($0,"");
area = substr($0,i1+4);
i2 = index(area," Area<");
area = substr(area,1,i2 - 1);
#printf("9-debug: saw area %s on line %s\n", area, $0);
}
/-charter.html/ {
#printf("9-debug: saw potential wg on line %s\n", $0);
if (area != "") {
i1 = index($0,"HREF=");
wg = substr($0,i1 + 5 + 1 );
i2 = index(wg,"-charter.html");
wg = substr(wg,1,i2 - 1);
printf("0-oldwgareadefinition:%s:%s\n", area, wg);
}
}
/.*/ {
next;
}
END {
}
' > $oldwgs
###
### Process IAB data
###
# Extract IAB member names from $iabdata into /tmp/iab.txt ($iab) as
# "z3-iab:NAME" lines, then replace the two accented letters handled
# below with plain ASCII.
[ $debug = 1 ] && echo authorstats: parsing iab data
iab=/tmp/iab.txt
rm -f $iab
$awk '
# Names count only between the "IAB MEMBERS" and "EX-OFFICIO" markers.
/IAB MEMBERS/ { iniab = 1; next }
/EX-OFFICIO/  { iniab = 0; next }
# Open seats carry no name.
/vacancy/     { next }
# A member line: keep what sits between the last bold-open tag and the
# bold-close tag that follows it.
/blue-dot/ && iniab {
  name = $0
  gsub(/^.*[<]b[>]/, "", name)
  gsub(/[<].b[>].*$/, "", name)
  printf("z3-iab:%s\n", name)
}
' < $iabdata |
  $sed -e 's/ä/a/g' -e 's/ö/o/g' > $iab
###
### Process IAOC data
###
# Extract IAOC member names from $iaocdata into /tmp/iaoc.txt ($iaoc) as
# "z4-iaoc:NAME" lines, correcting one known misspelling on the way.
[ $debug = 1 ] && echo authorstats: parsing iaoc data
iaoc=/tmp/iaoc.txt
rm -f $iaoc
$awk '
# The member list sits between these two headings.
/IAOC Membership/       { iniaoc = 1 }
/IAOC Responsibilities/ { iniaoc = 0 }
# List items look like "<li> NAME, ..."; the name starts at column 6 and
# ends just before the first comma.
iniaoc && /^[<]li[>] .*,/ {
  entry = substr($0, 6)
  printf("z4-iaoc:%s\n", substr(entry, 1, index(entry, ",") - 1))
}
' < $iaocdata |
  $sed 's/Kurtis Lindquist/Kurtis Lindqvist/' > $iaoc
###
### Process chair data
###
# Extract area directors and working group chairs from the WG summary in
# $chairdata into /tmp/chairdata.txt ($chairs):
#   z1-ad:AREA:NAME    person listed under an area heading, before any WG
#   z2-chair:WG:NAME   person listed under a working group heading
# A few names are then rewritten to other spellings by sed.
[ $debug = 1 ] && echo authorstats: parsing chair data
chairs=/tmp/chairdata.txt
rm -f $chairs
$awk '
# Title line, separator rules and mailing list details carry no names.
/IETF Working Group Summary .By Area./ { next }
/-----------------/ { next }
/^ +(WG Mail|To Join|In Body|Archive):/ { next }
# "Some Area (abbr)": start a new area; no working group selected yet.
/^[A-Z].* Area .[a-z]+.$/ {
  area = substr($0, 1, index($0, " Area ") - 1)
  wg = ""
  next
}
# "Working Group Name (acronym)": remember the acronym.
/^[A-Z].* [(].*[)]$/ {
  acronym = $0
  gsub(/^.*[(]/, "", acronym)
  gsub(/[)]$/, "", acronym)
  wg = acronym
}
# Indented "Name <address>" line, optionally prefixed with "Chair(s):".
/^ +(([A-Za-z .0-9():])|(-))+ [<].*[>]$/ {
  person = $0
  gsub(/ +Chair[s()]*: +/, "", person)
  gsub(/^ +/, "", person)
  gsub(/ [<].*$/, "", person)
  if (wg == "")
    printf("z1-ad:%s:%s\n", area, person)
  else
    printf("z2-chair:%s:%s\n", wg, person)
  next
}
' < $chairdata |
  $sed -e 's/Kurt Zeilenga/Kurt D. Zeilenga/' \
       -e 's/Russ Housley/Russell Housley/' \
       -e 's/Gregory M[.] Lebovitz/Gregory Lebovitz/' > $chairs
###
### Process population data
###
# Convert the population page in $popdata into /tmp/popdata.txt ($pops),
# one "0-population:COUNTRY:POPULATION:NR" line per recognised table row
# (NR is the awk input line number).
#
# Before awk sees the text, the pipeline deletes apostrophes, lowercases
# A-Z, strips "flag of " prefixes and the "svg" in "svg:", drops lines
# containing cite_ref-overseas_france, and saves a copy of the result in
# /tmp/poptmp.txt.
# NOTE(review): the first sed expression uses "+" without -E; in a basic
# regular expression that is normally a literal plus sign, so it may
# never match -- confirm.
#
# The awk program:
#  - ignores everything until a line matching /^.td.1..td.$/ (presumably
#    the first rank cell of the table) clears inskip;
#  - on a td align="left" line takes the country from the title="..."
#    attribute, keeping only letters, spaces, "-" and "_", and maps a
#    few names through mapit[] to the spelling getauthors uses;
#  - on the next numeric cell prints the population for that country and
#    clears country so that it is used only once;
#  - has two special rules that print the population of christmas
#    island from footnote-style lines.
# findchar() and printstring() are referenced only from commented-out
# debug lines.
pops=/tmp/popdata.txt
rm -f $pops
cat $popdata |
tr -d "'" |
tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz |
$sed 's/[Ff]lag[ _]+of[ _]+//g' |
$sed 's/[Ff]lag of //g' |
$sed 's/svg:/:/g' |
fgrep -v 'cite_ref-overseas_france' | # avoid overriding france
tee /tmp/poptmp.txt |
$awk '
function isalpha(c) {
return((c >= "a" && c <= "z") ||
(c >= "A" && c <= "Z"));
}
function findchar(c) {
for (y = 1; y <= 255; y++) {
buf = sprintf("%c", y);
if (buf == c) return(y);
}
return(0);
}
function printstring(what,s) {
printf("# debug 4: %s\n", what);
for (h = 1; h <= length(s); h++) {
printf("# %s (%d)\n", substr(s,h,1), findchar(substr(s,h,1)));
}
}
BEGIN {
inskip=1;
#
# Not all names in Wikipedia are in the same form
# as the ones used by getauthors. Map the special
# ones so that they match.
#
mapit["serbia"] = "serbia and montenegro";
mapit["belgium civil"] = "belgium";
mapit["the czech republic"] = "czech republic";
mapit["the solomon islands"] = "solomon islands";
mapit["the peoples republic of china"] = "china";
mapit["peoples republic of china"] = "china";
mapit["the united states"] = "usa";
mapit["united states"] = "usa";
mapit["the united kingdom"] = "united kingdom";
mapit["republic of ireland"] = "ireland";
mapit["netherlands"] = "the netherlands";
mapit["philippines"] = "the philippines";
mapit["the republic of china"] = "taiwan";
mapit["republic of china"] = "taiwan";
mapit["the united arab emirates"] = "united arab emirates";
mapit["the central african republic"] = "central african republic";
mapit["the gambia"] = "gambia";
mapit["the comoros"] = "comoros";
mapit["the bahamas"] = "bahamas";
mapit["the vatican city"] = "vatican city";
mapit["christmas_island"] = "christmas island";
}
/^.td.1..td.$/ {
inskip = 0;
next;
}
/^.td align="left"./ {
if (!inskip) {
s = $0;
while (length(s) > 0 && substr(s,1,6) != "title=") s = substr(s,2);
s = substr(s,8);
n = "";
while (length(s) > 0 && substr(s,1,1) != sprintf("%c",34)) {
if (isalpha(substr(s,1,1)) || substr(s,1,1) == " " ||
substr(s,1,1) == "-" || substr(s,1,1) == "_") {
n = n substr(s,1,1);
}
s = substr(s,2);
}
country = tolower(n);
gsub(/[.]*svg/,"",country);
# printf("# debug 1: %s - %s\n", n, country);
# printf("# debug 2: %s - %s\n", country, mapit[country]);
# country == "republic of ireland");
# printstring("input", country);
# printstring("const", "republic of ireland");
if (mapit[country] != "") country = mapit[country];
}
next;
}
/^.td.[0-9][0-9.,e+]+..td.$/ {
if (inskip == 0 && country != "") {
p = substr($0,5);
p = substr(p,1,length(p) - 5);
gsub(/,/,"",p);
pv = p + 0;
printf("0-population:%s:%d:%d\n",country,pv,NR);
# printf("# debug 4: %s\n", country);
country = "";
}
next;
}
/^.td.[0-9][0-9,]+.sup id.*..sup...td.$/ {
if (inskip == 0 && country != "") {
p = substr($0,5);
p = substr(p,1,index(p,"sup") - 2);
gsub(/,/,"",p);
printf("0-population:%s:%s:%d\n",country,p,NR);
# printf("# debug 5: %s\n", country);
country = "";
}
next;
}
/^.li id=.cite_note-aus-[0-9]+...b..a href=.#cite_ref-aus_[0-9]+-[0-9]+.....a...b. includes .a href=..wiki.christmas.island. title=.christmas island..christmas island..a. .[0-9,]+., / {
pv = $0;
sub(/, .*$/,"",pv);
sub(/^.*[(]/,"",pv);
sub(/[)].*$/,"",pv);
sub(/,/,"",pv);
pv = pv + 0;
printf("0-population:christmas island:%d:%d\n", pv, NR);
next;
}
/.li..span class=.citation wikicite. id=.*...b..a href=..ref_n5.....a...b...span. includes .a href=..wiki.christmas_island. title=.christmas island..christmas island..a. [(]/ {
x = $0;
sub(/^.*[(]/,"",x);
sub(/[)].*$/,"",x);
sub(/,/,"",x);
printf("0-population:christmas island:%s:%d\n",x,NR);
country = "";
next;
}
/.*/ {
next;
}
' > $pops
###
### Process RFC data
###
# Trace the step when debugging and remove any stale /tmp/rfcdata.txt.
# Nothing in this script writes to $rfcs afterwards.
[ $debug = 1 ] && echo authorstats: parsing rfc data
rfcs=/tmp/rfcdata.txt
rm -f $rfcs
###
### Safety check to avoid bad characters etc
###
[ $debug = 1 ] && echo authorstats: process the database
# Concatenate every intermediate data file, sort the records, delete the
# characters listed in the two tr sets, keep a copy of the cleaned input
# in /tmp/inputdb.txt and feed it to authorstats.awk, which is loaded
# from the directory of this script.
{
  cat $wgs
  cat $pops
  cat $chairs
  cat $iab
  cat $iaoc
  cat $hindexdata
  cat $authordata
} |
  sort |
  tr -d '*?"{}/%";&<>\341\207\351\355' |
  tr -d "'" |
  tee /tmp/inputdb.txt |
  $awk -v topic="$topic" -v doctype=$doctype -v debug=$debug \
    -f $progdir/authorstats.awk
###
### Cleanup and exit
###
# With --debug the per-run temporary files are kept and their common
# prefix is reported; otherwise they are removed.  The fixed-name files
# under /tmp are left in place either way.
if [ $debug -eq 1 ]; then
  echo 'Debug: Base temp = '${tmpbase}
else
  rm -f ${tmpbase}-*
fi
exit 0