#!/bin/bash

###
### AUTHORSTATS - Get author statistics from Internet Drafts and RFCs
###
### Version 2.5.2
###
### Written in 2005-2008 by Jari Arkko
### Donated to the public domain.
###
### 2.5.0 Cleaned up HTML
### 2.5.1 Added support for authors moving, fixed country and company capitalization
### 2.5.2 Fixed monthly publication graph style
### 2.8.0 Fixed a bug in chair data retrieval
### 2.9.0 Added niger
### 2.9.2 Fixed bug with dashboard link for people with multi-part names
### 3.0.0 Integrated H-index calculation from Lars Eggert
### 4.0.0 New, improved graphics from Lars Eggert
### 4.0.1 Added a special case for christmas island population
###
### Usage:
###
###   authorstats [options] draft-foo-00.txt ...
###
### Options (all must precede the document arguments):
###
###   --rfcfilesdir DIR   directory holding the RFC text files (default .)
###   --doctype TYPE      passed to authorstats.awk as "doctype" (default draft)
###   --topic TEXT        passed to authorstats.awk as "topic"
###   --debug             print progress messages and keep the temp files
###   --data FILE         use FILE instead of running getauthors
###   --hindex FILE       use FILE instead of extracting citations
###   --wgs FILE, --oldwgs FILE, --rfcs FILE, --chairs FILE,
###   --iab FILE, --iaoc FILE, --population FILE
###                       use a local copy instead of downloading the page
###
### The report produced by authorstats.awk is written to stdout.
###
### NOTE(review): the intermediate files live under fixed names in /tmp.
### The names are kept unchanged here in case other tooling reads them,
### but they are predictable; consider mktemp -d if nothing depends on them.
###

###
### Initialize
###

rfcfilesdir=.
debug=0
doctype=draft
authordata=none
hindexdata=none
wgdata=none
oldwgdata=none
rfcdata=none
chairdata=none
iabdata=none
iaocdata=none
popdata=none
topic="active I-Ds"
tmpbase=/tmp/$$-tmp
wargs=(--quiet --tries=1 --timeout=30 --no-cache --no-check-certificate -O -)
progdir=$(dirname "$0")

###
### Process options
###
### Options are consumed from the front of the argument list; parsing
### stops at the first argument that does not start with "-", and the
### remaining arguments are handed to getauthors.
###

while [ $# -gt 0 ]; do
  case "x$1" in
    x--rfcfilesdir) shift; rfcfilesdir=$1; shift ;;
    x--doctype)     shift; doctype=$1; shift ;;
    x--debug)       debug=1; shift ;;
    x--data)        shift; authordata=$1; shift ;;
    x--hindex)      shift; hindexdata=$1; shift ;;
    x--wgs)         shift; wgdata=$1; shift ;;
    x--oldwgs)      shift; oldwgdata=$1; shift ;;
    x--rfcs)        shift; rfcdata=$1; shift ;;
    x--chairs)      shift; chairdata=$1; shift ;;
    x--iab)         shift; iabdata=$1; shift ;;
    x--iaoc)        shift; iaocdata=$1; shift ;;
    x--population)  shift; popdata=$1; shift ;;
    x--topic)       shift; topic="$1"; shift ;;
    x-*)
      echo 'authorstats: Unrecognized option -- exit' >&2
      exit 1
      ;;
    *) break ;;
  esac
done

###
### Helpers
###

# Print a progress message, but only when --debug was given.
debugmsg() {
  if [ "$debug" = 1 ]; then
    echo "$*"
  fi
}

# fetch_url URL DEST - download URL into DEST, replacing any previous copy.
# Best effort: a failed download leaves an empty DEST and is not fatal.
fetch_url() {
  rm -f "$2"
  wget "${wargs[@]}" "$1" > "$2"
}

###
### from Henrik's idnits
###
### lookfor DEFAULT CANDIDATE... - print the path of the first candidate
### that resolves to an executable, or DEFAULT when none does.
###

lookfor() {
  local default="$1"; shift
  local b found
  for b in "$@"; do
    found=$(command -v -- "$b" 2>/dev/null)
    if [ -n "$found" ]; then
      if [ -x "$found" ] || [ -x "$found.exe" ]; then
        echo "$found"
        return
      fi
    fi
  done
  echo "$default"
}

# prefer faster awk and sed implementations
# ($awk / $sed from the environment are deliberately unquoted so that an
# unset value simply contributes no candidate)
awk=$(lookfor gawk $awk gawk nawk awk)
sed=$(lookfor gsed $sed gsed sed)
if [[ $awk =~ gawk ]]; then
  # too noisy, but flags many nits: awk="$awk -O --lint --lint-old"
  :
  # awk="$awk -O" # cannot be used, will complain on some systems
fi

###
### Get author data from the drafts
###

if [ "x$authordata" = xnone ]; then
  authordata=${tmpbase}-a
  getauthors "$@" > "$authordata"
fi

###
### Get citation data from the documents
###
### Writes two kinds of records to $hindexdata:
###   0-cites:N:a,b,c     RFC N cites RFCs a, b and c
###   0-citations:N:K     RFC N is cited by K of the processed RFCs
###

if [ "x$hindexdata" = xnone ]; then
  hindexdata=${tmpbase}-h
  # start from an empty file; records below are appended
  : > "$hindexdata"
  (
    unset LANG
    # Read the document list before changing directory so that a
    # relative --data path keeps working.
    rfcdocs=$(grep -E "^rfc[[:digit:]]" "$authordata" | cut -f1 -d: | sort -r | uniq)
    # Without the RFC directory there is nothing to scan; skip the
    # citation data rather than reading files from the wrong place.
    cd "$rfcfilesdir" || exit 1
    maxrfc=0
    seen=()
    citcount=()

    # find citations in documents
    for f in $rfcdocs; do
      frfcnr=$(echo "$f" | $sed -E 's/rfc([[:digit:]]+).*/\1/')
      frfcnr=$((10#$frfcnr))      # force base 10, drops leading zeros
      [ -n "${seen[$frfcnr]}" ] && continue
      seen[$frfcnr]=1

      if (( frfcnr > maxrfc )); then
        maxrfc=$frfcnr
        # progress message; stderr keeps it out of the generated report
        echo "setting maxrfc to $maxrfc" >&2
      fi

      # which RFCs are cited in document f (other than itself)?
      # Each match is reduced to its digits and stripped of leading zeros
      # before de-duplication, so "RFC 0791" and "RFC-791" count once.
      cited=$(tr '[:lower:]' '[:upper:]' < "$f" | tr -d '\n\r' |
        grep -Eoa 'RFC[[:space:]-]*[[:digit:]]{1,4}' |
        tr -cd '[:digit:]\n' |
        grep -Ev '^0+$' |
        $sed -E 's/^0+//' |
        grep -Fxv "$frfcnr" |
        grep -Ev '(65125)|(10336)|(8401)|(44520)|(8222)' | # special erroneous cases
        sort | uniq)
      printf '0-cites:%s:%s\n' "$frfcnr" "$(echo $cited | tr ' ' ,)" >> "$hindexdata"

      # for all of those, increase their citation count
      for rfc in $cited; do
        citcount[$rfc]=$(( ${citcount[$rfc]:-0} + 1 ))
      done
    done

    # dump the citation counts, including the highest processed RFC
    for (( n = 1; n <= maxrfc; n++ )); do
      echo "0-citations:$n:${citcount[$n]:-0}" >> "$hindexdata"
    done
  )
fi

###
### Test that we have author data
###

if [ ! -s "$authordata" ]; then
  echo "There is no author data in file $authordata -- exit" >&2
  exit 1
fi

###
### Get WG data from the IETF site
###

if [ "x$wgdata" = xnone ]; then
  wgdata=/tmp/wg-dir.html
  fetch_url http://www.ietf.org/dyn/wg/charter.html "$wgdata"
fi

###
### Get old WG data from the IETF site
###

if [ "x$oldwgdata" = xnone ]; then
  oldwgdata=/tmp/oldindex.html
  fetch_url http://www.ietf.org/html.charters/OLD/index.html "$oldwgdata"
fi

###
### Get RFC data from the IETF tools site
###

if [ "x$rfcdata" = xnone ]; then
  rfcdata=/tmp/rfc_id.txt
  fetch_url http://tools.ietf.org/id/rfc_id.txt "$rfcdata"
fi

###
### Get chair/AD data from the IETF site
###

if [ "x$chairdata" = xnone ]; then
  chairdata=/tmp/1wg-summary.txt
  fetch_url http://www.ietf.org/ietf-ftp/1wg-summary.txt "$chairdata"
fi

###
### Get IAB data from the IAB site
###

if [ "x$iabdata" = xnone ]; then
  iabdata=/tmp/members.html
  fetch_url http://www.iab.org/about/members.html "$iabdata"
fi

###
### Get IAOC data from the temporary IAOC site
###

if [ "x$iaocdata" = xnone ]; then
  iaocdata=/tmp/index.html
  fetch_url http://iaoc.ietf.org/ "$iaocdata"
fi

###
### Get country population data from the wikipedia site
###

if [ "x$popdata" = xnone ]; then
  popdata=/tmp/List_of_countries_by_population
  fetch_url http://en.wikipedia.org/wiki/List_of_countries_by_population "$popdata"
fi

###
### Process WG data
###
### Emits 0-areadefinition:AREA and 0-wgareadefinition:AREA:WG records.
###

wgs=/tmp/wgdata.txt
rm -f "$wgs"
debugmsg authorstats: parsing wg data
$awk < "$wgdata" '
BEGIN {
  area = "";
}
/.* Area<.h[23]>(<.a>)*$/ {
  #
  # Process this area
  #
  # NOTE(review): the first two search strings were unreadable in the
  # reviewed copy of this file (the literals had turned into blank
  # lines). They are restored here as the opening h2/h3 tags implied by
  # the rule pattern; confirm against the upstream source.
  #
  searchstring = "<h2>";
  i1 = index($0,searchstring);
  if (i1 == 0) {
    searchstring = "<h3>";
    i1 = index($0,searchstring);
  }
  if (i1 == 0) {
    searchstring = ">";
    i1 = index($0,searchstring);
  }
  area = substr($0,i1+length(searchstring));
  i2 = index(area," Area<");
  area = substr(area,1,i2 - 1);
  printf("0-areadefinition:%s\n", area);
  next;
}
/-charter.html/ {
  #
  # Process this WG
  #
  if (area != "") {
    i1string = "html.charters/";
    i1 = index($0,i1string);
    if (i1 == 0) {
      i1string = "/charter/";
      i1 = index($0,i1string);
    }
    wg = substr($0,i1 + length(i1string));
    i2 = index(wg,"-charter.html");
    wg = substr(wg,1,i2 - 1);
    printf("0-wgareadefinition:%s:%s\n", area, wg);
  }
}
/^.*<.td>$/ {
  #
  # Process this WG
  #
  if (area != "") {
    i1string = "/wg/";
    i1 = index($0,i1string);
    wg = substr($0,i1 + length(i1string));
    i2 = index(wg,"/");
    wg = substr(wg,1,i2 - 1);
    printf("0-wgareadefinition:%s:%s\n", area, wg);
  }
}
/.*/ {
  next;
}
END {
}
' > "$wgs"

###
### Process old WG data
###
### Emits 0-oldwgareadefinition:AREA:WG records.
###

oldwgs=/tmp/oldindex.txt
rm -f "$oldwgs"
debugmsg authorstats: parsing old wg data
$awk < "$oldwgdata" '
BEGIN {
  area = "";
}
/.* Area<.[hH]2>/ {
  #
  # NOTE(review): the search literal was unreadable in the reviewed copy
  # of this file. The 4-character offset below and the rule pattern
  # imply an opening h2 tag; it is located case-insensitively because
  # the pattern accepts both cases. Confirm against the upstream source.
  #
  i1 = index(tolower($0),"<h2>");
  area = substr($0,i1+4);
  i2 = index(area," Area<");
  area = substr(area,1,i2 - 1);
  #printf("9-debug: saw area %s on line %s\n", area, $0);
}
/-charter.html/ {
  #printf("9-debug: saw potential wg on line %s\n", $0);
  if (area != "") {
    i1 = index($0,"HREF=");
    wg = substr($0,i1 + 5 + 1 );
    i2 = index(wg,"-charter.html");
    wg = substr(wg,1,i2 - 1);
    printf("0-oldwgareadefinition:%s:%s\n", area, wg);
  }
}
/.*/ {
  next;
}
END {
}
' > "$oldwgs"

###
### Process IAB data
###
### Emits z3-iab:NAME records for the lines between "IAB MEMBERS" and
### "EX-OFFICIO".
###

debugmsg authorstats: parsing iab data
iab=/tmp/iab.txt
rm -f "$iab"
$awk < "$iabdata" '
BEGIN {
  iniab = 0;
}
/IAB MEMBERS/ {
  iniab = 1;
  next;
}
/EX-OFFICIO/ {
  iniab = 0;
  next;
}
/vacancy/ {
  next;
}
/blue-dot/ {
  if (iniab) {
    z = $0;
    gsub(/^.*[<]b[>]/,"",z);
    gsub(/[<].b[>].*$/,"",z);
    printf("z3-iab:%s\n",z);
  }
  next;
}
/.*/ {
  next;
}
END {
}' |
  $sed -e 's/ä/a/g' -e 's/ö/o/g' > "$iab"

###
### Process IAOC data
###
### Emits z4-iaoc:NAME records for the list items between
### "IAOC Membership" and "IAOC Responsibilities".
###

debugmsg authorstats: parsing iaoc data
iaoc=/tmp/iaoc.txt
rm -f "$iaoc"
$awk < "$iaocdata" '
BEGIN {
  iniaoc = 0;
}
/IAOC Membership/ {
  iniaoc = 1;
  #printf("starting...\n");
}
/IAOC Responsibilities/ {
  iniaoc = 0;
  #printf("ending...\n");
}
/^[<]li[>] .*,/ {
  #printf("candidate: %s\n", $0);
  if (iniaoc) {
    z = $0;
    z = substr(z,6);
    z = substr(z,1,index(z,",") - 1);
    printf("z4-iaoc:%s\n", z);
  }
  next;
}
/.*/ {
  next;
}
END {
}' |
  $sed 's/Kurtis Lindquist/Kurtis Lindqvist/' > "$iaoc"

###
### Process chair data
###
### Emits z1-ad:AREA:NAME for people listed directly under an area and
### z2-chair:WG:NAME for people listed under a working group.
###

debugmsg authorstats: parsing chair data
chairs=/tmp/chairdata.txt
rm -f "$chairs"
$awk < "$chairdata" '
BEGIN {
  area = "";
  wg = "";
}
/IETF Working Group Summary .By Area./ {
  next;
}
/-----------------/ {
  next;
}
/^ +WG Mail:/ {
  next;
}
/^ +To Join:/ {
  next;
}
/^ +In Body:/ {
  next;
}
/^ +Archive:/ {
  next;
}
/^[A-Z].* Area .[a-z]+.$/ {
  pos = index($0," Area ");
  area = substr($0,1,pos - 1);
  wg = "";
  next;
}
/^[A-Z].* [(].*[)]$/ {
  z = $0;
  gsub(/^.*[(]/,"",z);
  gsub(/[)]$/,"",z);
  wg = z;
}
/^ +(([A-Za-z .0-9():])|(-))+ [<].*[>]$/ {
  z = $0;
  gsub(/ +Chair[s()]*: +/,"",z);
  gsub(/^ +/,"",z);
  gsub(/ [<].*$/,"",z);
  if (wg == "") {
    printf("z1-ad:%s:%s\n", area, z);
  } else {
    printf("z2-chair:%s:%s\n", wg, z);
  }
  next;
}
END {
}
' |
  $sed -e 's/Kurt Zeilenga/Kurt D. Zeilenga/' \
       -e 's/Russ Housley/Russell Housley/' \
       -e 's/Gregory M[.] Lebovitz/Gregory Lebovitz/' > "$chairs"

###
### Process population data
###
### Emits 0-population:COUNTRY:POPULATION:LINE records.  The page is
### lower-cased and stripped of apostrophes before it reaches awk; a copy
### of that preprocessed text is kept in /tmp/poptmp.txt.
###
### NOTE(review): the first sed expression uses "+" without -E, so with a
### POSIX basic regex it only matches a literal "+".  It is left as is
### because enabling -E would change which country names are produced.
###

pops=/tmp/popdata.txt
rm -f "$pops"
cat "$popdata" |
  tr -d "'" |
  tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz |
  $sed -e 's/[Ff]lag[ _]+of[ _]+//g' \
       -e 's/[Ff]lag of //g' \
       -e 's/svg:/:/g' |
  grep -F -v 'cite_ref-overseas_france' | # avoid overriding france
  tee /tmp/poptmp.txt |
  $awk '
function isalpha(c) {
  return((c >= "a" && c <= "z") || (c >= "A" && c <= "Z"));
}
function findchar(c) {
  for (y = 1; y <= 255; y++) {
    buf = sprintf("%c", y);
    if (buf == c) return(y);
  }
  return(0);
}
function printstring(what,s) {
  printf("# debug 4: %s\n", what);
  for (h = 1; h <= length(s); h++) {
    printf("# %s (%d)\n", substr(s,h,1), findchar(substr(s,h,1)));
  }
}
BEGIN {
  inskip=1;
  #
  # Not all names in Wikipedia are in the same form
  # as the ones used by getauthors. Map the special
  # ones so that they match.
  #
  mapit["serbia"] = "serbia and montenegro";
  mapit["belgium civil"] = "belgium";
  mapit["the czech republic"] = "czech republic";
  mapit["the solomon islands"] = "solomon islands";
  mapit["the peoples republic of china"] = "china";
  mapit["peoples republic of china"] = "china";
  mapit["the united states"] = "usa";
  mapit["united states"] = "usa";
  mapit["the united kingdom"] = "united kingdom";
  mapit["republic of ireland"] = "ireland";
  mapit["netherlands"] = "the netherlands";
  mapit["philippines"] = "the philippines";
  mapit["the republic of china"] = "taiwan";
  mapit["republic of china"] = "taiwan";
  mapit["the united arab emirates"] = "united arab emirates";
  mapit["the central african republic"] = "central african republic";
  mapit["the gambia"] = "gambia";
  mapit["the comoros"] = "comoros";
  mapit["the bahamas"] = "bahamas";
  mapit["the vatican city"] = "vatican city";
  mapit["christmas_island"] = "christmas island";
}
/^.td.1..td.$/ {
  # the rank-1 cell marks the start of the table; stop skipping
  inskip = 0;
  next;
}
/^.td align="left"./ {
  if (!inskip) {
    # collect the letters, spaces, "-" and "_" of the title= attribute
    s = $0;
    while (length(s) > 0 && substr(s,1,6) != "title=") s = substr(s,2);
    s = substr(s,8);
    n = "";
    while (length(s) > 0 && substr(s,1,1) != sprintf("%c",34)) {
      if (isalpha(substr(s,1,1)) ||
          substr(s,1,1) == " " ||
          substr(s,1,1) == "-" ||
          substr(s,1,1) == "_") {
        n = n substr(s,1,1);
      }
      s = substr(s,2);
    }
    country = tolower(n);
    gsub(/[.]*svg/,"",country);
    # printf("# debug 1: %s - %s\n", n, country);
    # printf("# debug 2: %s - %s\n", country, mapit[country]);
    #   country == "republic of ireland");
    # printstring("input", country);
    # printstring("const", "republic of ireland");
    if (mapit[country] != "") country = mapit[country];
  }
  next;
}
/^.td.[0-9][0-9.,e+]+..td.$/ {
  if (inskip == 0 && country != "") {
    p = substr($0,5);
    p = substr(p,1,length(p) - 5);
    gsub(/,/,"",p);
    pv = p + 0;
    printf("0-population:%s:%d:%d\n",country,pv,NR);
    # printf("# debug 4: %s\n", country);
    country = "";
  }
  next;
}
/^.td.[0-9][0-9,]+.sup id.*..sup...td.$/ {
  if (inskip == 0 && country != "") {
    p = substr($0,5);
    p = substr(p,1,index(p,"sup") - 2);
    gsub(/,/,"",p);
    printf("0-population:%s:%s:%d\n",country,p,NR);
    # printf("# debug 5: %s\n", country);
    country = "";
  }
  next;
}
/^.li id=.cite_note-aus-[0-9]+...b..a href=.#cite_ref-aus_[0-9]+-[0-9]+.....a...b. includes .a href=..wiki.christmas.island. title=.christmas island..christmas island..a. .[0-9,]+., / {
  pv = $0;
  sub(/, .*$/,"",pv);
  sub(/^.*[(]/,"",pv);
  sub(/[)].*$/,"",pv);
  sub(/,/,"",pv);
  pv = pv + 0;
  printf("0-population:christmas island:%d:%d\n", pv, NR);
  next;
}
/.li..span class=.citation wikicite. id=.*...b..a href=..ref_n5.....a...b...span. includes .a href=..wiki.christmas_island. title=.christmas island..christmas island..a. [(]/ {
  x = $0;
  sub(/^.*[(]/,"",x);
  sub(/[)].*$/,"",x);
  sub(/,/,"",x);
  printf("0-population:christmas island:%s:%d\n",x,NR);
  country = "";
  next;
}
/.*/ {
  next;
}
' > "$pops"

###
### Process RFC data
###
### Nothing is parsed here at present; only a stale output file is removed.
###

debugmsg authorstats: parsing rfc data
rfcs=/tmp/rfcdata.txt
rm -f "$rfcs"

###
### Safety check to avoid bad characters etc, then process the database
###
### All record files are merged, sorted and stripped of characters that
### are not wanted in the report; a copy of the merged input is kept in
### /tmp/inputdb.txt.  authorstats.awk writes the report to stdout.
###

debugmsg authorstats: process the database
cat "$wgs" "$pops" "$chairs" "$iab" "$iaoc" "$hindexdata" "$authordata" |
  sort |
  tr -d '*?"{}/%";&<>\341\207\351\355' |
  tr -d "'" |
  tee /tmp/inputdb.txt |
  $awk -v topic="$topic" \
       -v doctype="$doctype" \
       -v debug="$debug" \
       -f "$progdir/authorstats.awk"

###
### Cleanup and exit
###

if [ "$debug" -eq 1 ]; then
  echo "Debug: Base temp = ${tmpbase}"
else
  rm -f -- "${tmpbase}"-*
fi

exit 0