#!/bin/bash

###
### AUTHORSTATS - Get author statistics from Internet Drafts and RFCs
###
### Version 6.0.1
###
### Written in 2005-2008 by Jari Arkko
### Donated to the public domain.
###
###   2.5.0   Cleaned up HTML
###   2.5.1   Added support for authors moving, fixed country and company capitalization
###   2.5.2   Fixed monthly publication graph style
###   2.8.0   Fixed a bug in chair data retrieval
###   2.9.0   Added niger
###   2.9.2   Fixed bug with dashboard link for people with multi-part names
###   3.0.0   Integrated H-index calculation from Lars Eggert
###   4.0.0   New, improved graphics from Lars Eggert
###   4.0.1   Added a special case for christmas island population
###   6.0.1   Extra dots removed from personal pages, date format changed per Tero's request, etc.
###
### Usage:
###
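###   authorstats [options] draft-foo-00.txt ...
###
###   Options must precede the file names: --nocitations, --debug,
###   --rfcfilesdir DIR, --doctype TYPE, --topic TEXT, and
###   --data, --hindex, --wgs, --oldwgs, --rfcs, --chairs, --iab,
###   --iaoc, --population FILE to supply an existing data file
###   instead of collecting or downloading it.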
###

###
### Initialize
###

# Defaults for everything the command-line options can override.
nocitations=0            # --nocitations sets this to 1; passed on to authorstats.awk
rfcfilesdir=.            # directory the RFC text files are read from
debug=0                  # --debug sets this to 1
doctype=draft            # passed on to authorstats.awk

# Input data files.  The value "none" means that no file was supplied on
# the command line, so the data is collected or downloaded further below.
authordata=none
hindexdata=none
wgdata=none
oldwgdata=none
rfcdata=none
chairdata=none
iabdata=none
iaocdata=none
popdata=none

topic="active I-Ds"      # passed on to authorstats.awk
tmpbase=/tmp/$$-tmp      # prefix of the per-run temporary files (removed at exit)

# Options handed to every wget call; "-O -" makes wget write to stdout.
wargs="--quiet --tries=1 --timeout=30 --no-cache --no-check-certificate -O -"

# Directory this script lives in.  "$0" is quoted so that an installation
# path containing spaces is not split into several dirname operands.
progdir=$(dirname -- "$0")
genders=$progdir/authorstats.data
firstnamedb=$HOME/LME/IETF/GenderTools/GenderChecker/names.csv

###
### Process options
###

# Consume the leading options from the positional parameters.  Parsing
# stops at the first argument that is not an option; what remains in "$@"
# afterwards is the list of documents to analyse.
#
# Options that take a value are checked for a missing value, and all
# diagnostics go to stderr so that they do not end up in the generated
# output.  Either error terminates the script with status 1.
while [ $# -gt 0 ]
do
  case "$1" in

    (--nocitations)
      nocitations=1
      shift;;

    (--debug)
      debug=1
      shift;;

    (--rfcfilesdir|--doctype|--data|--hindex|--wgs|--oldwgs|--rfcs|--chairs|--iab|--iaoc|--population|--topic)
      if [ $# -lt 2 ]
      then
        echo "authorstats: Option $1 requires an argument -- exit" >&2
        exit 1
      fi
      case "$1" in
        (--rfcfilesdir) rfcfilesdir=$2;;
        (--doctype)     doctype=$2;;
        (--data)        authordata=$2;;
        (--hindex)      hindexdata=$2;;
        (--wgs)         wgdata=$2;;
        (--oldwgs)      oldwgdata=$2;;
        (--rfcs)        rfcdata=$2;;
        (--chairs)      chairdata=$2;;
        (--iab)         iabdata=$2;;
        (--iaoc)        iaocdata=$2;;
        (--population)  popdata=$2;;
        (--topic)       topic=$2;;
      esac
      shift 2;;

    (-*)
      echo 'authorstats: Unrecognized option -- exit' >&2
      exit 1;;

    (*)
      # first document name: leave it and everything after it in "$@"
      break;;

  esac
done

###
### from Henrik's idnits
###

# lookfor DEFAULT CANDIDATE...
#
# Print the full path of the first CANDIDATE program found on $PATH.
# If none of the candidates is found, print DEFAULT instead.
#
# Arguments: $1    - value to print when nothing is found
#            $2... - program names to try, in order of preference;
#                    empty names are skipped
# Outputs:   one line on stdout
#
# The bash builtin "type -P" performs the PATH search, so the function
# neither depends on an external "which" nor mistakes an alias or shell
# function for a program.  All working variables are local.
lookfor() {
    local default=$1 candidate found
    shift
    for candidate in "$@"; do
        [ -n "$candidate" ] || continue
        found=$(type -P -- "$candidate" 2>/dev/null) || continue
        # the ".exe" variant covers platforms that append the suffix
        if [ -x "$found" ] || [ -x "$found.exe" ]; then
            printf '%s\n' "$found"
            return 0
        fi
    done
    printf '%s\n' "$default"
}

# Select the sed and awk programs used by the rest of the script.  A value
# already present in $sed / $awk is tried first, then the GNU variants,
# then the traditional names; the first lookfor argument is the fall-back.
sed=$(lookfor gsed $sed gsed sed)
awk=$(lookfor gawk $awk gawk nawk awk)

case $awk in
  (*gawk*)
    # too noisy, but flags many nits:  awk="$awk -O --lint --lint-old"
    # awk="$awk -O"  # cannot be used, will complain on some systems
    nop=nop;;
esac

###
### Get author data from the drafts
###

# Unless an author data file was supplied with --data, run getauthors on
# the documents left on the command line and keep its output in a
# temporary file.  Lines mentioning "mcsweeney" (any case) are dropped.
#
# "$@" hands every document name to getauthors as one argument, so names
# containing spaces or glob characters are not split or expanded.
if [ "x$authordata" = xnone ]
then
  authordata=${tmpbase}-a
  getauthors "$@" | grep -iv mcsweeney > "$authordata"
fi

###
### Get citation data from the documents
###

# Unless citation data was supplied with --hindex, build it from the RFC
# text files in $rfcfilesdir.  Two kinds of records are appended to
# $hindexdata:
#
#   0-cites:<rfc>:<n1>,<n2>,...   RFC numbers cited by document <rfc>
#   0-citations:<n>:<count>       number of documents that cite RFC <n>
#
# The work is done in a subshell so that the cd, the locale change and the
# shell option do not leak into the rest of the script.  A failure inside
# the subshell aborts the whole script.
if [ "x$hindexdata" = xnone ]
then
  hindexdata=${tmpbase}-h
  rm -f -- "$hindexdata"
  touch "$hindexdata"
  (unset LANG
  shopt -s extglob
  maxrfc=0

  # The RFC documents named in the author data.  This is read before the
  # cd below so that a relative --data path still refers to the same file.
  rfcdocs=$(grep -E "^rfc[[:digit:]]" "$authordata" | cut -f1 -d: | sort -r | uniq)

  cd "$rfcfilesdir" || exit 1

  # find citations in documents
  for f in $rfcdocs; do
    # which RFCs are cited in document f (other than itself)?
    frfcnr=$(echo "$f" | $sed -E 's/rfc([[:digit:]]+).*/\1/')
    [ ${seen[$frfcnr]} ] && continue
    selfnr=${frfcnr##+(0)} # own number without leading zeros
    # The self-reference filter matches the whole number only; a plain
    # substring match would also discard e.g. RFC 1200 while scanning RFC 20.
    cited=$(tr '[:lower:]' '[:upper:]' < "$f" | tr -d '\n\r' |
            grep -Eoa 'RFC[[:space:]-]*[[:digit:]]{1,4}' |
            tr -d 'RFC -' | grep -Ev "^(0*${selfnr}|0+)\$" |
            grep -Ev '(65125)|(10336)|(8401)|(44520)|(8222)' | # special erroneous cases
            sort | uniq)
    echo 0-cites:$frfcnr:$(echo $cited | tr ' ' ,) >> "$hindexdata"
    seen[$frfcnr]=1

    # for all of those, increase their citation count
    for rfc in $cited; do
      origrfc=$rfc
      rfc=${rfc##+(0)} # remove leading zeros
      if [ "x$rfc" = x ]
      then
        echo "authorstats: problem with $origrfc while doing $f, cited = $cited"
        exit 1
      fi
      citcount[$rfc]=$((${citcount[$rfc]} + 1))
      if (( $frfcnr > $maxrfc )); then
        maxrfc=$frfcnr
        echo "setting maxrfc to $maxrfc"
      fi
    done
  done

  # dump the citation counts
  for (( n = 1 ; n < $maxrfc ; n++ )); do
    echo 0-citations:$n:${citcount[$n]:-0} >> "$hindexdata"
  done) || exit 1
fi

###
### Test that we have author and hindex data
###

# Stop early when the author data file is missing or empty; nothing
# useful can be computed without it.  The file name is quoted so that an
# empty name fails the test instead of turning it into the always-true
# one-argument form of "[", and the diagnostic goes to stderr.
if [ ! -s "$authordata" ]
then
  echo 'There is no author data in file '"$authordata"' -- exit' >&2
  exit 1
fi

###
### Get WG data from the IETF site
###

# fetchifnone VARNAME FILE URL
#
# If the variable named VARNAME still holds the placeholder "none" (no
# file was given on the command line), download URL into FILE and store
# FILE in that variable.  Otherwise do nothing.
#
# Globals:   wargs (read) - wget options
# Arguments: $1 - name of the variable holding the data file name
#            $2 - file the download is written to
#            $3 - URL to fetch
fetchifnone() {
  local varname=$1 target=$2 url=$3
  [ "x${!varname}" = xnone ] || return 0
  printf -v "$varname" '%s' "$target"
  rm -f -- "$target"
  # $wargs holds several options and is left unquoted so that it is split
  wget $wargs "$url" > "$target"
}

fetchifnone wgdata /tmp/wg-dir.html http://www.ietf.org/dyn/wg/charter.html

###
### Get old WG data from the IETF site
###

fetchifnone oldwgdata /tmp/oldindex.html http://www.ietf.org/html.charters/OLD/index.html

###
### Get RFC data from the IETF tools site
###

fetchifnone rfcdata /tmp/rfc_id.txt http://tools.ietf.org/id/rfc_id.txt

###
### Get chair/AD data from the IETF site
###

fetchifnone chairdata /tmp/1wg-summary.txt https://datatracker.ietf.org/wg/1wg-summary.txt

###
### Get IAB data from the IAB site
###

fetchifnone iabdata /tmp/members.html http://www.iab.org/about/members.html

###
### Get IAOC data from the temporary IAOC site
###

if [ "$debug" = 1 ]
then
  echo authorstats: fetching iaoc data
fi

fetchifnone iaocdata /tmp/index.html http://iaoc.ietf.org/

###
### Get country population data from the wikipedia site
###

if [ "$debug" = 1 ]
then
  echo authorstats: fetching population data
fi

fetchifnone popdata /tmp/List_of_countries_by_population http://en.wikipedia.org/wiki/List_of_countries_by_population

###
### Process WG data
###

# The parsed area / working-group records are written to $wgs; any copy
# left over from an earlier run is removed first.
wgs=/tmp/wgdata.txt
rm -f $wgs
# NOTE(review): cleanwgs is not assigned anywhere in the visible script, so
# this expands to a bare "rm -f" and removes nothing -- confirm whether the
# line can be dropped.
rm -f $cleanwgs

if [ $debug = 1 ]
then
  echo authorstats: parsing wg data
fi

# the following should really be updated to use the datatracker API:
#     https://datatracker.ietf.org/api/v1/group/group/?type=area&state=active&format=json&limit=999
#     omitting "format" yields XML, and setting type=wg or
#     type=rg returns the working/research groups, etc.

# Scan the WG directory page ($wgdata) one line at a time and emit:
#
#   0-areadefinition:<area>          for every area heading (h2/h3 lines
#                                    ending in " Area (xxx)")
#   0-wgareadefinition:<area>:<wg>   for every WG link seen after an area
#                                    heading; the WG name is cut out of a
#                                    "...-charter.html" link or out of a
#                                    "/wg/<name>/" link in a table cell
#
# WG links that appear before the first area heading are ignored, because
# the awk variable "area" is still empty at that point.  Several rule
# variants exist side by side, presumably to cope with different versions
# of the page layout (TODO confirm).
$awk < $wgdata '
BEGIN {
  area = "";
}

/.* Area [(][a-z]+[)]<.h[23]>$/ {

  #
  # Process this area
  #

  searchstring = "<h2>";
  i1 = index($0,searchstring);
  if (i1 == 0) {
    searchstring = "<h3>";
    i1 = index($0,searchstring);
  }
  if (i1 == 0) {
    searchstring = ">";
    i1 = index($0,searchstring);
  }
  area = substr($0,i1+length(searchstring));
  i2 = index(area," Area");
  area = substr(area,1,i2 - 1);
  printf("0-areadefinition:%s\n", area);
  next;
}

/ *<h2 class=.anchor-target. .* Area ([a-z]*)<.h2>$/ {

  #
  # Process this area
  #

  searchstring = ">";
  i1 = index($0,searchstring);
  area = substr($0,i1+length(searchstring));
  i2 = index(area," Area (");
  area = substr(area,1,i2 - 1);
  printf("0-areadefinition:%s\n", area);
  next;
}

/-charter.html/ {

  #
  # Process this WG
  #

  if (area != "") {
    i1string = "html.charters/"; 
    i1 = index($0,i1string);
    if (i1 == 0) {
      i1string = "/charter/";
      i1 = index($0,i1string);
    }
    wg = substr($0,i1 + length(i1string));
    i2 = index(wg,"-charter.html");
    wg = substr(wg,1,i2 - 1);
    printf("0-wgareadefinition:%s:%s\n", area, wg);
  }
}

/^<tr><td.*<a href=".wg..+.">.*<.td>$/ {

  #
  # Process this WG
  #

  if (area != "") {
    i1string = "/wg/"; 
    i1 = index($0,i1string);
    wg = substr($0,i1 + length(i1string));
    i2 = index(wg,"/");
    wg = substr(wg,1,i2 - 1);
    printf("0-wgareadefinition:%s:%s\n", area, wg);
  }
}

/^[ 	]*<td.*<a href=".wg..+.">.*<.td>$/ {

  #
  # Process this WG
  #

  if (area != "") {
    i1string = "/wg/"; 
    i1 = index($0,i1string);
    wg = substr($0,i1 + length(i1string));
    i2 = index(wg,"/");
    wg = substr(wg,1,i2 - 1);
    printf("0-wgareadefinition:%s:%s\n", area, wg);
  }
}

/.*/ {
  next;
}

END {
}
' > $wgs

###
### Process old WG data
###

# The parsed records for concluded working groups are written to $oldwgs.
# NOTE(review): $oldwgs is not among the files concatenated for the final
# awk run further down in this script -- confirm whether that is intended.
oldwgs=/tmp/oldindex.txt
rm -f $oldwgs

if [ $debug = 1 ]
then
  echo authorstats: parsing old wg data
fi

# Scan the old-WG index page ($oldwgdata).  A line containing
# " Area</h2>" (or "</H2>") sets the current area to the text between
# "<h2>" and " Area<".  Every later line containing "-charter.html"
# yields one record
#
#   0-oldwgareadefinition:<area>:<wg>
#
# where <wg> is the text between HREF=<one character> and
# "-charter.html"; the skipped character is presumably the opening quote
# of the attribute value (TODO confirm against the page source).
# The opening tag is searched for as lower-case "<h2>" and the link as
# upper-case "HREF=" only.
$awk < $oldwgdata '
BEGIN {
  area = "";
}

/.* Area<.[hH]2>/ {
  i1 = index($0,"<h2>");
  area = substr($0,i1+4);
  i2 = index(area," Area<");
  area = substr(area,1,i2 - 1);
  #printf("9-debug: saw area %s on line %s\n", area, $0);
}

/-charter.html/ {
  #printf("9-debug: saw potential wg on line %s\n", $0);
  if (area != "") {
    i1 = index($0,"HREF=");
    wg = substr($0,i1 + 5 + 1 );
    i2 = index(wg,"-charter.html");
    wg = substr(wg,1,i2 - 1);
    printf("0-oldwgareadefinition:%s:%s\n", area, wg);
  }
}

/.*/ {
  next;
}

END {
}
' > $oldwgs


###
### Process IAB data
###

if [ "$debug" = 1 ]
then
  echo authorstats: parsing iab data
fi

# Extract the IAB member names from the members page ($iabdata) into $iab,
# one "z3-iab:<name>" record per member.
#
# Only lines containing "blue-dot" that lie between the "IAB MEMBERS" and
# "EX-OFFICIO" markers are used; the name is the text inside <b>...</b>.
# Lines containing "vacancy" are skipped.  The HTML entities &auml; and
# &ouml; are then replaced by plain "a" and "o".
#
# The file names are quoted so that a path given with --iab may contain
# spaces.  $awk and $sed stay unquoted: they are command words.
iab=/tmp/iab.txt
rm -f -- "$iab"
$awk < "$iabdata" '
BEGIN {
  iniab = 0;
}
/IAB MEMBERS/ {
  iniab = 1;
  next;
}
/EX-OFFICIO/ {
  iniab = 0;
  next;
}
/vacancy/ {
  next;
}
/blue-dot/ {
  if (iniab) {
    z = $0;
    gsub(/^.*[<]b[>]/,"",z);
    gsub(/[<].b[>].*$/,"",z);
    printf("z3-iab:%s\n",z);
  }
  next;
}
/.*/ {
  next;
}
END {
}' |
$sed -e 's/&auml;/a/g' -e 's/&ouml;/o/g' > "$iab"

###
### Process IAOC data
###

if [ "$debug" = 1 ]
then
  echo authorstats: parsing iaoc data
fi

# Extract the IAOC member names from the IAOC page ($iaocdata) into $iaoc,
# one "z4-iaoc:<name>" record per member.
#
# Members are the "<li> Name, ..." lines found between the lines
# containing "IAOC Membership" and "IAOC Responsibilities"; the name is
# the text between "<li> " and the first comma.  One known misspelling is
# corrected afterwards.
#
# The file names are quoted so that a path given with --iaoc may contain
# spaces.  $awk and $sed stay unquoted: they are command words.
iaoc=/tmp/iaoc.txt
rm -f -- "$iaoc"
$awk < "$iaocdata" '
BEGIN {
  iniaoc = 0;
}

/IAOC Membership/ {
  iniaoc = 1;
  #printf("starting...\n");
}

/IAOC Responsibilities/ {
  iniaoc = 0;
  #printf("ending...\n");
}

/^[<]li[>] .*,/ {
  #printf("candidate: %s\n", $0);
  if (iniaoc) {
    z = $0;
    z = substr(z,6);
    z = substr(z,1,index(z,",") - 1);
    printf("z4-iaoc:%s\n", z);
  }
  next;
}

/.*/ {
  next;
}
END {
}' |
$sed 's/Kurtis Lindquist/Kurtis Lindqvist/' > "$iaoc"

###
### Process chair data
###

if [ $debug = 1 ]
then
  echo authorstats: parsing chair data
fi

# $chairs receives the parsed AD / chair records.  $cleanchairs is only
# defined here; it is filled by the clean-up step that follows this block.
chairs=/tmp/chairdata.txt
cleanchairs=/tmp/cleanchairdata.txt
rm -f $chairs
# Parse the plain-text WG summary ($chairdata):
#
#   - the title line, separator lines and the "WG Mail:", "To Join:",
#     "In Body:" and "Archive:" fields are skipped;
#   - a line of the form "<Name> Area (xxx)" starts a new area and resets
#     the current WG;
#   - any other line starting with a capital letter and ending in
#     "(...)" sets the current WG to the text inside the last parentheses;
#   - an indented "Name <address>" line is a person: it is written as
#     "z1-ad:<area>:<name>" while no WG is current, and as
#     "z2-chair:<wg>:<name>" otherwise.  A leading "Chair(s):" label and
#     the "<address>" part are stripped.
#
# The trailing sed commands rewrite three names; presumably this aligns
# them with the spelling used in the author data (TODO confirm).
$awk < $chairdata '
BEGIN {
  area = "";
  wg = "";
}

/IETF Working Group Summary .By Area./ {
  next;
}

/-----------------/ {
  next;
}

#
# Recognise information fields that we dont care about
#

/^ +WG Mail:/ {
  next;
}

/^ +To Join:/ {
  next;
}

/^ +In Body:/ {
  next;
}

/^ +Archive:/ {
  next;
}

#
# Recognise area definitions
#

/^[A-Z].* Area .[a-z]+.$/ {
  pos = index($0," Area ");
  area = substr($0,1,pos - 1);
  wg = "";
  next;
}

#
# Recognise WGs
#

/^[A-Z].* [(].*[)]$/ {
  z = $0;
  gsub(/^.*[(]/,"",z);
  gsub(/[)]$/,"",z);
  wg = z;
}

/^ +(([A-Za-z .0-9():])|(-))+ [<].*[>]$/ {
  z = $0;
  gsub(/ +Chair[s()]*: +/,"",z);
  gsub(/^ +/,"",z);
  gsub(/ [<].*$/,"",z);
  if (wg == "") {
    printf("z1-ad:%s:%s\n", area, z);
  } else {
    printf("z2-chair:%s:%s\n", wg, z);
  }
  next;
}

END {
}
' |
$sed 's/Kurt Zeilenga/Kurt D. Zeilenga/' |
$sed 's/Russ Housley/Russell Housley/' |
$sed 's/Gregory M[.] Lebovitz/Gregory Lebovitz/' |
cat > $chairs

###
### Clean up the chair database so that it can be copied to the net
###

# Strip the first "z2-" and then the first "z1-" from every record, which
# removes the sort prefixes from the copy that is published.  One
# invocation of the sed selected earlier ($sed) replaces the two plain
# "sed" processes, and the file names are quoted.
$sed -e 's/z2-//' -e 's/z1-//' "$chairs" > "$cleanchairs"

###
### Process population data
###

# The parsed population records are written to $pops.
pops=/tmp/popdata.txt
rm -f $pops

if [ $debug = 1 ]
then
  echo authorstats: parsing population data
fi

# /tmp/popdebugs.txt is a trace log: it is restarted here and appended to
# by the awk program below on every run, whether or not --debug is given.
echo "start" > "/tmp/popdebugs.txt"

# Turn the Wikipedia population page ($popdata) into records of the form
#
#   0-population:<country>:<population>:<input line number>
#
# Pre-processing: apostrophes are deleted, A-Z is folded to lower case,
# "flag of" prefixes, "svg:" and "demographics of" are stripped, and lines
# carrying the overseas-France footnote reference are dropped.  A copy of
# the pre-processed text is kept in /tmp/poptmp.txt.
#
# The awk program is a small state machine:
#   - inskip starts at 1 and is cleared by the first table cell holding
#     rank "1", so everything before the table body is ignored;
#   - processcountry() extracts a country name from a left-aligned cell
#     (from a title="..." attribute or a flag_of_ file name), normalises
#     it and maps it through mapit[] to the spelling that, according to
#     the comment in BEGIN, getauthors uses;
#   - the next numeric cell supplies the population for the pending
#     country, after which the country is cleared;
#   - Christmas Island is picked out of a footnote by two dedicated rules.
#
# NOTE(review): the input is lower-cased before the sed commands run, so
# the pattern containing the capitals "S", "T" and "P" (Sao Tome) appears
# unable to match -- confirm.
# NOTE(review): the printf statements placed after "next" in the three
# rank-cell rules can never execute.
cat $popdata |
tr -d "'" |
tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz |
$sed 's/[Ff]lag[ _]+of[ _]+//g' |
$sed 's/[Ff]lag of //g' |
$sed 's/svg:/:/g' |
$sed 's/S.o Tome and Príncipe/st. tome and principe/g' |
$sed 's/[dD]emographics of //g' |
fgrep -v 'cite_ref-overseas_france' | # avoid overriding france
tee /tmp/poptmp.txt |
$awk '
function isalpha(c) {
  return((c >= "a" && c <= "z") ||
         (c >= "A" && c <= "Z"));
}

function findchar(c) {
  for (y = 1; y <= 255; y++) {
    buf = sprintf("%c", y);
    if (buf == c) return(y);
  }
  return(0);
}

function printstring(what,s) {
  printf("# debug 4: %s\n", what);
  for (h = 1; h <= length(s); h++) {
      printf("#          %s (%d)\n", substr(s,h,1), findchar(substr(s,h,1)));
  }
}

function processcountry() {
  printf("item 2 (inskip %u) %s\n", inskip, $0) >> "/tmp/popdebugs.txt";
  if (!inskip) {
    s = $0;
    if (s ~ /title=/) {
      printf("  title\n") >> "/tmp/popdebugs.txt";
      while (length(s) > 0 && substr(s,1,6) != "title=") s = substr(s,2);
      s = substr(s,8);
    } else if (s ~/flag_of_/) {
      printf("  flag\n") >> "/tmp/popdebugs.txt";
      while (length(s) > 0 && substr(s,1,8) != "flag_of_") s = substr(s,2);
      printf("  after scan %s\n", s) >> "/tmp/popdebugs.txt";
      s = substr(s,9);
      printf("  after removal %s\n", s) >> "/tmp/popdebugs.txt";
    } else {
      printf("  not recognized\n") >> "/tmp/popdebugs.txt";
      return;
    }
    n = "";
    while (length(s) > 0 && substr(s,1,1) != sprintf("%c",34)) {
      if (isalpha(substr(s,1,1)) || substr(s,1,1) == " " ||
          substr(s,1,1) == "-" || substr(s,1,1) == "_") {
        n = n substr(s,1,1);
      }
      s = substr(s,2);
    }
    country = tolower(n);
    printf("  country remains as %s\n", country) >> "/tmp/popdebugs.txt";
    gsub(/[.]*svg.*$/,"",country);
    gsub(/_/," ",country);
    printf("  country remains as %s\n", country) >> "/tmp/popdebugs.txt";
#   printf("# debug 1: %s - %s\n", n, country);
#   printf("# debug 2: %s - %s\n", country, mapit[country]);
#           country == "republic of ireland");
#    printstring("input", country);
#    printstring("const", "republic of ireland");
    if (mapit[country] != "") country = mapit[country];
  }
}

BEGIN {
  inskip = 1;
  
  #
  # Not all names in Wikipedia are in the same form
  # as the ones used by getauthors. Map the special
  # ones so that they match.
  #

  mapit["serbia"] = "serbia and montenegro";
  mapit["belgium civil"] = "belgium";
  mapit["the czech republic"] = "czech republic";
  mapit["the solomon islands"] = "solomon islands";
  mapit["the peoples republic of china"] = "china";
  mapit["peoples republic of china"] = "china";
  mapit["the united states"] = "usa";
  mapit["united states"] = "usa";
  mapit["the united kingdom"] = "united kingdom";
  mapit["republic of ireland"] = "ireland";
  mapit["netherlands"] = "the netherlands";
  mapit["philippines"] = "the philippines";
  mapit["the republic of china"] = "taiwan";
  mapit["republic of china"] = "taiwan";
  mapit["the united arab emirates"] = "united arab emirates";
  mapit["the central african republic"] = "central african republic";
  mapit["the gambia"] = "gambia";
  mapit["the comoros"] = "comoros";
  mapit["the bahamas"] = "bahamas";
  mapit["the vatican city"] = "vatican city";
  mapit["christmas_island"] = "christmas island";
  mapit["saint helena ascension and tristan da cunha"] = "st. helena";
  mapit["georgia country"] = "georgia";

  printf("begin pop process\n") >> "/tmp/popdebugs.txt";
}

/^.th.1$/ {
  inskip = 0;
  next;
  printf("item 1_ %s\n", $0) >> "/tmp/popdebugs.txt";
}

/^.td.1..td.$/ {
  inskip = 0;
  next;
  printf("item 1a %s\n", $0) >> "/tmp/popdebugs.txt";
}

/^.td.1$/ {
  inskip = 0;
  next;
  printf("item 1b %s\n", $0) >> "/tmp/popdebugs.txt";
}

/^.td align="left"./ {
  processcountry();
  next;
}

/^.td style="text-align: *left;"./ {
  processcountry();
  next;
}

/^href=".wiki.[a-z ]+" title="[A-Za-z ]+">[A-Za-z ]+<.a><.td>/ {
  processcountry();
  next;
}

/^.td.[0-9][0-9.,e+]+..td.$/ {
  printf("item 3 %s\n", $0) >> "/tmp/popdebugs.txt";
  if (inskip == 0 && country != "") {
    p = substr($0,5);
    p = substr(p,1,length(p) - 5);
    gsub(/,/,"",p);
    pv = p + 0;
    printf("0-population:%s:%d:%d\n",country,pv,NR);
#   printf("# debug 4: %s\n", country);
    country = "";
  }
  next;
}

/^.td style=.text-align:right..[0-9][0-9.,e+]+..td.$/ {
  printf("item 3b i=%u c=%s, %s\n", inskip, country, $0) >> "/tmp/popdebugs.txt";
  if (inskip == 0 && country != "") {
    gr = index($0,">");
    p = substr($0,gr+1);
    p = substr(p,1,length(p) - 5);
    gsub(/,/,"",p);
    pv = p + 0;
    printf("0-population:%s:%d:%d\n",country,pv,NR);
#   printf("# debug 4: %s\n", country);
    country = "";
  }
  next;
}

/^.td.[0-9][0-9,]+..td.$/ {
  printf("processing td %s (%d %s)...\n", $0, inskip, country) >> "/tmp/popdebugs.txt";
  if (inskip == 0 && country != "") {
    p = substr($0,5);
    p = substr(p,1,length(p) - 5);
    gsub(/,/,"",p);
    pv = p + 0;
    printf("0-population:%s:%d:%d\n",country,pv,NR);
#   printf("# debug 4: %s\n", country);
    country = "";
  }
  next;
}

/^.td.[0-9][0-9,]+$/ {
  printf("processing td %s (%d %s)...\n", $0, inskip, country) >> "/tmp/popdebugs.txt";
  if (inskip == 0 && country != "") {
    p = substr($0,5);
    gsub(/,/,"",p);
    pv = p + 0;
    printf("0-population:%s:%d:%d\n",country,pv,NR);
#   printf("# debug 4: %s\n", country);
    country = "";
  }
  next;
}

/^.td.[0-9][0-9,]+.sup id.*..sup...td.$/ {
  printf("item 4 %s\n", $0) >> "/tmp/popdebugs.txt";
  if (inskip == 0 && country != "") {
    p = substr($0,5);
    p = substr(p,1,index(p,"sup") - 2);
    gsub(/,/,"",p);
    printf("0-population:%s:%s:%d\n",country,p,NR);
#   printf("# debug 5: %s\n", country);
    country = "";
  }
  next;
}

/^.li id=.cite_note-aus-[0-9]+...b..a href=.#cite_ref-aus_[0-9]+-[0-9]+.....a...b. includes .a href=..wiki.christmas.island. title=.christmas island..christmas island..a. .[0-9,]+., / {
  printf("item 5 %s\n", $0) >> "/tmp/popdebugs.txt";
  pv = $0;
  sub(/, .*$/,"",pv);
  sub(/^.*[(]/,"",pv);
  sub(/[)].*$/,"",pv);
  sub(/,/,"",pv);
  pv = pv + 0;
  printf("0-population:christmas island:%d:%d\n", pv, NR);
  next;
}

/.li..span class=.citation wikicite. id=.*...b..a href=..ref_n5.....a...b...span. includes .a href=..wiki.christmas_island. title=.christmas island..christmas island..a. [(]/ {
  printf("item 6 %s\n", $0) >> "/tmp/popdebugs.txt";
  x = $0;
  sub(/^.*[(]/,"",x);
  sub(/[)].*$/,"",x);
  sub(/,/,"",x);
  printf("0-population:christmas island:%s:%d\n",x,NR);
  country = "";
  next;
}

/class=.flagicon...img alt.*[a-z]..a...b...td.$/ {
  printf("processing %s (%d %s)...\n", $0, inskip, country) >> "/tmp/popdebugs.txt";
  if (inskip == 0 && country != "") {
    x = substr($0,1,length($0) - 12);
    y = "";
    while (substr(x,length(x),1) != ">") {
      y = substr(x,length(x),1) y;
      x = substr(x,1,length(x) - 1);
    }
    country = tolower(y);
    gsub(/[.]*svg/,"",country);
    if (mapit[country] != "") country = mapit[country];
  }
  next;
}

/.*/ {
  next;
}

' > $pops

###
### Process RFC data
###

# Progress note for the RFC step (debug runs only).
[ $debug = 1 ] && echo authorstats: parsing rfc data

# Nothing is written to the RFC output file in this step; a copy left over
# from an earlier run is merely removed.
rfcs=/tmp/rfcdata.txt
rm -f $rfcs

###
### Safety check to avoid bad characters etc
###

# Progress note for the database step (debug runs only).
[ $debug = 1 ] && echo authorstats: process the database

###
### Process our own config database
###

# Split the script's own database ($genders) into two inputs for the final
# awk run, prefixing every line with "0-":
#
#   /tmp/inputdb0.txt   all configuration lines, i.e. everything that is
#                       not a male:/female:/unknown: entry
#   /tmp/inputdb1.txt   the male:/female:/unknown: gender entries
#
# grep reads the file directly (quoted, so the installation path may
# contain spaces), and "grep -E" replaces the obsolescent egrep.
grep -Ev '(^male:)|(^female:)|(^unknown:)' "$genders" |
$awk '/.*/ { printf("0-%s\n", $0); }' > /tmp/inputdb0.txt

###
### Process our own gender database
###

grep -E '(^male:)|(^female:)|(^unknown:)' "$genders" |
$awk '/.*/ { printf("0-%s\n", $0); }' > /tmp/inputdb1.txt

###
### Process the external names gender database
###

# Convert the optional external first-name database ($firstnamedb, a CSV
# file) into "0-<gender>:<firstname>" records in /tmp/inputdb3.txt.
#
#   - lines containing "Gender" (the CSV header) are dropped;
#   - every line is lower-cased and its double quotes removed; this
#     intermediate form is also kept in /tmp/inputdb2.txt;
#   - field 1 is the first name, field 2 the gender, reduced to "male",
#     "female" or "unisex" when it starts with one of these words.
#
# If the database does not exist, /tmp/inputdb3.txt is left empty.  The
# file name is quoted so that an empty or space-containing value fails the
# -f test instead of turning it into the always-true one-argument form.
if [ -f "$firstnamedb" ]
then
  grep -v Gender "$firstnamedb" |
  $awk '
/.*/ {
  lin = tolower($0);
  gsub(/"/,"",lin);
  printf("%s\n", lin);
}
' | tee /tmp/inputdb2.txt |
  $awk '

BEGIN {
  FS=",";
}

/.*/ {
  firstname=$1;
  gender=$2;
  if (gender ~ /^male/) gender = "male";
  if (gender ~ /^female/) gender = "female";
  if (gender ~ /^unisex/) gender = "unisex";
  printf("0-%s:%s\n", gender, firstname);
}
' > /tmp/inputdb3.txt
else
  : > /tmp/inputdb3.txt
fi

# Copy the external first-name records (/tmp/inputdb3.txt) to
# /tmp/inputdb4.txt, leaving out every first name that already has a
# male/female/unknown entry in our own gender database
# (/tmp/inputdb1.txt), so that our own data takes precedence.
#
# The previous per-line grep searched for "^male:<whole record>" although
# every line of /tmp/inputdb1.txt starts with "0-"; it could therefore
# never match and nothing was filtered.  The comparison now uses the first
# name (field 2 of the external record) against the name part of our own
# "0-<gender>:<name>" records, in a single awk pass instead of one grep
# process per name.
$awk -F: '
FILENAME == ARGV[1] {
  if ($1 == "0-male" || $1 == "0-female" || $1 == "0-unknown")
    known[substr($0, length($1) + 2)] = 1;
  next;
}
!($2 in known) {
  print;
}
' /tmp/inputdb1.txt /tmp/inputdb3.txt > /tmp/inputdb4.txt

###
### Collect all inputs together
###

# Concatenate every intermediate file, sort the records, delete the
# characters that must not reach the awk program (quotes, wildcard and
# markup characters and four specific non-ASCII bytes), keep a copy of
# the result in /tmp/inputdb.txt and feed it to authorstats.awk, which
# produces the statistics.
#
# NOTE(review): the old-WG and RFC output files of the earlier steps are
# not part of this list -- confirm whether that is intended.
#
# All file names and -v values are quoted so that paths and values
# containing spaces reach cat and awk intact.  $awk stays unquoted: it is
# a command word.
cat /tmp/inputdb0.txt \
    /tmp/inputdb1.txt \
    /tmp/inputdb4.txt \
    "$wgs" \
    "$pops" \
    "$chairs" \
    "$iab" \
    "$iaoc" \
    "$hindexdata" \
    "$authordata" |
sort |
tr -d '*?"{}/%";&<>\341\207\351\355' |
tr -d "'" |
tee /tmp/inputdb.txt |

###
### Process the database
###

$awk -v topic="$topic" \
     -v doctype="$doctype" \
     -v debug="$debug" \
     -v nocitations="$nocitations" \
     -f "$progdir/authorstats.awk"

###
### Cleanup and exit
###

# In debug runs the per-run temporary files are kept and their common
# prefix is reported; otherwise they are removed.  The fixed-name files
# under /tmp are left in place either way.
#
# The prefix is quoted and checked for being non-empty, so that an
# unexpected empty value can never turn the pattern into a bare "-*"
# (which rm would moreover read as options).
if [ "$debug" -eq 1 ]
then
  echo 'Debug: Base temp = '"${tmpbase}"
elif [ -n "$tmpbase" ]
then
  rm -f -- "${tmpbase}"-*
fi

exit 0

