#!/usr/bin/mawk -We
# *********************************************************************
# nltable: inserts a unique record identifier into a table.
# Copyright (c) 2001,2006 Carlo Strozzi
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# *********************************************************************
#  $Id: nltable,v 1.3 2006/03/10 11:26:13 carlo Exp $

# Start-up: set the field separators and the default key column name,
# build the byte -> hexadecimal lookup table used by the '-e' encoding
# step, read the NOSQL_* environment settings, parse the command line
# and finally rewrite ARGC/ARGV so that awk reads only the input table.
BEGIN {
  NULL = ""; FS = OFS = "\t"; key_name = "_id"

  # ISO-8859-1 translation lookup tables.
  #
  # Each index is a single byte written as an octal escape; each value
  # is the two-digit lowercase hexadecimal code of that same byte
  # (e.g. "\101" is 0x41). The entries are grouped by hex digit pattern
  # (digits only first, then the xa-xf codes, then the high half), not
  # in byte order.
  #
  # NOTE(review): there are no entries for \000, \045, \047 and
  # \200-\237. The table-body rule replaces any byte that has no entry
  # with "_" -- confirm whether these omissions are intentional.

  delete map

  map["\001"] = "01"
  map["\002"] = "02"
  map["\003"] = "03"
  map["\004"] = "04"
  map["\005"] = "05"
  map["\006"] = "06"
  map["\007"] = "07"
  map["\010"] = "08"
  map["\011"] = "09"		# may not actually occur in the data
  map["\020"] = "10"
  map["\021"] = "11"
  map["\022"] = "12"
  map["\023"] = "13"
  map["\024"] = "14"
  map["\025"] = "15"
  map["\026"] = "16"
  map["\027"] = "17"
  map["\030"] = "18"
  map["\031"] = "19"
  map["\040"] = "20"
  map["\041"] = "21"
  map["\042"] = "22"
  map["\043"] = "23"
  map["\044"] = "24"
  map["\046"] = "26"
  map["\050"] = "28"
  map["\051"] = "29"
  map["\060"] = "30"
  map["\061"] = "31"
  map["\062"] = "32"
  map["\063"] = "33"
  map["\064"] = "34"
  map["\065"] = "35"
  map["\066"] = "36"
  map["\067"] = "37"
  map["\070"] = "38"
  map["\071"] = "39"
  map["\100"] = "40"
  map["\101"] = "41"
  map["\102"] = "42"
  map["\103"] = "43"
  map["\104"] = "44"
  map["\105"] = "45"
  map["\106"] = "46"
  map["\107"] = "47"
  map["\110"] = "48"
  map["\111"] = "49"
  map["\120"] = "50"
  map["\121"] = "51"
  map["\122"] = "52"
  map["\123"] = "53"
  map["\124"] = "54"
  map["\125"] = "55"
  map["\126"] = "56"
  map["\127"] = "57"
  map["\130"] = "58"
  map["\131"] = "59"
  map["\140"] = "60"
  map["\141"] = "61"
  map["\142"] = "62"
  map["\143"] = "63"
  map["\144"] = "64"
  map["\145"] = "65"
  map["\146"] = "66"
  map["\147"] = "67"
  map["\150"] = "68"
  map["\151"] = "69"
  map["\160"] = "70"
  map["\161"] = "71"
  map["\162"] = "72"
  map["\163"] = "73"
  map["\164"] = "74"
  map["\165"] = "75"
  map["\166"] = "76"
  map["\167"] = "77"
  map["\170"] = "78"
  map["\171"] = "79"

  map["\012"] = "0a"		# may not actually occur in the data
  map["\013"] = "0b"
  map["\014"] = "0c"
  map["\015"] = "0d"
  map["\016"] = "0e"
  map["\017"] = "0f"

  map["\032"] = "1a"
  map["\033"] = "1b"
  map["\034"] = "1c"
  map["\035"] = "1d"
  map["\036"] = "1e"
  map["\037"] = "1f"

  map["\052"] = "2a"
  map["\053"] = "2b"
  map["\054"] = "2c"
  map["\055"] = "2d"
  map["\056"] = "2e"
  map["\057"] = "2f"

  map["\072"] = "3a"
  map["\073"] = "3b"
  map["\074"] = "3c"
  map["\075"] = "3d"
  map["\076"] = "3e"
  map["\077"] = "3f"

  map["\112"] = "4a"
  map["\113"] = "4b"
  map["\114"] = "4c"
  map["\115"] = "4d"
  map["\116"] = "4e"
  map["\117"] = "4f"

  map["\132"] = "5a"
  map["\133"] = "5b"
  map["\134"] = "5c"
  map["\135"] = "5d"
  map["\136"] = "5e"
  map["\137"] = "5f"

  map["\152"] = "6a"
  map["\153"] = "6b"
  map["\154"] = "6c"
  map["\155"] = "6d"
  map["\156"] = "6e"
  map["\157"] = "6f"

  map["\172"] = "7a"
  map["\173"] = "7b"
  map["\174"] = "7c"
  map["\175"] = "7d"
  map["\176"] = "7e"
  map["\177"] = "7f"

  map["\240"] = "a0"
  map["\241"] = "a1"
  map["\242"] = "a2"
  map["\243"] = "a3"
  map["\244"] = "a4"
  map["\245"] = "a5"
  map["\246"] = "a6"
  map["\247"] = "a7"
  map["\250"] = "a8"
  map["\251"] = "a9"
  map["\252"] = "aa"
  map["\253"] = "ab"
  map["\254"] = "ac"
  map["\255"] = "ad"
  map["\256"] = "ae"
  map["\257"] = "af"
  map["\260"] = "b0"
  map["\261"] = "b1"
  map["\262"] = "b2"
  map["\263"] = "b3"
  map["\264"] = "b4"
  map["\265"] = "b5"
  map["\266"] = "b6"
  map["\267"] = "b7"
  map["\270"] = "b8"
  map["\271"] = "b9"
  map["\272"] = "ba"
  map["\273"] = "bb"
  map["\274"] = "bc"
  map["\275"] = "bd"
  map["\276"] = "be"
  map["\277"] = "bf"
  map["\300"] = "c0"
  map["\301"] = "c1"
  map["\302"] = "c2"
  map["\303"] = "c3"
  map["\304"] = "c4"
  map["\305"] = "c5"
  map["\306"] = "c6"
  map["\307"] = "c7"
  map["\310"] = "c8"
  map["\311"] = "c9"
  map["\312"] = "ca"
  map["\313"] = "cb"
  map["\314"] = "cc"
  map["\315"] = "cd"
  map["\316"] = "ce"
  map["\317"] = "cf"
  map["\320"] = "d0"
  map["\321"] = "d1"
  map["\322"] = "d2"
  map["\323"] = "d3"
  map["\324"] = "d4"
  map["\325"] = "d5"
  map["\326"] = "d6"
  map["\327"] = "d7"
  map["\330"] = "d8"
  map["\331"] = "d9"
  map["\332"] = "da"
  map["\333"] = "db"
  map["\334"] = "dc"
  map["\335"] = "dd"
  map["\336"] = "de"
  map["\337"] = "df"
  map["\340"] = "e0"
  map["\341"] = "e1"
  map["\342"] = "e2"
  map["\343"] = "e3"
  map["\344"] = "e4"
  map["\345"] = "e5"
  map["\346"] = "e6"
  map["\347"] = "e7"
  map["\350"] = "e8"
  map["\351"] = "e9"
  map["\352"] = "ea"
  map["\353"] = "eb"
  map["\354"] = "ec"
  map["\355"] = "ed"
  map["\356"] = "ee"
  map["\357"] = "ef"
  map["\360"] = "f0"
  map["\361"] = "f1"
  map["\362"] = "f2"
  map["\363"] = "f3"
  map["\364"] = "f4"
  map["\365"] = "f5"
  map["\366"] = "f6"
  map["\367"] = "f7"
  map["\370"] = "f8"
  map["\371"] = "f9"
  map["\372"] = "fa"
  map["\373"] = "fb"
  map["\374"] = "fc"
  map["\375"] = "fd"
  map["\376"] = "fe"
  map["\377"] = "ff"

  # Get local settings.
  nosql_install = ENVIRON["NOSQL_INSTALL"]
  stdout = ENVIRON["NOSQL_STDOUT"]
  stderr = ENVIRON["NOSQL_STDERR"]

  # Set default values if necessary.
  if (nosql_install == NULL) nosql_install = "/usr/local/nosql"
  if (stdout == NULL) stdout = "/dev/stdout"
  if (stderr == NULL) stderr = "/dev/stderr"

  # Command-line parsing. 'i' is not initialised, so the pre-increment
  # makes ARGV[1] the first argument examined. The loop stops at the
  # first empty (or missing) argument. Options that take a value fetch
  # it with a further ++i, so the value is skipped by the loop itself.
  while (ARGV[++i] != NULL) {
    if (ARGV[i] == "-l" || ARGV[i] == "--last") pick_last = 1
    else if (ARGV[i] == "-e" || ARGV[i] == "--encode") enc = 1
    else if (ARGV[i] == "-N" || ARGV[i] == "--no-header") no_hdr = 1
    else if (ARGV[i] == "-K" || ARGV[i] == "--key") key_name = ARGV[++i]
    else if (ARGV[i] == "-d" || ARGV[i] == "--delimiter") dlm = ARGV[++i]
    else if (ARGV[i] == "-i" || ARGV[i] == "--input") i_file = ARGV[++i]
    else if (ARGV[i] == "-o" || ARGV[i] == "--output") o_file = ARGV[++i]
    else if (ARGV[i] == "-s" || ARGV[i] == "--start") start = ARGV[++i]
    else if (ARGV[i] == "-n" || ARGV[i] == "--allow-null") null_ok = 1
    # The three informational options below print a file found under
    # nosql_install and then leave the program with exit status 1.
    else if (ARGV[i] == "-h" || ARGV[i] == "--help") {
       system("grep -v '^#' " nosql_install "/help/nltable.txt")
       exit(rc=1)
    }
    else if (ARGV[i] == "--show-copying") {
       system("cat " nosql_install "/doc/COPYING")
       exit(rc=1)
    }
    else if (ARGV[i] == "--show-warranty") {
       system("cat " nosql_install "/doc/WARRANTY")
       exit(rc=1)
    }
    # Anything else is the name of a column whose value goes into the key.
    else target_cols[ARGV[i]] = ARGV[i]
  }

  # With ARGC set to 1 awk ignores every command-line argument as an
  # input file and reads standard input, unless '-i' named a file.
  ARGC = 1					# Fix argv[]

  if (o_file == NULL) o_file = stdout
  if (i_file != NULL) { ARGV[1] = i_file; ARGC = 2 }
}

#
# Main loop
#

# Header rule: runs once, on the first input line (the column names).
# It records which input position supplies each column name, prepends
# the key column name and, unless '-N' was given, writes the new header
# to o_file with an SOH (\001) marker in front of every column name.
NR == 1 {

  gsub(/\001/,"")			# Remove SOH markers.

  gsub(/ +/,"")				# trim blanks in names.

  # Load the column position arrays:
  #   P[name] = input position chosen for that column name
  #   N[pos]  = column name, set only for the chosen position
  # The table-body rule looks columns up through N[], so a duplicated
  # name must own exactly one N[] entry.
  while (++p <= NF) {

    # Collect each distinct name once. Nothing else in this script
    # reads auto_col; it is kept as it was.
    if (P[$p] == NULL) auto_col = auto_col " " $p

    # Unless '-l' was specified, make sure we pick the first occurrence
    # of duplicated column names (it may happen after a join).

    if (pick_last) {
      # '-l': the last occurrence wins, so forget the position that
      # was recorded for an earlier occurrence of the same name.
      if (P[$p] != NULL) delete N[P[$p]]
      P[$p] = p; N[p] = $p
    }
    else if (P[$p] == NULL) { P[$p] = p; N[p] = $p }
  }

  $0 = key_name OFS $0			# Insert the key column.
  if (!no_hdr) {
     # Mark every column name with SOH. The leading marker is written
     # by the same print statement as the rest of the header so that it
     # goes to o_file too, not to awk's standard output.
     gsub(/\t/,"\t\001")
     print "\001" $0 > o_file
  }
  next
}

# Table body.
# Table body rule: for every data record, build the record identifier
# and write "identifier <TAB> original fields" to o_file.
#
# The identifier is the concatenation, separated by dlm, of the values
# of the columns named on the command line (target_cols, matched through
# N[position] as filled by the header rule). When that yields an empty
# string and '-n' was not given, a generic id (NR-2+start) is used.
# With '-e' every byte of the identifier is replaced by its two-digit
# hex code from map[]; bytes without an entry become "_".
{
  out = k = NULL; j = 0
  for (i=1; i<=NF; i++) {
      if (i > 1) out = out OFS
      if (target_cols[N[i]] != NULL) {
         # An empty key component is reported (naming the offending
         # column) but still used, unless '-n' allows nulls.
         if ($i == NULL && !null_ok)
            print "nltable: invalid " N[i] "=NULL at record No. " \
            (NR-2) > stderr
         if (!j++) k = $i
         else k = k dlm $i
      }
      out = out $i
  }

  # Resort to a generic id if necessary.
  if (k == NULL && !null_ok) k = NR-2+start

  # perform encoding if requested.
  if (enc) {
     # Start from an empty string on every record and append one code
     # per character, so the whole key is encoded and not just its
     # last character.
     s = NULL
     j = split(k,a,"")
     for (i=1; i<=j; i++) {
         code = map[a[i]]
         if (code == "") code = "_"
         s = s code
     }
  }
  else s = k

  print s OFS out > o_file
}

# End of program.
