#!/usr/bin/mawk -We
# *********************************************************************
# muxtotable: turns an unordered sequence of name/value pairs into a
# 	      NoSQL table, allowing for short/long records.
# Copyright (c) 2004,2006 Carlo Strozzi
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# *********************************************************************
# $Id: muxtotable,v 1.3 2006/03/10 11:26:13 carlo Exp $

# Program set-up: establish defaults, parse the command line and work
# out the list of key columns.
#
# Globals set here and read by the rules below:
#   cont     regular expression that marks a continuation record.
#   no_hdr   non-zero when the header line must not be printed.
#   skip_re  regular expression given with '-s' (may be empty).
#   klist    comma-delimited key column list, e.g. ",col1,col2,".
#   nkeys    number of key columns listed in klist.
#   stderr   file name that diagnostics are written to.
BEGIN {
  NULL = "" ; OFS = "\t"; cont = "^@"

  # Get local settings.
  nosql_install = ENVIRON["NOSQL_INSTALL"]
  stdout = ENVIRON["NOSQL_STDOUT"]
  stderr = ENVIRON["NOSQL_STDERR"]

  # Set default values if necessary.
  if (nosql_install == NULL) nosql_install = "/usr/local/nosql"
  if (stdout == NULL) stdout = "/dev/stdout"
  if (stderr == NULL) stderr = "/dev/stderr"

  while (ARGV[++i] != NULL) {
    if (ARGV[i] == "-N" || ARGV[i] == "--no-header") no_hdr = 1
    else if (ARGV[i] == "-c" || ARGV[i] == "--continuation") {
       cont = ARGV[++i]
       # Sanitize input: the continuation string must match literally,
       # so backslash-escape every regular expression operator in it.
       # '+', '{' and '}' are included, otherwise a string such as
       # "a+" would be compiled as "one or more a".
       gsub(/[]\\\$()\[\|\^\*\?\.+{}]/,"\\\\&",cont)
       cont = "^" cont			# anchor at start of record.
    }
    else if (ARGV[i] == "-K" || ARGV[i] == "--key-columns")
					klist = "," ARGV[++i] ","
    else if (ARGV[i] == "-s" || ARGV[i] == "--skip") skip_re = ARGV[++i]
    else if (ARGV[i] == "-h" || ARGV[i] == "--help") {
       system("grep -v '^#' " nosql_install "/help/muxtotable.txt")
       exit(rc=1)
    }
    else if (ARGV[i] == "--show-copying") {
       system("cat " nosql_install "/doc/COPYING")
       exit(rc=1)
    }
    else if (ARGV[i] == "--show-warranty") {
       system("cat " nosql_install "/doc/WARRANTY")
       exit(rc=1)
    }
    # Any argument that is not an option is taken as the table name.
    else if (ARGV[i] !~ /^-/) table = ARGV[i]
  }

  ARGC = 1				# Fix argv[]

  # Infer the input key column names from the file name, unless they
  # have been explicitly specified on the command line.

  if (klist == NULL && table != NULL) {
     klist = table

     # Handle both table and index file names (the index first!). Note
     # that _k and _x were chosen because no real column name can begin
     # with an underscore, so there's no risk of ambiguities. Note also
     # that we need to strip everything up to _x first, as in index
     # files the actual key columns are those that come after _x, and
     # they may not necessarily be the same as the key columns of the
     # main table. That is, given the main table 'table._k.col1.col2',
     # it is quite possible to have an index file name like this:
     # 'table._k.col1.col2._x.col3.col4.col5'

     if (sub(/.*\._x\./,"",klist) || sub(/.*\._k\./,"",klist)) {
	gsub(/\./,",",klist)
	sub(/-.*$/,"",klist)		# remove possible "-suffix".
	klist = "," klist ","
     }
     else klist = NULL			# no key marker in the name.
  }

  if (klist != NULL) {
     # klist has a leading and a trailing comma, so split() yields two
     # empty elements in addition to the key names.
     nkeys = split(klist,a,",") - 2		# count key fields.
  }
}

# Auto-detect whether the input stream is already de-muxed (i.e. it is
# already in table format) in which case pass it unchanged through to
# stdout.

NR == 1 && $0 ~ /^\001/ {
   # A first record starting with \001 flags the input as being in
   # table format already: remember that, and emit the header line
   # unless it was suppressed on the command line.
   tbl = 1
   if (no_hdr) next
   print $0
   next
}

tbl {
   # Pass-through mode: copy the record to the output unchanged.
   print $0
   next
}

# The whole input stream must be stuffed into an array in memory
# for this program to work. This is normally not a problem though,
# as the amount of input is usually small in most practical cases.

# Input in NoSQL 'list' format is accepted too: simply ignore the
# empty lines found in it.
length($0) == 0 {
   next
}

# Handle the '-s' option: drop every record whose first field matches
# the given regular expression.
(skip_re != "") && ($1 ~ skip_re) {
   next
}

# Continuation records: once a column name has been seen, a record
# matching the continuation pattern extends the value stored last.
# The marker is replaced with an escaped newline '\n' so that the
# original formatting can be told from the output.

name != NULL && sub(cont,"\\n") {
  cpos = p[name]			# column of the value being extended.
  value[rec,cpos] = value[rec,cpos] $0
  next
}

# Main rule: every remaining record is a 'name value' pair. The name is
# the first field, the value is whatever follows it on the line.
#
# Globals maintained here:
#   name         name of the column seen last.
#   p[name]      column position assigned to a name.
#   n[col]       name assigned to a column position.
#   col          number of distinct columns seen so far.
#   header       header line, built as new columns show up.
#   rec          number of the record being filled in.
#   value[r,c]   value of column 'c' in record 'r'.
#   kseen[name]  set once a value for key column 'name' has been seen.
{
  # skip invalid names, verbosely.
  if ($1 !~ /^[A-Za-z][A-Za-z0-9_]*$/) {
     print "muxtotable: invalid column name '" $1 "'" > stderr
     next
  }

  name = $1

  # Strip the name and the blanks around it, leaving the value in $0.
  # Field splitting ignores leading blanks, so they must be allowed
  # here too or the name would end up being part of the value.
  sub(/^[ \t]*[A-Za-z0-9_]+[ \t]*/,NULL)

  # new column ?
  if (!p[name]) {
     p[name] = ++col
     n[col] = name
     if (header == NULL) header = "\001" name
     else header = header "\t\001" name
  }

  # If no key columns are known, take the first column that
  # comes in as the one and only key field.
  if (klist == NULL) {
     klist = "," name ","			# take as key field
     nkeys = 1
  }

  # When a new key value comes in:
  #
  # 1) if this is the first record then place the new value
  #    in the relevant column position.
  #
  # 2) if this is not the first record, then if the column
  #    corresponding to the new key value is null then set
  #    it to the new value, otherwise start a new record
  #    and then insert the new value in the relevant position.
  #
  # If non-key fields are encountered in the input stream
  # before at least one value for each key is received, the
  # program stops with an error.

  kre = "," name ","
  if (klist ~ kre) {
     # Count each key column once only, so that a repeated key
     # cannot stand in for one that has not been received yet.
     if (!(name in kseen)) {
	kseen[name] = 1
	if (nkeys) nkeys--
     }
     if (!rec || value[rec,p[name]] != NULL) rec++
  }
  else if (nkeys > 0) {
     print "muxtotable: null input key(s), aborting!" > stderr
     exit(rc=1)
  }
  value[rec,p[name]] = $0
}

# Final stage: print the table that was collected in memory.
END {

  # An error recorded earlier on decides the exit status.
  if (rc) exit(rc)

  # Nothing is left to do if the input was passed through as it was,
  # and the program is completely silent if no valid input was read.
  if (tbl || header == "") exit(rc=0)

  if (!no_hdr) print header

  for (r = 1; r <= rec; r++) {
      line = ""
      for (c = 1; c <= col; c++) {
	  cell = value[r,c]
	  gsub(/\t/,"\\t",cell)			# escape TABs in values
	  if (c == 1) line = line cell
	  else line = line "\t" cell
      }
      if (line ~ /^[\t]*$/) continue		# omit empty records.
      print line
  }
}

# End of program.
