#!/usr/bin/mawk -We
# *********************************************************************
# julian: translate selected date columns from calendar to julian
#	  format, for mathematical computations.
# Copyright (c) 2001,2006 Carlo Strozzi
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# *********************************************************************
# $Id: julian,v 1.4 2006/03/10 11:26:13 carlo Exp $

BEGIN {
  # Tables are tab-separated on input and output; NULL names the
  # empty string used for "not set" tests throughout the program.
  NULL = ""
  FS = OFS = "\t"

  # Local settings come from the environment, each with a built-in
  # fallback when the variable is unset or empty.
  nosql_install = ENVIRON["NOSQL_INSTALL"]
  if (nosql_install == NULL) nosql_install = "/usr/local/nosql"

  stdout = ENVIRON["NOSQL_STDOUT"]
  if (stdout == NULL) stdout = "/dev/stdout"

  stderr = ENVIRON["NOSQL_STDERR"]
  if (stderr == NULL) stderr = "/dev/stderr"

  # Scan the command line.  The scan stops at the first empty (or
  # missing) argument.  Anything that is not a recognized option is
  # taken as the name of a column to convert.
  for (i = 1; ARGV[i] != NULL; i++) {
    opt = ARGV[i]

    if (opt == "-l" || opt == "--last") { pick_last = 1; continue }
    if (opt == "-N" || opt == "--no-header") { no_hdr = 1; continue }

    # These two options consume the following argument as their value.
    if (opt == "-i" || opt == "--input") { i_file = ARGV[++i]; continue }
    if (opt == "-o" || opt == "--output") { o_file = ARGV[++i]; continue }

    # Informational options: show the requested text, then quit with
    # exit status 1 without reading any input.
    if (opt == "-h" || opt == "--help") {
      system("grep -v '^#' " nosql_install "/help/julian.txt")
      rc = 1
      exit(rc)
    }
    if (opt == "--show-copying") {
      system("cat " nosql_install "/doc/COPYING")
      rc = 1
      exit(rc)
    }
    if (opt == "--show-warranty") {
      system("cat " nosql_install "/doc/WARRANTY")
      rc = 1
      exit(rc)
    }

    target_cols[opt] = opt
  }

  # Hide the processed arguments from awk so that none of them is
  # opened as an input file; with ARGC == 1 awk reads standard input.
  ARGC = 1

  if (o_file == NULL) o_file = stdout

  # An explicit input file becomes the one and only file operand.
  if (i_file != NULL) {
    ARGV[1] = i_file
    ARGC = 2
  }
}

#
# Main loop
#

NR == 1 {
  # Header line: record which field position belongs to which column
  # name, then (unless -N/--no-header was given) copy the header to
  # the output with every column name prefixed by SOH (\001).

  gsub(/[\001 ]+/,"")			# Remove SOH and blanks

  # Load the column position arrays:
  #   P[name] = position chosen for that column name
  #   N[pos]  = column name, set ONLY for the chosen position
  # The table body looks columns up through N[], so when a name is
  # duplicated (it may happen after a join) exactly one position must
  # keep its N[] entry: the first one by default, the last one with
  # '-l'/'--last'.
  for (p = 1; p <= NF; p++) {
    if (!($p in P)) {
      P[$p] = p; N[p] = $p
    }
    else if (pick_last) {
      # Move the selection from the earlier occurrence to this one, so
      # that only the last duplicate remains selectable.
      delete N[P[$p]]
      P[$p] = p; N[p] = $p
    }
  }

  if (!no_hdr) {
     # The leading SOH must go to the same destination as the rest of
     # the header; emitting it in the same print statement guarantees
     # that even when o_file is not standard output.
     gsub(/\t/,"\t\001")
     print "\001" $0 > o_file
  }
  next
}

# Table body.
#
# Every field is copied to o_file, tab-separated.  Fields that belong
# to a column named on the command line and that look like a calendar
# date are replaced by their Julian day, printed with "%f".  Anything
# that cannot be interpreted as a date is copied unchanged.
{
  for (i=1; i<=NF; i++) {
      if (i > 1) printf("%s", OFS) > o_file

      # Column not selected for conversion: copy verbatim.
      if (target_cols[N[i]] == NULL) {
	 printf("%s", $i) > o_file
	 continue
      }

      # Work on a copy, so that the original field text is still
      # available unmodified if the value turns out not to be a date.
      v = $i

      # Auto-detect input date format.
      if (v ~ /^[0-9]+$/) {
	 #
	 # Computer format: [yy]yymmdd
	 #
	 if (length(v) == 6) v = "20" v
	 y = substr(v,1,4); m = substr(v,5,2); d = substr(v,7,2)
      }  else if (v ~ /-/) {
	 #
	 # ISO format: [yy]yy-mm-dd
	 #
	 split(v, a, "-")
	 y = a[1]; m = a[2]; d = a[3]
      }  else if (v ~ /\//) {
	 #
	 # U.S. format: mm/dd/yyyy
	 #
	 split(v, a, "/")
	 y = a[3]; m = a[1]; d = a[2]
      }  else if (v ~ /\./) {
	 #
	 # European format: dd.mm.yyyy
	 #
	 split(v, a, ".")
	 y = a[3]; m = a[2]; d = a[1]
      }  else {
	 # Unrecognized format or not a date.
	 printf("%s", $i) > o_file
	 continue
      }

      # Force numeric values.  substr() returns plain strings, and awk
      # compares a plain string with a number as STRINGS, which would
      # make e.g. year "0500" test as less than 100.
      y += 0; m += 0; d += 0

      # Not a date (this also catches missing or non-numeric parts,
      # which have become 0): copy the original text.
      if (m < 1 || m > 12 || d < 1 || d > 31) {
	 printf("%s", $i) > o_file
	 continue
      }

      # Two-digit years are taken to be in the 21st century.
      if (y < 100) y += 2000

      # None of the accepted input formats can express a year before
      # year 1, so the era is always "CE".  cal_to_jd() itself decides
      # between the Julian and the Gregorian calendar from the date;
      # passing "BCE" would make it negate the year.  For the
      # non-existent dates 1582-10-05..14 cal_to_jd() reports an error
      # on stderr and returns 1.
      printf("%f", cal_to_jd("CE",y,m,d)) > o_file
  }
  printf("\n") > o_file
}

# ---------------------------------------------------------------------
# cal_to_jd(era, y, m, d [, h, mn, s])
#
# Convert a calendar date to a Julian date
# (Julian day number algorithm adopted from Press et al.)
#
# Arguments:
#   era       "BCE" if 'y' counts years Before the Common Era (BC);
#             any other value (conventionally "CE") means Common Era.
#   y, m, d   year (never 0), month and day of month.
#   h, mn, s  optional time of day; they default to 0 (midnight).
#             Note: callers in this program only pass yyyy/mm/dd.
#
# Dates before 1582-10-15 are interpreted in the Julian calendar,
# later ones in the Gregorian calendar; that choice is made here from
# the date itself and does not depend on 'era'.
#
# Returns the Julian date rounded to five decimals.  On invalid input
# (year 0, or one of the CE dates 1582-10-05..14 that were skipped by
# the Gregorian reform) a message is written to the file named by the
# global 'stderr' and 1 is returned.
#
# The remaining parameters are local variables.
# ---------------------------------------------------------------------
function cal_to_jd(era,y,m,d,h,mn,s,	jy,ja,jm,intgr,gregcal,frac, \
					dayfrac,jd,jd0,ydays) {

   # Make sure args cast to numbers, or rounding errors may occur.
   y += 0; m += 0; d += 0; h += 0; mn += 0; s += 0

   if (y == 0) {
      print "There is no year 0 in the Julian system!" > stderr
      return 1
   }

   # The calendar gap only exists in 1582 CE, not in 1582 BCE.
   if (era != "BCE" && y == 1582 && m == 10 && d > 4 && d < 15) {
      print "The dates 5 through 14 October, 1582, do not exist " \
	    "in the Gregorian system!" > stderr
      return 1
   }

   # Switch to astronomical year numbering, where 1 BCE is year 0,
   # 2 BCE is year -1, and so on.
   if (era == "BCE") y = -y + 1

   # Treat January and February as months 13 and 14 of the previous
   # year, so that the leap day falls at the end of the counted year.
   if (m > 2) {
      jy = y
      jm = m + 1
   }  else {
      jy = y - 1
      jm = m + 13
   }

   # The algorithm needs floor(365.25*jy).  int() truncates toward
   # zero, which is off by one for negative non-integral values, so
   # adjust explicitly.
   ydays = int(365.25*jy)
   if (ydays > 365.25*jy) --ydays

   intgr = int(ydays + int(30.6001*jm) + d + 1720995)

   # check for switch to Gregorian calendar
   gregcal = 15 + 31*(10 + 12*1582)
   if (d + 31*(m + 12*y) >= gregcal) {
      ja = int(0.01*jy)
      intgr += 2 - ja + int(0.25*ja)
   }

   # correct for half-day offset (Julian days start at noon)
   dayfrac = h/24.0 - 0.5
   if (dayfrac < 0.0) {
      dayfrac += 1.0
      --intgr
   }

   # now set the fraction of a day
   frac = dayfrac + (mn + s/60.0)/60.0/24.0

   # round to five decimals of a day (a little under one second)
   jd0 = (intgr + frac)*100000
   jd  = int(jd0)
   if (jd0 - jd > 0.5) ++jd
   return jd/100000
}

# End of program.
