#!/usr/bin/mawk -We
# *********************************************************************
# soundex: classifies input values into categories, using Knuth's
# soundex codes, useful for building a table secondary index.
#
# Copyright (c) 2001,2006 Carlo Strozzi
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# *********************************************************************
# $Id: soundex,v 1.4 2006/03/10 11:26:13 carlo Exp $

BEGIN {
  # Start-up: set the field separators and the default name of the
  # generated column, read the NoSQL environment settings, then parse
  # the command line. Every argument that is not a recognised option
  # is taken as the name of a column to be soundexed.

  NULL = ""; FS = OFS = "\t"; id = "soundex"

  # Get local settings.
  nosql_install = ENVIRON["NOSQL_INSTALL"]
  stdout = ENVIRON["NOSQL_STDOUT"]
  stderr = ENVIRON["NOSQL_STDERR"]

  # Set default values if necessary.
  if (nosql_install == NULL) nosql_install = "/usr/local/nosql"
  if (stdout == NULL) stdout = "/dev/stdout"
  if (stderr == NULL) stderr = "/dev/stderr"

  # Scan the arguments. The scan stops at the first empty argument.
  while (ARGV[++i] != NULL) {
    if (ARGV[i] == "-l" || ARGV[i] == "--last") pick_last = 1
    else if (ARGV[i] == "-N" || ARGV[i] == "--no-header") no_hdr = 1
    else if (ARGV[i] == "-s" || ARGV[i] == "--soundex-column") id = ARGV[++i]
    else if (ARGV[i] == "-i" || ARGV[i] == "--input") i_file = ARGV[++i]
    else if (ARGV[i] == "-o" || ARGV[i] == "--output") o_file = ARGV[++i]
    else if (sub(/^-S/,NULL,ARGV[i]) || sub(/^--soundex=/,NULL,ARGV[i])) {
       # sub() has stripped the option prefix, so ARGV[i] now holds only
       # the requested code length. The assignment must be parenthesised:
       # '>' binds tighter than '+=', so without the parentheses the
       # expression is read as ARGV[i] += (0 > 3) and the lower bound is
       # never tested.
       if (((ARGV[i] += 0) > 3) && ARGV[i] < 11) sl = ARGV[i]
       else sl = 4				# default Soundex length
    }
    else if (ARGV[i] == "-h" || ARGV[i] == "--help") {
       system("grep -v '^#' " nosql_install "/help/soundex.txt")
       exit(rc=1)
    }
    else if (ARGV[i] == "--show-copying") {
       system("cat " nosql_install "/doc/COPYING")
       exit(rc=1)
    }
    else if (ARGV[i] == "--show-warranty") {
       system("cat " nosql_install "/doc/WARRANTY")
       exit(rc=1)
    }
    else {
       # Anything else is a column name; 'j' counts how many were given.
       j++
       target_cols[ARGV[i]] = ARGV[i]
    }
  }

  # All arguments have been consumed as options, so hide them from awk's
  # own input-file processing.
  ARGC = 1					# Fix argv[]

  if (o_file == NULL) o_file = stdout
  # Re-expose only the explicit input file, if one was requested.
  if (i_file != NULL) { ARGV[1] = i_file; ARGC = 2 }
}

#
# Main loop
#

NR == 1 {

  # Header line: record which column name sits at which position,
  # write the output header (unless suppressed) and settle the default
  # target column.

  gsub(/\001/, "")			# Remove SOH markers

  gsub(/ +/,"")				# trim blanks in names.

  # Name of the first input column, used as the default target below.
  first_col = $1

  # Load the column position array.
  #   P[name]     = position chosen for that column name
  #   N[position] = column name, set only for the chosen positions
  while (++p <= NF) {

    # Unless '-l' was specified, make sure we pick the first occurrence
    # of duplicated column names (it may happen after a join).

    if (pick_last) {
      # With '-l' only the last occurrence may stay mapped. Drop the
      # mapping of any earlier occurrence, otherwise every duplicate
      # would contribute to the soundex key.
      if (P[$p] != NULL) delete N[P[$p]]
      P[$p] = p; N[p] = $p
    }
    else if (P[$p] == NULL) { P[$p] = p; N[p] = $p }
  }

  $0 = id OFS $0			# Insert the Soundex column.
  if (!no_hdr) {
     # Prefix every column name with an SOH marker. The whole header is
     # written with one print so that the leading marker goes to o_file
     # together with the rest of the line.
     gsub(/\t/,"\t\001")
     print "\001" $0 > o_file
  }

  # Set default soundex column if not specified. The name is used rather
  # than N[1], because with '-l' the first position may have been
  # unmapped in favour of a later duplicate.
  if (!j) target_cols[first_col] = first_col

  next
}

# Table body.
{
  # Data row: rebuild the record one field at a time and, in the same
  # pass, concatenate the values of the selected columns into the
  # string to be soundexed. The code is written in front of the row.
  key = line = NULL
  sep = NULL
  for (col = 1; col <= NF; col++) {
      # N[col] is the column name mapped at this position; the field is
      # part of the key only if that name was selected as a target.
      if (target_cols[N[col]] != NULL) key = key $col
      line = line sep $col
      sep = OFS				# separator from the 2nd field on
  }
  print soundex(key, sl) OFS line > o_file
}

#
# Function section.
#

# *********************************************************************
# Soundex algorithm taken from the following URL:
# http://www.archives.gov/research_room/genealogy/census/soundex.html
# 
# Here's another good article on Soundex:
# http://www.creativyst.com/Doc/Articles/SoundEx1/SoundEx1.htm
#
# The theory
#
# The method is based on the six phonetic classifications of human
# speech sounds (bilabial, labiodental, dental, alveolar, velar, and
# glottal), which in turn are based on where you put your lips and
# tongue to make the sounds.
#
# The Soundex Algorithm
#
# Soundex codes begin with the first letter of the surname followed by
# a three-digit code that represents the first three remaining consonants.
# Zeros will be added to names that do not have enough letters to be coded.
# Soundex Coding Guide (Consonants that sound alike have the same code)
#
# 1 - B, F, P, V
# 2 - C, G, J, K, Q, S, X, Z
# 3 - D, T
# 4 - L
# 5 - M, N
# 6 - R
#
# - The letters A,E,I,O,U,Y,H, and W are not coded.
# - Names with adjacent letters having the same equivalent number are coded
#   as one letter with a single number.
# - Surname prefixes are generally not used in the soundex. 
# - Fill any trailing unused positions with zeros, e.g. Lee is L000,
#   Bailey is B400.
# - There is always one letter followed by 3 numbers.
#
# Note that the above consonant grouping/weeding criteria are based on
# English pronunciation. Since the algorithm is based on pronunciation,
# not spelling, European names may not always be soundexed correctly,
# although the resulting code can still be used as a useful form of
# fuzzy string hashing.
#
# Here's Knuth's test set (1973):
#
#  Euler Gauss Hilbert Knuth Lloyd Lukasiewicz
#  E460  G200  H416    K530  L300  L222
#
# Additional test values:  
#
#  Pippo Cpppo Gutierrez Pfister Jackson Tymczak Ashcroft
#  P100  C100  G362      P236    J250    T522    A261
#
#  Washington Lee
#  W252       L000
#
#
# Function arguments:
#
# 'string' is the name to be soundexed
# 'sl' is the soundex code length (default = 4)
# 'ct' is the "census type":
#      zero = normal census
#      non-zero = special census
#
# See http://www.creativyst.com/Doc/Articles/SoundEx1/SoundEx1.htm
# for more info on the difference between the two methods. Basically,
# with "normal" census type, letters H and W are considered the same as
# non-letters, so that 'Ashcroft' codes to A261 because 's' and 'c' are
# collapsed together since they code the same. With the "special" census
# type H and W are considered as separators, thus causing 'Ashcroft'
# to code to A226. Note that the correct method is the "normal" census
# type.
# 
# As far as I know, some implementations consider a sequence of one or
# more blank spaces as a separator, i.e. the same as H and W in the
# "special" census type, but they are very few and most don't, so I think
# the correct behaviour is to completely ignore blanks. The converter 
# form at http://www.creativyst.com/Doc/Articles/SoundEx1/SoundEx1.htm
# does consider blanks in that way, but most other converters available
# online do not, so again, I think the correct behaviour is the latter,
# i.e. consider blanks the same as vowels and non-letters.
#
# If anything is changed in the behaviour of this function, it will
# have to be changed also in its C counterpart in 'keysearch.c'.
# *********************************************************************

function soundex(string,sl,ct,		letters,n,pos,ch,prev_ch,digit,last) {

   # Returns the soundex code of 'string', exactly 'sl' characters long.
   #   string : value to encode; anything but the letters A-Z (after
   #            upper-casing) is discarded
   #   sl     : code length, 4 to 10; any other value means 4
   #   ct     : zero/unset = H and W are ignored completely;
   #            non-zero   = H and W separate equal codes
   # A value with no letters at all encodes as "Z" padded with zeros.

   if (sl > 10 || sl < 4) sl = 4		# Fall back to default length.

   letters = toupper(string)
   gsub(/[^A-Z]/, NULL, letters)		# Letters only.
   if (letters == NULL)
      return substr("Z000000000", 1, sl)	# Nothing to encode.

   # The code starts with the first letter itself; its class is
   # remembered so that a following letter of the same class is dropped.
   string = substr(letters, 1, 1)
   last = getcode(string)
   prev_ch = string

   n = length(letters)
   for (pos = 2; pos <= n; pos++) {

       ch = substr(letters, pos, 1)

       # A letter identical to the one right before it is skipped.
       if (ch == prev_ch) continue
       prev_ch = ch

       # Unless 'ct' is set, H and W are transparent: they neither
       # produce a digit nor reset the remembered class.
       if (!ct && ch ~ /[HW]/) continue

       # Letters with an empty code add nothing to the result but do
       # reset 'last', so the same class may appear again after them.
       digit = getcode(ch)
       if (digit == last) continue
       last = digit
       string = string digit
   }

   # Zero-pad and cut to the requested length.
   return substr(string "000000000", 1, sl)
}

function getcode(c) {

   # Maps a letter to its soundex class. The result is always a string
   # ("1".."6"), or the empty string when 'c' belongs to no class, so
   # callers can concatenate and compare it without numeric/string
   # conversions.

   return (c ~ /[BFPV]/)     ? "1" : \
          (c ~ /[CGJKQSXZ]/) ? "2" : \
          (c ~ /[DT]/)       ? "3" : \
          (c == "L")         ? "4" : \
          (c ~ /[MN]/)       ? "5" : \
          (c == "R")         ? "6" : \
          ""
}

# End of program.
