#!/usr/bin/perl
# *********************************************************************
# Original code: search,v 2.11 1994/03/14 15:10:02 hobbs
#
# Adapted to NoSQL by Carlo Strozzi
#
# searchtable: fast-search of an indexed NoSQL table.
# Copyright (c) 1998,2006 Carlo Strozzi
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.#
#
# *********************************************************************
# $Id: searchtable,v 1.3 2006/03/10 11:26:13 carlo Exp $

# Get local settings and set defaults.

$NOSQL_INSTALL = $ENV{'NOSQL_INSTALL'};
$NOSQL_INSTALL = "/usr/local/nosql" if not $NOSQL_INSTALL;

$0 =~ s-.*/-- ;
$ACLIM = 31 ;   # max nr of access attempts (safety valve).
while ( $ARGV[0] =~ /^-/ ){             # Get args
    $_ = shift ;
    if( /-I.*/ || /^--index$/ ){ $INDX++ ; $NHCM++ ; next ; }
    if( /-nc.*/ ){ $NHCM++ ; next ; }
    if( /-N.*/ || /^--no-header$/ ){ $NHDR++ ; $NHCM++ ; next ; }
    if( /-p.*/ || /^--partial$/ ){ $PART++ ; next ; }
    if( /-s.*/ || /^--first-match$/ ){ $SGL++ ; next ; }
    if( /-v.*/ || /^--reverse$/ ){ $REVO++ ; next ; }
    if( /-t.*/ || /^--test$/ ){ $VOM++ ; next ; }
    if( /-x.*/ || /^--debug$/ ){ $XBUG++ ; next ; }
    if( /-S(\d*)/ || /^--soundex=?(\d*)/ ){
	if ( $1 ) { $SOUNDEX = $1; }
	else { $SOUNDEX = 4; }			# default code length
	next;
    }
    if( /-h.*/ || /^--help$/ ){
	$HelpInfo = `grep -v '^#' $NOSQL_INSTALL/help/searchtable.txt`;
	print $HelpInfo ;
	exit 1;
    }
    if( /^--show-copying.*/ ){
      system "cat $NOSQL_INSTALL/doc/COPYING" ;
      exit 1 ;
    }
    if( /^--show-warranty.*/ ){
      system "cat $NOSQL_INSTALL/doc/WARRANTY" ;
      exit 1 ;
    }
    die "\n$0: unknown option: $_\n" ; 
}
die "\n$0: no file name given.\n", "For help type \"$0 --help\".\n"
    unless @ARGV ;

$PART = 0 if $SOUNDEX;			# '-S' overrides '-p'.

$intbl = shift ;
if( $INDX ){
    if( @ARGV ){
    $mtbl = shift ; }
    else{
    ($base = $intbl) =~ s/\._x\..*$// ;
    # ($base = $intbl) =~ s/\.rdb\._x\..*$// ;
    # $mtbl = "$base.rdb" ; }
    $mtbl = "$base" ; }
    open( MT, $mtbl ) || die "$0: can't open input: $mtbl\n" ;
    while( <MT> ){
    print unless $NHDR || $VOM ;
    #next if /^\s*#/ ;   # comment line
    #next unless $second++ ;	# Column defs.
    last ; }
}
open( RR, $intbl ) || die "$0: can't open input: $intbl\n" ;
while( <RR> ){                      # read rdbtbl header
    #if( /^\s*#/ ){      # comment 
    #print unless $NHCM || $VOM ;
    #next ; }
    print unless $NHDR || $VOM || $INDX ;
    chop ;
    if( ++$lln == 1 ){
    $_ =~ s/[\001 ]+//g;		# remove SOH markers/blanks.
    @CN = split( /\t/, $_ );# col names
    #next ; }
    #@CD = split( /\t/, $_ );    # col definitions
    #for (@CD){
    #s/^\s*\S+/$&/ ;
    #($_) = /(\S+)/ ; }  # keep only 1st word
    last ; } };
$lowz = tell ;          # curr position is starting low position
$hiz = (stat( $intbl ))[7] ;    # end of file is starting hi position
while( <STDIN> ){                   # read keytbl header
    #next if /^\s*#/ ;       # comment 
    if( ++$kln == 1 ){      # column names
    chop ;
    $_ =~ s/[\001 ]+//g;		# remove SOH markers/blanks.
    @K = split( /\t/, $_ );
    for (@K){
        for( $k=$i=0 ; $i < @CN ; $i++ ){
        if( $_ eq $CN[$i] ){
            $k++ ;
            push( @KEY, $i ) ;  # keys for tbl rows
            #$x = ($CD[$i] =~ /N/i ? 1 : 0 ) ;
            #push( @numcmp, $x ) ;
            #warn "$x .. $CD[$i],\n" if $XBUG ;
            last ; }
        }
        die "$0: keytbl name no match: $_\n" unless $k ;
    }
    #next ; }			    # skip column definitions
    last ; } };
while( <STDIN> ){                   # read keytbl data
    $arg = $_ ;
    chop ;
    @kt = split( /\t/, $_ );

    # Turn value into the corresponding soundex code, if requested.
    if ( $SOUNDEX ) { foreach (@kt){ $_ = SoundEx($_,$SOUNDEX,1); } };

    @spos = () ;    # sort pos in main tbl
    &do_bin ;
    if( $INDX ){
    $x = @spos ;
    warn "Nr Hits: $x\n" if ($x && $XBUG) ;
    for (@spos){
        seek( MT, $_, 0 ) ;
        $_ = <MT> ;
        print unless $VOM ; }
    }
}
exit $errval ;

sub do_bin {                    # do the binary search
    $low = $lowz ;
    $hi = $hiz ;
    $uplim = $cnt = $pmid = $ppmid = $close = $multimode = 0 ;
    while( 1 ){
    $mid = ($hi + $low) / 2.0 ; # next search point
    $cnt++ ;
    seek( RR, $mid, 0 ) ;
    <RR> ;      # get to end of line
    $amid = tell ;  # actual read point of next row
    if( $amid == $pmid || $amid == $ppmid ){
        &do_close ;
        last ; }
    $ppmid = $pmid ;    # prior previous point
    $pmid  = $amid ;    # previous point
    if( $amid >= $hiz ){    # high end of tbl
        &do_close ;     # special case
        last ; }
    $_ = <RR> ; # read complete row
    chop ;
    @a = split( /\t/, $_ );
    &x_info if $XBUG ;
    $phi = $hi ;        # previous hi
    if( $multimode ){   # in multi arg search process
        if( &cmp_key == 0 ){
        $hi = $mid ; }
        else{
        $low = $mid ; }
    }
    else{           # no match yet
        if( ($cv = &cmp_key) == 0 ){
        if( $SGL ){ # single row key match request
            if( $INDX ){
            push( @spos, $a[$#a] ) ; }
            else{
            unless( $VOM ){
                print $_, "\n" ; } }
            #else{
            #    warn "ok\n" ; } }
            last ; }
        else{       # multi row key match request
            $multimode++ ;
            $uplim = $amid ;
            $hi = $mid ; } }
        else{
        if( $cv < 0 ){
            $hi = $mid ; }
        else{
            $low = $mid ; } }
    }
    if( $cnt >= $ACLIM ){   # safety valve, if tbl not sorted, or ...
        warn "Access limit: $arg\n" ;
        return ; }
    }
}
sub do_close {      # find all match rows, in order, starting at $low
    local( $hit ) = 0 ;
    warn "CLOSE...\t($amid)\n" if $XBUG ;
    $uplim = $phi unless $uplim ;
    if( ( $low - $lowz) < ($hi - $low) ){ # close to init low point
    $low = $amid = $lowz ; }
    seek( RR, $low, 0 ) ;
    <RR> unless $low == $lowz ; # special: the first row
    $amid = tell ;
    while( <RR> ){
    last if $amid > $uplim ;# upper limit of search (for 1st match)
    chop ;
    @a = split( /\t/, $_ );
    &x_info if $XBUG ;
    unless( &cmp_key == 0 ){
        $amid = tell ;
        next ; }
    $hit++ ;
    if( $INDX ){
        push( @spos, $a[$#a] ) ; }
    else{
        unless( $VOM ){
        print $_, "\n" ; } }
        #else{
        #warn "ok\n" ; } }
    $amid = tell ;
    last if $SGL || $VOM ;  # stop if only single match wanted or VOM
    while( <RR> ){
        chop ;
        @a = split( /\t/, $_ );
        return unless &cmp_key == 0 ;
        &x_info if $XBUG ;
        if( $INDX ){
        push( @spos, $a[$#a] ) ; }
        else{
        print $_, "\n" ; }
        $amid = tell ; }
    }
    &no_find unless $hit ;
}
sub cmp_key {   # cmp key cols in @kt, @a. Return -1, 0, 1 as appropriate.
    local( $less, $greater ) = ( -1, 1 ) ;
    if( $REVO ){ $less = 1, $greater = -1 ; }   # if reverse sort order
    for( $i=0 ; $i < @KEY ; $i++ ){
    $k = $KEY[$i] ;
    #if( $numcmp[$i] ){      # numeric comparsion
    #    if( $kt[$i] < $a[$k] ){
    #    return $less ; }
    #    if( $kt[$i] > $a[$k] ){
    #    return $greater ; }
    #}
    #else{               # string comparson
        print STDERR "\t($a[$k])  " if $XBUG ;
        next if $PART && (substr($a[$k],0,length($kt[$i])) eq $kt[$i]);

	if( $kt[$i] lt $a[$k] ){
		warn "<<<<<\n" if $XBUG ;
		return $less ;
	}
	if( $kt[$i] gt $a[$k] ){
		warn ">>>>>\n" if $XBUG ;
		return $greater ;
	}
    }
    warn "MATCH\n" if $XBUG ;
    0 ;
}
sub x_info {                    # print debug info to STDERR
    printf STDERR "%2d %8.1f %8.1f (%8.1f) %8.1f %s\n",
    $cnt, $low, $mid, $amid, $hi, $a[$k] ;
}
sub no_find {
    $errval++ ;
    warn "Not found: $arg" if $XBUG ;
    #warn "\n" unless $VOM ;
}

# The following SoundEx function is:
# 
#    (C) Copyright 2002 - 2004, Creativyst, Inc.
#               ALL RIGHTS RESERVED
#
# Modifications for NoSQL by Carlo Strozzi <carlos@linux.it>
# The changes are distributed under the same terms as the original
# code.
# 
# For more information go to:
#           http://www.Creativyst.com
# or email:
#           Support@Creativyst.com
# 
# Redistribution and use in source and binary 
# forms, with or without modification, are 
# permitted provided that the following conditions 
# are met: 
# 
#   1. Redistributions of source code must 
#      retain the above copyright notice, this 
#      list of conditions and the following 
#      disclaimer. 
# 
#   2. Redistributions in binary form must 
#      reproduce the above copyright notice, 
#      this list of conditions and the 
#      following disclaimer in the 
#      documentation and/or other materials 
#      provided with the distribution. 
# 
#   3. All advertising materials mentioning 
#      features or use of this software must 
#      display the following acknowledgement: 
#      This product includes software developed 
#      by Creativyst, Inc. 
# 
#   4. The name of Creativyst, Inc. may not be 
#      used to endorse or promote products 
#      derived from this software without 
#      specific prior written permission. 
# 
# THIS SOFTWARE IS PROVIDED BY CREATIVYST CORPORATION
# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
# THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 
# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#
#
sub SoundEx
{
    my($WordString, $LengthOption, $CensusOption) = @_;
    my($WordStr, $CurChar, $LastChar, $SoundExLen);
    my($WSLen, $FirstLetter, $TmpStr);

    # Enforce compliancy, if requested.
    #if($CensusOption) { $LengthOption = 4;}

    if($LengthOption) {
	$SoundExLen = $LengthOption;
    }

    if($SoundExLen < 4 || $SoundExLen > 10) {
	$SoundExLen = 4;
    }

    if(!$WordString) {
	 return("");
    }

    $WordString = uc($WordString);
    # Clean and tidy
    #
    $WordStr = $WordString;

    # replace non-chars with space, unless standard compliancy was requested

    if($CensusOption) { $WordStr =~ s/[^A-Z]//sig; }
    else { $WordStr =~ s/[^A-Z]/ /sig; }

    $WordStr =~ s/^\s//sg;        # remove leading space
    $WordStr =~ s/\s$//sg;        # remove trailing space

    if(!$CensusOption) {
        # Some of our own improvements
        #
        $WordStr =~ s/DG/G/sg;          # Change DG to G
        $WordStr =~ s/GH/H/sg;          # Change GH to H
        $WordStr =~ s/KN/N/sg;          # Change KN to N
        $WordStr =~ s/GN/N/sg;          # Change GN to N
        $WordStr =~ s/MB/M/sg;          # Change MB to M
        $WordStr =~ s/PH/F/sg;          # Change PH to F
        $WordStr =~ s/TCH/CH/sg;        # Change TCH to CH
        $WordStr =~ s/MP([STZ])/M$1/sg; # MP if follwd by S|T|Z
        $WordStr =~ s/^PS/S/sg;         # Change leading PS to S
        $WordStr =~ s/^PF/F/sg;         # Change leading PF to F
    }

    # Done here because the
    # above improvements could
    # change this first letter
    #
    $FirstLetter = substr($WordStr,0,1);

    if ($CensusOption) {
	 $LastChar = $FirstLetter;
	 $LastChar =~ s/[AEIOUYHW]/0/;
	 $LastChar =~ s/[BPFV]/1/;
	 $LastChar =~ s/[CSGJKQXZ]/2/;
	 $LastChar =~ s/[DT]/3/;
	 $LastChar =~ s/L/4/;
	 $LastChar =~ s/[MN]/5/;
	 $LastChar =~ s/R/6/;
    }
    else { $LastChar = ""; }

    # in case 1st letter is
    # an H or W and we're in
    # CensusOption = 1
    #
    $TmpStr = substr($WordStr,1);
    $WordStr = "-$TmpStr";

    # In properly done census
    # SoundEx: the H and W will
    # be squeezed out before
    # performing the test for
    # adjacent digits
    # (this differs from how
    # the 'real' vowels are
    # handled)
    #
    if($CensusOption == 1) {
        $WordStr =~ s/[HW]/\./sg;
    }

    # Begin Classic SoundEx
    #
    $WordStr =~ s/[AEIOUYHW]/0/sg;
    $WordStr =~ s/[BPFV]/1/sg;
    $WordStr =~ s/[CSGJKQXZ]/2/sg;
    $WordStr =~ s/[DT]/3/sg;
    $WordStr =~ s/L/4/sg;
    $WordStr =~ s/[MN]/5/sg;
    $WordStr =~ s/R/6/sg;

    # Properly done census:
    # squeeze H and W out
    # before doing adjacent
    # digit removal.
    #
    if($CensusOption == 1) {
        $WordStr =~ s/\.//sg;
    }

    # Remove extra equal adjacent digits
    #
    $WSLen = length($WordStr);

    $TmpStr = "-";    # replace skipped first character

    for($i = 1; $i < $WSLen;$i++) {
        $CurChar = substr($WordStr,$i,1);
        if($CurChar eq $LastChar) {
            $TmpStr .= " ";
        }
        else {
            $TmpStr .= $CurChar;
            $LastChar = $CurChar;
        }
    }
    $WordStr = $TmpStr;

    $WordStr = substr($WordStr,1);      # Drop first letter code
    $WordStr =~ s/\s//sg;               # remove spaces
    $WordStr =~ s/0//sg;                # remove zeros
    $WordStr .= "0000000000";           # pad with zeros on right

    $WordStr = "$FirstLetter$WordStr";  # Add first letter of word

    $WordStr = substr($WordStr,0,$SoundExLen);  # size to taste

    return($WordStr);
}

# End of program.
