#!/usr/bin/awk -f
#****************************************************************************
#  ##   ##         #####   #####  ##     **       NoSQL RDBMS - cgi2rc      *
#  ###  ##        ####### ####### ##     **        $Revision: 1.1.1.1 $       *
#  #### ##        ###     ##   ## ##     ************************************
#  #######  ####  #####   ##   ## ##     **   Carlo Strozzi (c) 1998-2000   *
#  ####### ######   ##### ## # ## ##     ************************************
#  ## #### ##  ##     ### ##  ### ##     **           Written by            *
#  ##  ### ###### ####### ######  ###### **          Carlo Strozzi          *
#  ##   ##  ####   #####   #### # ###### **     e-mail: carlos@linux.it     *
#****************************************************************************
#   NoSQL RDBMS, Copyright (C) 1998 Carlo Strozzi.                          *
#   This program comes with ABSOLUTELY NO WARRANTY; for details             *
#   refer to the GNU General Public License.                                *
#****************************************************************************
# NOTE: to edit, set ts=8 in 'vi' (or equivalent)
#       to print, pipe through 'pr -t -e8'
#****************************************************************************
#
# Translates special characters in CGI environment variables into their ISO
# entities. Alternatively, encoding can be done as per RFC 1738
# (URI-encoding). Prints the new assignments to STDOUT in a (safe) format,
# suitable for CGI programs written in the Rc shell.
#
# Usage:  cgi2rc
# 
# Note: options must be passed through the environment variable _awk_args
#
# Options:
#     -p|--prefix P
#           Prefix each default output variable name with string 'P'.
# 
#     -m|--match R
#           Only do those variables that match the given regular expression.
#           R must be a valid awk(1) pattern, without surrounding slashes.
#
#     -u|--uri [uri_prefix]
#           By default, only the NoSQL special characters \t and \n, plus
#           single-quotes (that may be harmful to the shell) and a small
#           subset of other characters are ISO encoded. With this option,
#           a much wider set of characters is encoded, and in this case
#           encoding is done according to RFC 1738 instead. The full set of
#           characters that are URI-encoded if '-u' is specified is :
#
#           % # & ? + SPACE = " \t ' \n \r $ ( ) [ ] ` < > \ / . | ; ! *
#           { } : ~
#
#           Note how '%' must be escaped first, and SPACE must be done
#           after '+'.
#           This option is mainly for sending data back to the client inside
#           a URL string.
#
#     -U|--also-uri [uri_prefix]
#           Same as '-u' but does also the default encoding.
# 
#     -d|--delete R
#           Delete anything that matches the regular expression R in variable
#           values. R must be a valid awk(1) pattern, without surrounding
#           slashes. If both '-d' and '-b' are specified, they can affect
#           each other, in that '-d' is done before '-b'.
#
#     -b|--blank R
#           Anything that matches the regular expression R in variable
#           values is replaced with one single blank. R must be a valid
#           awk(1) pattern, without surrounding slashes. If both '-d' and
#           '-b' are specified, they can affect each other, in that '-d'
#           is done before '-b'.
#
#     -r|--rewind
#           Before applying the forward-escaping mechanism, do the
#           inverse transformation. This is useful to re-process
#           data that are already escaped in the input values, not
#           to escape it twice.
#
#           Note: '-d', '-b' and '-r' do not apply to the 'uri' mode.
#
#     -c|--cookie [cookie_prefix]
#           Beside encoding, also parse the HTTP_COOKIE header and
#	    print the corresponding shell assignments.
#
#     -x|--debug
#           Print all the regular expressions given with command line
#           arguments to STDERR.
#
#
# Encode special characters in environment variables. Prints the results to
# STDOUT in the form of VARIABLE='encoded_value' pairs, suitable for being
# reused by the invoking shell to make new assignments.
#
# This operator reads the environment and prints the encoded assignments
# to STDOUT.
# 
# An example of usage of this operator from within an Rc shell script is :
# 
#	    _awk_args='-m ^WWW_' ; eval `cgi2rc
#
# Note how the command should be enclosed in double quotes, to
# Preserve any newlines in variable values.
#
# The program operates on environment variables rather than on STDIN,
# because this makes it much easier to handle variables containing embedded
# newlines (\n), i.e. multi-line input records.
#
# If no variables match the given pattern, then the program prints the
# pseudo-noop shell instruction { } , to make sure that a non-empty
# list is returned to the calling program.
#
# Environment variable names that do not match the regular expression
# /^[A-Za-z0-9_]+$/ are skipped, and a warning message is printed
# to STDERR.
# 
########################################################################

BEGIN {
  # The whole program runs inside BEGIN: it reads options from the
  # environment variable _awk_args, walks ENVIRON, and prints a single
  # rc(1) list "{ NAME='value'; ... }" to STDOUT. Nothing is read from
  # STDIN.
  #
  # Portability notes on the gsub() calls below:
  #   * Patterns containing regular-expression metacharacters are written
  #     as regex constants (/\+/, /\./, ...). Spelling them as string
  #     literals ("\+", "\.") relies on escape sequences that POSIX awk
  #     leaves undefined; implementations that strip the backslash end up
  #     with a different (or invalid) regular expression.
  #   * A literal '&' in a replacement string is always written "\\&",
  #     because an unescaped '&' stands for the matched text.

  NULL = ""; do_default = 1; printf("{")

  # Use environment, as args may contain backslash-escapes.
  split( ENVIRON["_awk_args"], args, " " )

  # 'i' starts out unset (0), so the first element examined is args[1].
  # Options taking a value consume the following element.
  while ( args[++i] != NULL )
  {
    if ( args[i] == "-m" || args[i] == "--match" ) m_pattern = args[++i]
    else if ( args[i] == "-p" || args[i] == "--prefix" ) prefix = args[++i]
    else if ( args[i] == "-r" || args[i] == "--rewind" ) rewind = 1
    else if ( args[i] == "-u" || args[i] == "--uri" )
    {
      do_uri = 1
      # The prefix is optional: take it only if the next word is not
      # another option.
      if ( args[i+1] !~ /^-/ ) uri_prefix = args[++i]
      # This is for backward compatibility.
      do_default = 0
    }
    else if ( args[i] == "-U" || args[i] == "--also-uri" )
    {
      do_uri = 1
      if ( args[i+1] !~ /^-/ ) uri_prefix = args[++i]
    }
    else if ( args[i] == "-d" || args[i] == "--delete" )
    {
      remove = 1; d_pattern = args[++i]
    }
    else if ( args[i] == "-b" || args[i] == "--blank" )
    {
      blank = 1; b_pattern = args[++i]
    }
    else if ( args[i] == "-x" || args[i] == "--debug" ) debug = 1
    else if ( args[i] == "-c" || args[i] == "--cookie" )
    {
      do_cookie = 1
      if ( args[i+1] !~ /^-/ ) c_prefix = args[++i]
    }
  }

  if ( m_pattern == NULL ) m_pattern = ".*"
  if ( prefix != NULL ) do_default = 1           # For backward compat.
  if ( uri_prefix == NULL ) uri_prefix = prefix

  if ( debug )
  {
    print "-m pattern: " m_pattern    > "/dev/stderr"
    print "-d pattern: " d_pattern    > "/dev/stderr"
    print "-b pattern: " b_pattern    > "/dev/stderr"
  }

  for ( env in ENVIRON )
  {
    if ( env !~ m_pattern ) continue

    # Always skip rc(1) functions, options and NoSQL-specific stuff.
    if ( env ~ /^fn_/ || env ~ /^_nosql_/ || env == "_awk_args" ) continue

    if ( env == "HTTP_COOKIE" ) continue	# Cookies handled separately.

    # Skip invalid variable names. They may occur when we receive
    # them from a WWW Browser.

    if ( env !~ /^[A-Za-z0-9_]+$/ )
    {
      print "cgi2rc: bad variable name " env > "/dev/stderr"
      continue
    }

    var = ENVIRON[ env ]

    if ( do_uri )
    {
      # Encode a subset of RFC 1738 (partial encoding).

      # Order matters here: '%' first (it introduces every escape),
      # then '+', and only then SPACE, which is itself turned into '+'.
      # Note that a run of consecutive blanks becomes one single '+'.
      gsub(/%/, "%25", var)
      gsub(/\+/, "%2B", var)
      gsub(/  */, "+", var)

      # Order does not matter here.
      gsub("\t", "%09", var)
      gsub("\n", "%0A", var)
      gsub("'", "%27", var)
      gsub("#", "%23", var)
      gsub("&", "%26", var)
      gsub(/\?/, "%3F", var)
      gsub("=", "%3D", var)
      gsub("\"", "%22", var)
      gsub("\r", "%0D", var)
      gsub(/\$/, "%24", var)
      gsub(/\(/, "%28", var)
      gsub(/\)/, "%29", var)
      gsub(/\[/, "%5B", var)
      gsub(/\]/, "%5D", var)
      gsub("`", "%60", var)
      gsub("<", "%3C", var)
      gsub(">", "%3E", var)
      gsub(/\\/, "%5C", var)
      gsub("/", "%2F", var)
      gsub(/\./, "%2E", var)
      gsub(/\|/, "%7C", var)
      gsub(";", "%3B", var)
      gsub("!", "%21", var)
      gsub(/\*/, "%2A", var)
      gsub(/\{/, "%7B", var)
      gsub(/\}/, "%7D", var)
      gsub(":", "%3A", var)
      gsub("~", "%7E", var)

      # Add more encodings here, if necessary.

      # Print the new assignment.
      printf(" %s%s='%s';", uri_prefix,  env, var)

      # Restore original value of var, so that the default encoding
      # below (if enabled) starts from the raw input again.
      var = ENVIRON[ env ]
    }

    if ( do_default )
    {
      # Honour '-d' and '-b' first.
      if ( remove ) gsub( d_pattern, NULL, var )
      if ( blank ) gsub( b_pattern, " ", var )

      # The set of characters un-escaped here must be kept synchronized
      # with the one in 'envto*', and they must be in the reverse
      # order with respect to the ones escaped below.

      if ( rewind ) {
        gsub( "&#9;", "\t", var )                # tab
        gsub( "&#10;", "\n", var )               # newline
        gsub( "&#39;", "'", var )                # single quote
        gsub( "&#96;", "`", var )                # backtick
        gsub( "&#34;", "\"", var )               # double quote
        gsub( "&#62;", ">", var )                # Close tag
        gsub( "&#60;", "<", var )                # Open tag
        gsub( "&#35;", "#", var )                # Hash mark
        gsub( "&amp;", "\\&", var )              # Ampersand
      }

      # Escape HTML special characters in input data.
      # Warning: '&' must be escaped first, then '#'. They need to
      # be escaped to prevent the data from containing statements
      # (both numeric and literal) that could be parsed by a Web server
      # as valid SSI calls, or ampersand-escaped sequences that would
      # act as formatting instructions to the Web browser.

      gsub( "&", "\\&amp;", var )               # Ampersand
      gsub( "#", "\\&#35;", var )               # Hash mark
      gsub( "<", "\\&#60;", var )               # Open tag
      gsub( ">", "\\&#62;", var )               # Close tag
      gsub( "\"", "\\&#34;", var )              # double quote
      gsub( "'", "\\&#39;", var )               # single quote
      gsub( "`", "\\&#96;", var )               # backtick
      gsub( "\n", "\\&#10;", var )              # newline
      gsub( "\t", "\\&#9;", var )               # tab

      # Print the new assignment.
      printf(" %s%s='%s';", prefix,  env, var)
    }
  }

  if ( do_cookie && ENVIRON["HTTP_COOKIE"] != NULL ) {

    n_cookies = split( ENVIRON["HTTP_COOKIE"], c, "; " )	# Split cookies.

    for ( j = 1; j <= n_cookies; j++ ) {

      # An empty element (e.g. from a stray separator) carries no
      # cookie; skip it rather than abandoning the remaining ones.
      if ( c[j] == NULL ) continue

      # Split at the FIRST '=' only: the value itself may legitimately
      # contain further '=' characters.
      eq = index( c[j], "=" )
      if ( eq ) {
        c_name = substr( c[j], 1, eq - 1 )
        c_value = substr( c[j], eq + 1 )
      }
      else {
        c_name = c[j]
        c_value = NULL
      }

      # Dashes are not valid in shell variable names.
      gsub( "-", "_", c_name )
      if ( c_name !~ /^[A-Za-z0-9_]+$/ ) {
        print "cgi2rc: bad cookie name " c_name > "/dev/stderr"
        continue
      }

      # Decode a subset of RFC 1738 (partial decoding).

      # Order does not matter here.
      gsub("%09", "\t", c_value)
      gsub("%0A", "\n", c_value)
      gsub("%27", "'", c_value)
      gsub("%23", "#", c_value)
      gsub("%26", "\\&", c_value)     # "\\&" = literal '&', not the match
      gsub("%3F", "?", c_value)
      gsub("%3D", "=", c_value)
      gsub("%22", "\"", c_value)
      gsub("%0D", "\r", c_value)
      gsub("%24", "$", c_value)
      gsub("%28", "(", c_value)
      gsub("%29", ")", c_value)
      gsub("%5B", "[", c_value)
      gsub("%5D", "]", c_value)
      gsub("%60", "`", c_value)
      gsub("%3C", "<", c_value)
      gsub("%3E", ">", c_value)
      gsub("%5C", "\\", c_value)
      gsub("%2F", "/", c_value)
      gsub("%2E", ".", c_value)
      gsub("%7C", "|", c_value)
      gsub("%3B", ";", c_value)
      gsub("%21", "!", c_value)
      gsub("%2A", "*", c_value)
      gsub("%7B", "{", c_value)
      gsub("%7D", "}", c_value)
      gsub("%3A", ":", c_value)
      gsub("%7E", "~", c_value)

      # Order matters here: '+' means SPACE and must be decoded before
      # '%2B' produces real plus signs; '%25' goes last so that a
      # decoded '%' cannot start a new escape sequence.
      gsub(/\+/, " ", c_value)
      gsub("%2B", "+", c_value)
      gsub("%25", "%", c_value)

      # Single-quotes must be doubled for Rc.
      gsub("'", "''", c_value)

      printf(" %s%s='%s';", c_prefix,  c_name, c_value)
    }
  }

  # Close the shell {list} statement.
  printf("}\n")
}

########################################################################
# End of program.
########################################################################

