#!/usr/bin/perl -T
#
# W3C Markup Validation Service
# A CGI script to retrieve and validate a markup file
#
# Copyright (c) 1995-2006 World Wide Web Consortium
# originally written by Gerald Oskoboiny <gerald@w3.org>
# for additional contributors, see http://validator.w3.org/about.html#credits
# and http://dev.w3.org/cvsweb/validator/
#
# This source code is available under the license at:
#     http://www.w3.org/Consortium/Legal/copyright-software
#
# $Id: check,v 1.432.2.22 2006/11/10 02:51:50 ot Exp $

#
# Disable buffering on STDOUT!
$| = 1;

#
# We need Perl 5.6.0+.
use 5.006;

###############################################################################
#### Load modules. ############################################################
###############################################################################

#
# Pragmas.
use strict;
use warnings;

#
# Modules.  See also the BEGIN block further down below.
#
# Version numbers given where we absolutely need a minimum version of a given
# module (gives nicer error messages). By default, add an empty import list
# when loading modules to prevent non-OO or poorly written modules from
# polluting our namespace.
#
use CGI             2.81 qw(
                            -newstyle_urls
                            -private_tempfiles
                            redirect
                           ); # 2.81 for XHTML, and import redirect() function.

use CGI::Carp            qw(carp croak fatalsToBrowser);
use Config::General 2.19 qw(); # Need 2.19 for -AutoLaunder
use File::Spec           qw();
use HTML::Parser    3.25 qw(); # Need 3.25 for $p->ignore_elements.
use HTML::Template  2.6  qw();
use HTTP::Request        qw();
use HTTP::Headers::Auth  qw(); # Needs to be imported after other HTTP::*.
use IO::File             qw();
use IPC::Open3           qw(open3);
use Set::IntSpan         qw();
use Text::Iconv          qw();
use Text::Wrap           qw(wrap);
use URI                  qw();
use URI::Escape          qw(uri_escape);

###############################################################################
#### Constant definitions. ####################################################
###############################################################################

#
# Define global constants
use constant TRUE  => 1;
use constant FALSE => 0;

#
# Tentative Validation Severities.
use constant T_DEBUG =>  1; # 0000 0001
use constant T_INFO  =>  2; # 0000 0010
use constant T_WARN  =>  4; # 0000 0100
use constant T_ERROR =>  8; # 0000 1000
use constant T_FATAL => 16; # 0001 0000
use constant T_FALL  => 32; # 0010 0000, Fallback in effect.

#
# Output flags for error processing
use constant O_SOURCE  => 1; # 0000 0001
use constant O_CHARSET => 2; # 0000 0010
use constant O_DOCTYPE => 4; # 0000 0100
use constant O_NONE    => 8; # 0000 1000

#
# Parse mode tokens.
use constant MODE_SGML => 1; # 0000 0001
use constant MODE_XML  => 2; # 0000 0010
use constant MODE_TBD  => 4; # 0000 0100, needs further info to decide.
#@@FIXME: XML WF checking isn't implemented.
use constant MODE_WF   => 8; # 0000 1000, only XML well-formed checking.
#@@FIXME;

#
# Define global variables.
use vars qw($DEBUG $CFG $RSRC $VERSION);

#
# Things inside BEGIN don't happen on every request in persistent environments
# (such as mod_perl); so let's do the globals, eg. read config, here.
BEGIN {
  # Launder data for -T; -AutoLaunder doesn't catch this one.
  if (exists $ENV{W3C_VALIDATOR_HOME}) {
    $ENV{W3C_VALIDATOR_HOME} =~ /^(.*)$/;
    $ENV{W3C_VALIDATOR_HOME} = $1;
  }

  my $base_path = $ENV{W3C_VALIDATOR_HOME} || '/usr/local/validator';
  #
  # Read Config Files.
  eval {
    my %config_opts = (
       -ConfigFile => ($ENV{W3C_VALIDATOR_CFG} || '/etc/w3c/validator.conf'),
       -MergeDuplicateOptions => TRUE,
       -MergeDuplicateBlocks  => TRUE,
       -SplitPolicy      => 'equalsign',
       -UseApacheInclude => TRUE,
       -IncludeRelative  => TRUE,
       -InterPolateVars  => TRUE,
       -AutoLaunder      => TRUE,
       -AutoTrue         => TRUE,
       -DefaultConfig    => {
          Protocols => {Allow => 'http,https'},
          Paths => {
            Base => $base_path,
            Templates => "$base_path/share/templates",
            SGML => {Parser => '/usr/bin/onsgmls', 
                     Library => "$base_path/htdocs/sgml-lib"},
          },
       },
      );
    my %cfg = Config::General->new(%config_opts)->getall();
    $CFG = \%cfg;
  };
  if ($@) {
    die <<".EOF.";
Could not read configuration.  Set the W3C_VALIDATOR_CFG environment variable
to point to the validator.conf file or copy htdocs/conf/* to /etc/w3c/. Make sure
that the configuration file and all included files are readable by the web
server user. The error was:\n'$@'
.EOF.
  }

  #
  # Check a filesystem path for existance and "readability".
  sub pathcheck (@) {
    my %paths = map { $_ => [-d $_, -r _] } @_;
    my @_d = grep {not $paths{$_}->[0]} keys %paths;
    my @_r = grep {not $paths{$_}->[1]} keys %paths;
    return TRUE if (scalar(@_d) + scalar(@_r) == 0);
    die <<".EOF." if scalar @_d;
Does not exist or is not a directory: @_d
.EOF.
    die <<".EOF." if scalar @_r;
Directory not readable (permission denied): @_r
.EOF.
  }

  #
  # Check paths in config...
  # @@FIXME: This does not do a very good job error-message-wise if a path is
  # @@FIXME: missing...;
  {
    my @dirs = ();
    push @dirs, $CFG->{Paths}->{Base};
    push @dirs, $CFG->{Paths}->{Templates};
    push @dirs, $CFG->{Paths}->{SGML}->{Library};
    &pathcheck(@dirs);
  }

  #
  # Split allowed protocols into a list.
  if (my $allowed = delete($CFG->{Protocols}->{Allow})) {
    $CFG->{Protocols}->{Allow} = [split(/\s*,\s*/, $allowed)];
  }

  #
  # Make sure onsgmls exists and is executable.
  unless (-x $CFG->{Paths}->{SGML}->{Parser}) {
    die qq(Configured SGML Parser "$CFG->{Paths}->{SGML}->{Parser}" not executable!\n);
  }

  { # Make types config indexed by FPI.
    my $_types = {};
    map { $_types->{$CFG->{Types}->{$_}->{PubID}} = $CFG->{Types}->{$_} }
      keys %{$CFG->{Types}};
    $CFG->{Types} = $_types;
  }

  #
  # Change strings to internal constants in MIME type mapping.
  for (keys %{$CFG->{MIME}}) {
    if    ($CFG->{MIME}->{$_} eq 'SGML') { $CFG->{MIME}->{$_} = MODE_SGML }
    elsif ($CFG->{MIME}->{$_} eq 'XML')  { $CFG->{MIME}->{$_} = MODE_XML }
    elsif ($CFG->{MIME}->{$_} eq 'TBD')  { $CFG->{MIME}->{$_} = MODE_TBD }
    else                                 { $CFG->{MIME}->{$_} = MODE_TBD }
  }

  #
  # Set debug flag.
  if ($CFG->{'Allow Debug'} == TRUE) {
    $DEBUG = TRUE if $ENV{W3C_VALIDATOR_DEBUG} || $CFG->{'Enable Debug'};
  } else {
    $DEBUG = FALSE;
  }

  #
  # Strings
  $VERSION =  q$Revision: 1.432.2.22 $;
  $VERSION =~ s/Revision: ([\d\.]+) /$1/;

  #
  # Use passive FTP by default.
  $ENV{FTP_PASSIVE} = 1 unless exists($ENV{FTP_PASSIVE});
} # end of BEGIN block.

#
# Get rid of (possibly insecure) $PATH.
delete $ENV{PATH};

#@@DEBUG: Dump $CFG datastructure. Used only as a developer aid.
#use Data::Dumper qw(Dumper);
#print Dumper($CFG);
#exit;
#@@DEBUG;

###############################################################################
#### Process CGI variables and initialize. ####################################
###############################################################################

#
# Create a new CGI object.
my $q = new CGI;

#
# The data structure that will hold all session data.
my $File;

##############################################
# Populate $File->{Env} -- Session Metadata. #
##############################################

#
# The URL to this CGI Script.
$File->{Env}->{'Self URI'} = $q->url(-query => 0);

#################################
# Initialize the datastructure. #
#################################

#
# Charset data (casing policy: lowercase early).
$File->{Charset}->{Use}      = ''; # The charset used for validation.
$File->{Charset}->{Auto}     = ''; # Autodetection using XML rules (Appendix F)
$File->{Charset}->{HTTP}     = ''; # From HTTP's "charset" parameter.
$File->{Charset}->{META}     = ''; # From HTML's <meta http-equiv>.
$File->{Charset}->{XML}      = ''; # From the XML Declaration.
$File->{Charset}->{Override} = ''; # From CGI/user override.

#
# Misc simple types.
$File->{Mode} = MODE_SGML; # Default parse mode is SGML.

#
# Array (ref) used to store character offsets for the XML report.
$File->{Offsets}->[0] = [0, 0]; # The first item isn't used...

#
# Listrefs.
$File->{Lines}      = []; # Line numbers for encoding errors.
$File->{Warnings}   = []; # Warnings...
$File->{Namespaces} = []; # Other (non-root) Namespaces.

###############################################################################
#### Generate Template for Result. ############################################
###############################################################################

my $lang = 'en_US'; # @@TODO: conneg;

my %cache_opts = ();
if (eval { require Storable }) {
  %cache_opts = (
    file_cache     => TRUE,
    file_cache_dir => File::Spec->catdir(File::Spec->tmpdir(), 'validator'),
  );
}

my $T = HTML::Template->new(
  filename          => File::Spec->catfile($CFG->{Paths}->{Templates},
                                           $lang, 'result.tmpl'),
  die_on_bad_params => FALSE,
  loop_context_vars => TRUE,
  %cache_opts,
);
my $E = HTML::Template->new(
  filename          => File::Spec->catfile($CFG->{Paths}->{Templates},
                                           $lang, 'fatal-error.tmpl'),
  die_on_bad_params => FALSE,
  %cache_opts,
);
my $H = HTML::Template->new(
  filename          => File::Spec->catfile($CFG->{Paths}->{Templates},
                                           $lang, 'http_401_authrequired.tmpl'),
  die_on_bad_params => FALSE,
  %cache_opts,
);

# templates for alternate output formats
my $XMLT = HTML::Template->new(
  filename          => File::Spec->catfile($CFG->{Paths}->{Templates},
                                          $lang, 'xml_output.tmpl'),
  die_on_bad_params => FALSE,
  loop_context_vars => TRUE,
  cache             => TRUE,
);
my $SOAPT = HTML::Template->new(
  filename          => File::Spec->catfile($CFG->{Paths}->{Templates},
                                          $lang, 'soap_output.tmpl'),
  die_on_bad_params => FALSE,
  loop_context_vars => TRUE,
  cache             => TRUE,
);
my $UCNT = HTML::Template->new(
  filename          => File::Spec->catfile($CFG->{Paths}->{Templates},
                                          $lang, 'ucn_output.tmpl'),
  die_on_bad_params => FALSE,
  loop_context_vars => TRUE,
  cache             => TRUE,
);
my $SOAPFT = HTML::Template->new(
  filename          => File::Spec->catfile($CFG->{Paths}->{Templates},
                                          $lang, 'soap_fault.tmpl'),
  die_on_bad_params => FALSE,
  loop_context_vars => TRUE,
  cache             => TRUE,
);
my $SOAPDIS = HTML::Template->new(
  filename          => File::Spec->catfile($CFG->{Paths}->{Templates},
                                          $lang, 'soap_disabled.tmpl'),
  die_on_bad_params => FALSE,
  loop_context_vars => TRUE,
  cache             => TRUE,
);
my $EARLT = HTML::Template->new(
  filename          => File::Spec->catfile($CFG->{Paths}->{Templates},
                                          $lang, 'earl_xml.tmpl'),
  die_on_bad_params => FALSE,
  loop_context_vars => TRUE,
  cache             => TRUE,
  global_vars       => TRUE,
);
my $N3T = HTML::Template->new(
  filename          => File::Spec->catfile($CFG->{Paths}->{Templates},
                                          $lang, 'earl_n3.tmpl'),
  die_on_bad_params => FALSE,
  loop_context_vars => TRUE,
  cache             => TRUE,
  global_vars       => TRUE,
);

$File->{T} = $T;
$File->{S} = $SOAPT;
$File->{E} = $E;
$File->{H} = $H;

# Read friendly error message file
my %rsrc = Config::General->new(
  -MergeDuplicateBlocks => 1,
  -ConfigFile           => File::Spec->catfile($CFG->{Paths}->{Templates},
                                               $lang, 'error_messages.cfg'),
  )->getall();
# Config::General workarounds for <msg 0> issues:
# http://lists.w3.org/Archives/Public/public-qa-dev/2006Feb/0022.html
# http://lists.w3.org/Archives/Public/public-qa-dev/2006Feb/0025.html
# https://rt.cpan.org/Public/Bug/Display.html?id=17852
$rsrc{msg}{0} ||=
  delete($rsrc{'msg 0'}) ||                   # < 2.31
  { original => delete($rsrc{msg}{original}), #   2.31
    verbose  => delete($rsrc{msg}{verbose}),
  };
$RSRC = \%rsrc;

$T->param(cfg_home_page => $CFG->{'Home Page'});
$SOAPT->param(cfg_home_page => $CFG->{'Home Page'});

undef $lang;
undef %cache_opts;

#########################################
# Populate $File->{Opt} -- CGI Options. #
#########################################

#
# Preprocess the CGI parameters.
$q = &prepCGI($File, $q);

#
# Set session switches.
$File->{Opt}->{'Outline'}        = $q->param('outline') ? TRUE                   :  FALSE;
$File->{Opt}->{'Show Source'}    = $q->param('ss')      ? TRUE                   :  FALSE;
$File->{Opt}->{'Show Parsetree'} = $q->param('sp')      ? TRUE                   :  FALSE;
$File->{Opt}->{'No Attributes'}  = $q->param('noatt')   ? TRUE                   :  FALSE;
$File->{Opt}->{'Show ESIS'}      = $q->param('esis')    ? TRUE                   :  FALSE;
$File->{Opt}->{'Show Errors'}    = $q->param('errors')  ? TRUE                   :  FALSE;
$File->{Opt}->{'Verbose'}        = $q->param('verbose') ? TRUE                   :  FALSE;
$File->{Opt}->{'Debug'}          = $q->param('debug')   ? TRUE                   :  FALSE;
$File->{Opt}->{'No200'}          = $q->param('No200')   ? TRUE                   :  FALSE;
$File->{Opt}->{'Charset'}        = $q->param('charset') ? lc $q->param('charset'):     '';
$File->{Opt}->{'DOCTYPE'}        = $q->param('doctype') ? $q->param('doctype')   :     '';
$File->{Opt}->{'Output'}         = $q->param('output')  ? $q->param('output')    : 'html';
$File->{Opt}->{'Max Errors'}     = $q->param('me')      ? $q->param('me')        :     '';

#
# "Fallback" info for Character Encoding (fbc), Content-Type (fbt),
# and DOCTYPE (fbd). If TRUE, the Override values are treated as
# Fallbacks instead of Overrides.
$File->{Opt}->{FB}->{Charset} = $q->param('fbc') ? TRUE : FALSE;
$File->{Opt}->{FB}->{Type}    = $q->param('fbt') ? TRUE : FALSE;
$File->{Opt}->{FB}->{DOCTYPE} = $q->param('fbd') ? TRUE : FALSE;

#
# If ";debug" was given, let it overrule the value from the config file,
# regardless of whether it's "0" or "1" (on or off), but only if config
# allows the debugging options.
if ($CFG->{'Allow Debug'}) {
  $DEBUG = $q->param('debug') if defined $q->param('debug');
  $File->{Opt}->{Verbose} = TRUE if $DEBUG;
} else {
  $DEBUG = FALSE; # The default.
}

&abort_if_error_flagged($File, O_NONE); # Too early to &print_table.

#
# Get the file and metadata.
if ($q->param('uploaded_file')) {
  $File = &handle_file($q, $File);
} elsif ($q->param('fragment')) {
  $File = &handle_frag($q, $File);
} elsif ($q->param('uri')) {
  $File = &handle_uri($q, $File);
}

#
# Abort if an error was flagged during initialization.
&abort_if_error_flagged($File, 0);

#
# Get rid of the CGI object.
undef $q;

#
# We don't need STDIN any more, so get rid of it to avoid getting clobbered
# by Apache::Registry's idiotic interference under mod_perl.
untie *STDIN;

###############################################################################
#### Output validation results. ###############################################
###############################################################################

#
# Find the XML Encoding.
$File = &find_xml_encoding($File);

#
# Decide on a charset to use (first part)
#
if ($File->{Charset}->{HTTP}) { # HTTP, if given, is authoritative.
  $File->{Charset}->{Use} = $File->{Charset}->{HTTP};
} elsif ($File->{ContentType} =~ m(^text/([-.a-zA-Z0-9]\+)?xml$)) {
  # Act as if $http_charset was 'us-ascii'. (MIME rules)
  $File->{Charset}->{Use} = 'us-ascii';

  &add_warning('W01', {
    W01_upload => $File->{'Is Upload'},
    W01_agent  => $File->{Server},
    W01_ct     => $File->{ContentType},
  });

} elsif ($File->{Charset}->{XML}) {
  $File->{Charset}->{Use} = $File->{Charset}->{XML};
} elsif ($File->{Charset}->{Auto} =~ /^utf-16[bl]e$/ && $File->{BOM} == 2) {
  $File->{Charset}->{Use} = 'utf-16';
} elsif ($File->{ContentType} =~ m(^application/([-.a-zA-Z0-9]+\+)?xml$)) {
  $File->{Charset}->{Use} = "utf-8";
} elsif (&is_xml($File) and not $File->{ContentType} =~ m(^text/)) {
  $File->{Charset}->{Use} = 'utf-8'; # UTF-8 (image/svg+xml etc.)
}

$File->{Content} = &normalize_newlines($File->{Bytes},
                       exact_charset($File, $File->{Charset}->{Use}));

#
# Try to extract META charset
# (works only if ascii-based and reasonably clean before <meta>)
$File = &preparse_meta($File); # First call. Repeated later to fetch the FPI.
unless ($File->{Charset}->{Use}) {
  $File->{Charset}->{Use} = $File->{Charset}->{META};
}

#
# Handle any Fallback or Override for the charset.
if (&conflict($File->{Opt}->{Charset}, '(detect automatically)')) {
  # charset=foo was given to the CGI and it wasn't "autodetect".

  #
  # Extract the user-requested charset from CGI param.
  my ($override, undef) = split(/\s/, $File->{Opt}->{Charset}, 2);
  $File->{Charset}->{Override} = lc($override);

  if ($File->{Opt}->{FB}->{Charset}) { # charset fallback mode
    unless ($File->{Charset}->{Use}) {
      &add_warning('W02', {W02_charset => $File->{Charset}->{Override}});

      $File->{Tentative} |= T_ERROR; # Tag it as Invalid.
      $File->{Charset}->{Use} = $File->{Charset}->{Override};
    }
  } else { # charset "hard override" mode
    if (! $File->{Charset}->{Use}) { # overriding "nothing"
      &add_warning('W04', {W04_charset => $File->{Charset}->{Override}, W04_override => TRUE});
      $File->{Tentative} |= T_ERROR;
      $File->{Charset}->{Use} = $File->{Charset}->{Override};
     }
     else { #actually overriding something
      # Warn about Override unless it's the same as the real charset...

      unless ($File->{Charset}->{Override} eq $File->{Charset}->{Use}) {
        &add_warning('W03', {
          W03_use => $File->{Charset}->{Use},
          W03_opt => $File->{Charset}->{Override},
        });

        $File->{Tentative} |= T_ERROR;
        $File->{Charset}->{Use} = $File->{Charset}->{Override};
      }
    }
  }
}

unless ($File->{Charset}->{Use}) { # No charset given...
  &add_warning('W04', {W04_charset => 'UTF-8'});
  $File->{Tentative} |= T_ERROR; # Can never be valid.
  $File->{Charset}->{Use} = 'utf-8';
}

#
# Abort if an error was flagged while finding the encoding.
&abort_if_error_flagged($File, O_CHARSET|O_DOCTYPE);

#
# Check the detected Encoding and transcode.
if (&conflict($File->{Charset}->{Use}, 'utf-8')) {
  $File = &transcode($File);
  &abort_if_error_flagged($File, O_CHARSET);
}

$File = &check_utf8($File); # always check
$File = &byte_error($File);

#
# Abort if an error was flagged during transcoding
&abort_if_error_flagged($File, O_SOURCE|O_CHARSET);

#
# Overall parsing algorithm for documents returned as text/html:
#
# For documents that come to us as text/html,
#
#  1. check if there's a doctype
#  2. if there is a doctype, parse/validate against that DTD
#  3. if no doctype, check for an xmlns= attribute on the first element
#  4. if there is an xmlns= attribute, check for XML well-formedness
#  5. if there is no xmlns= attribute, and no DOCTYPE, punt.
#

#
# Override DOCTYPE if user asked for it.
if ($File->{Opt}->{DOCTYPE}
    and not $File->{Opt}->{DOCTYPE} =~ /(Inline|detect)/i) {
  $File = &override_doctype($File);
}

#
# Try to extract a DOCTYPE or xmlns.
$File = &preparse_doctype($File);

#
# Set parse mode.
if ($File->{DOCTYPE}) {
  my $fpi = $File->{DOCTYPE};
  if (exists $CFG->{Types}->{$fpi}) {
    my $cfg  = $CFG->{Types}->{$fpi};
    my $mode = $cfg->{'Parse Mode'};

    if    ($mode eq 'SGML') { $mode = MODE_SGML }
    elsif ($mode eq 'XML')  { $mode = MODE_XML }
    else                    { $mode = MODE_TBD }

    if ($File->{Mode} == MODE_TBD) {
      if    ($mode == MODE_SGML) { $File->{Mode} = MODE_SGML }
      elsif ($mode == MODE_XML)  { $File->{Mode} = MODE_XML }
      else {
        $File->{Mode} = MODE_SGML;
        &add_warning('W06', {});
      }
    } else {
      unless ($mode == $File->{Mode}) {
        my $dtd = $mode;
        my $ct  = $File->{Mode};
        for ($dtd, $ct) {
          if    ($_ == MODE_SGML) { $_ = 'SGML' }
          elsif ($_ == MODE_XML)  { $_ =  'XML' }
          else                    { $_ = 'SGML' }
        }
        unless ($File->{Mode} == MODE_TBD) {
          &add_warning('W07', {
            W07_mime => $File->{ContentType},
            W07_ct   => $ct,
            W07_dtd  => $dtd,
          });
        }
      }
    }
  } else {
    &add_warning('W08', {W08_mime => $File->{ContentType}})
      if $File->{Mode} == MODE_TBD;
  }
}

#
# Sanity check Charset information and add any warnings necessary.
$File = &charset_conflicts($File);

#
# Abandon all hope ye who enter here...
$File = &parse($File);
sub parse (\$) {
  my $File = shift;

  #
  # By default, use SGML catalog file and SGML Declaration.
  my $catalog  = File::Spec->catfile($CFG->{Paths}->{SGML}->{Library}, 'sgml.soc');
  #
  # Note: if you feel the urge to remove -R from here, please understand that
  # doing so opens a potential security hole.  Don't do that; instead just
  # make sure you're running OpenSP 1.5 or later.
  my @spopt = qw(
                 -R
                 -wvalid
                 -wnon-sgml-char-ref
                 -wno-duplicate
                );

  #
  # Switch to XML semantics if file is XML.
  if (&is_xml($File)) {
    $catalog  = File::Spec->catfile($CFG->{Paths}->{SGML}->{Library}, 'xml.soc');
    push(@spopt, '-wxml');
  }

  #
  # Defaults for SP; turn off fixed charset mode and set encoding to UTF-8.
  $ENV{SP_CHARSET_FIXED} = 'NO';
  $ENV{SP_ENCODING}      = 'UTF-8';
  $ENV{SP_BCTF}          = 'UTF-8';

  #
  # Tell onsgmls about the SGML Library.
  $ENV{SGML_SEARCH_PATH} = $CFG->{Paths}->{SGML}->{Library};

  #
  # Set the command to execute.
  my @cmd = ($CFG->{Paths}->{SGML}->{Parser}, '-n', '-c', $catalog, @spopt);

  #
  # Set debug info for HTML report.
  $T->param(opt_debug => $DEBUG);
  $T->param(debug =>
            [
             {name => 'Command',           value => "@cmd"},
             map({name => $_, value => $ENV{$_}},
               qw(SP_CHARSET_FIXED SP_ENCODING SP_BCTF SGML_SEARCH_PATH
                  no_proxy http_proxy https_proxy ftp_proxy FTP_PASSIVE)),
             {name => 'Content-Encoding',  value => $File->{ContentEnc}},
             {name => 'Transfer-Encoding', value => $File->{TransferEnc}},
            ],
           );

  #@@FIXME: This needs a UI and testing!
  #
  # Set onsgmls' -E switch to the number of errors requested.
  if ($File->{Opt}->{'Max Errors'} =~ m(^all$)i) {
    push @cmd, '-E0';
  } elsif ($File->{Opt}->{'Max Errors'} =~ m(^(\d+)$)) {
    my $numErr = $1;
    if ($numErr >= 200) {
      $numErr = 200;
    } elsif ($numErr <= 0) {
      $numErr = 0; #@@FIXME: Should add feature to supress error output in this case.;
    }
    push @cmd, '-E' . $numErr;
  } else {
    push @cmd, '-E' . ($CFG->{'Max Errors'} || 0); # "-E0" means "all".
  }
  #@@FIXME;

  #
  # Temporary filehandles.
  my $spin  = IO::File->new_tmpfile;
  my $spout = IO::File->new_tmpfile;
  my $sperr = IO::File->new_tmpfile;

  #
  # Dump file to a temp file for parsing.
  for (@{$File->{Content}}) {
    print $spin $_, "\n";
  }

  #
  # seek() to beginning of the file.
  seek $spin, 0, 0;

  #
  # Run it through SP, redirecting output to temporary files.
  my $pid = do {
    no warnings 'once';
    local (*SPIN, *SPOUT, *SPERR)  = ($spin, $spout, $sperr);
    open3("<&SPIN", ">&SPOUT", ">&SPERR", @cmd);
  };
  undef $spin;
  waitpid $pid, 0;

  #
  # Rewind temporary filehandles.
  seek $_, 0, 0 for $spout, $sperr;

  #
  # Proper text mode for Win32 systems
  binmode($spout, ':crlf') if $^O eq "MSWin32";

  $File = &parse_errors($File, $sperr); # Parse error output.
  undef $sperr; # Get rid of no longer needed filehandle.

  $File->{ESIS} = [];
  my $elements_found = 0;
  while (<$spout>) {
    $elements_found++ if /^\(/;

    if (/^Axmlns() \w+ (.*)/ or /^Axmlns:([^ ]+) \w+ (.*)/) {
      if (not $File->{Namespace}) {
        if ($elements_found == 0 and $1 eq "") {
          $File->{Namespace} = $2;
        } else {
          # @@FIXME: should not happen;
          push(@{$File->{Namespaces}}, $2);
        }
      } else {
        push(@{$File->{Namespaces}}, $2) if ($2 ne $File->{Namespace});
      }
    }

    next if / IMPLIED$/ && not $DEBUG;
    next if /^ASDAFORM CDATA /;
    next if /^ASDAPREF CDATA /;
    chomp; # Removes trailing newlines
    push @{$File->{ESIS}}, $_;
  }
  undef $spout;

  if (@{$File->{ESIS}} && $File->{ESIS}->[-1] =~ /^C$/) {
    pop(@{$File->{ESIS}});
    $File->{'Is Valid'} = TRUE;
  } else {
    $File->{'Is Valid'} = FALSE;
  }

  #
  # Set Version to be the FPI initially.
  $File->{Version} = $File->{DOCTYPE};

  #
  # Extract any version attribute from the ESIS.
  for (@{$File->{ESIS}}) {
    no warnings 'uninitialized';
    next unless /^AVERSION CDATA (.*)/i;
    push @{$File->{Version_ESIS}}, $1;
    if ($1 =~ '-//W3C//DTD (SGML|XML) Fallback//EN') {
      $File->{Tentative} |= (T_ERROR | T_FALL);
      my $dtd = $1 eq 'SGML' ? 'HTML 4.01 Transitional' : 'XHTML 1.0 Strict';
      &add_warning('W09', { W09_dtd => $dtd });
    }
  }

  return $File;
}

#
# Force "XML" if type is an XML type and an FPI was not found.
# Otherwise set the type to be the FPI.
if (&is_xml($File) and not $File->{DOCTYPE}) {
  $File->{Version} = 'XML';
} else {
  $File->{Version} = $File->{DOCTYPE} unless $File->{Version};
}

#
# Get the pretty text version of the FPI if a mapping exists.
if (my $prettyver = $CFG->{Types}->{$File->{Version}}->{Display}) {
  $File->{Version} = $prettyver;
}

#
# Warn about unknown, incorrect, or missing Namespaces.
if ($File->{Namespace}) {
  my $ns  = $CFG->{Types}->{$File->{Version}}->{Namespace} || FALSE;

  if (&is_xml($File)) {
    if ($ns eq $File->{Namespace}) {
      &add_warning('W10', {
        W10_ns   => $File->{Namespace},
        W10_type => $File->{Type},
      });
    }
  } else {
    &add_warning('W11', {W11_ns => $File->{Namespace}});
  }
} else {
  if (&is_xml($File) and $CFG->{Types}->{$File->{Version}}->{Namespace}) {
    &add_warning('W12', {});
  }
}

my $template;

if ($File->{Opt}->{Output} eq 'xml') {
  $template = $XMLT;
} elsif ($File->{Opt}->{Output} eq 'earl') {
  $template = $EARLT;
} elsif ($File->{Opt}->{Output} eq 'n3') {
  $template = $N3T;
} elsif ($File->{Opt}->{Output} eq 'ucn') {
  $template = $UCNT;
} elsif ($File->{Opt}->{Output} eq 'soap12') {
  if ($CFG->{'Enable SOAP'} != 1) { # API disabled - ideally this should have been sent before performing validation...
    print CGI::header(-status => 503, -content_language => "en",
          -type => "text/html", -charset => "utf-8"
    );
    $template = $SOAPDIS;
  } elsif ($File->{'Error Flagged'}) {
    $template = $SOAPFT;
  } else {
    $template = $SOAPT;
  }
} else {
    $template = $T;
}

&prep_template($File, $template);
&fin_template($File, $template);

$template->param(file_warnings => $File->{Warnings});
$template->param(file_source => &source($File))
  if $template->param('opt_show_source');
$template->param('opt_show_esis' => TRUE)
  if $File->{Opt}->{'Show ESIS'};
$template->param('opt_show_raw_errors' => TRUE)
  if $File->{Opt}->{'Show Errors'};
$template->param('file_raw_errors' => &show_errors($File))
  if $template->param('opt_show_raw_errors');
  $T->param(file_outline   => &outline($File)) if $T->param('opt_show_outline');
print $template->output;

#
# Get rid of $File object and exit.
undef $File;
exit;

#############################################################################
# Subroutine definitions
#############################################################################

#
sub prep_template ($$) {
  my $File = shift;
  my $T    = shift;

  #
  # XML mode...
  $T->param(is_xml => &is_xml($File));

  #
  # Upload?
  $T->param(is_upload => $File->{'Is Upload'});

  #
  # The URI...
  $T->param(file_uri => $File->{URI});
  $T->param(file_uri_param => uri_escape($File->{URI}));

  #
  # Set URL for page title.
  $T->param(page_title_url => $File->{URI});

  #
  # Metadata...
  $T->param(file_modified    => $File->{Modified});
  $T->param(file_server      => $File->{Server});
  $T->param(file_size        => $File->{Size});
  $T->param(file_contenttype => $File->{ContentType});
  $T->param(file_charset     => $File->{Charset}->{Use});
  $T->param(file_doctype     => $File->{DOCTYPE});

  #
  # Output options...
  $T->param(opt_show_source    => $File->{Opt}->{'Show Source'});
  $T->param(opt_show_outline   => $File->{Opt}->{'Outline'});
  $T->param(opt_show_parsetree => $File->{Opt}->{'Show Parsetree'});
  $T->param(opt_show_noatt     => $File->{Opt}->{'No Attributes'});
  $T->param(opt_verbose        => $File->{Opt}->{'Verbose'});
  $T->param(opt_no200          => $File->{Opt}->{'No200'});

  #
  # Tip of the Day...
  my $tip = &get_tip();
  $T->param(tip_uri  => $tip->[0]);
  $T->param(tip_slug => $tip->[1]);

  #
  # Namespaces...
  $T->param(file_namespace  => $File->{Namespace});
  my %seen_ns = ();
  my @bulk_ns =  @{$File->{Namespaces}};
  $File->{Namespaces} = []; # reinitialize the list of non-root namespaces
  my  $single_namespace;
  # ... and then get a uniq version of it
  foreach $single_namespace (@bulk_ns) {
     push(@{$File->{Namespaces}}, $single_namespace) unless $seen_ns{$single_namespace}++;
  }
  my @nss                   =  map({uri => $_}, @{$File->{Namespaces}});
  $T->param(file_namespaces => \@nss) if @nss;

  if ($File->{Opt}->{DOCTYPE}) {
      my $over_doctype_param = "override doctype $File->{Opt}->{DOCTYPE}";
       $T->param($over_doctype_param => TRUE);
  }

  if ($File->{Opt}->{Charset}) {
      my $over_charset_param = "override charset $File->{Opt}->{Charset}";
       $T->param($over_charset_param => TRUE);
  }
}
sub fin_template ($$) {
  my $File = shift;
  my $T    = shift;

  if (! $File->{Doctype} and ($File->{Version} eq 'unknown' or $File->{Version} eq 'SGML' or (!$File->{Version}))) {

    # @@TODO@@ we should try falling back on other version
    # info, such as the ones stored in Version_ESIS
    $T->param(file_version => '(no Doctype found)');
  }
  else {
    $T->param(file_version => $File->{Version});
  }
  my ($num_errors,$num_warnings, $num_info, $reported_errors) = &report_errors($File);
  if ($num_errors+$num_warnings > 0)
  {
    $T->param(has_errors => 1);
  }
  $T->param(valid_errors_num => $num_errors);
  $num_warnings += scalar @{$File->{Warnings}};
  $T->param(valid_warnings_num => $num_warnings);
  my $number_of_errors = ""; # textual form of $num_errors
  if ($num_errors > 1) {
    $number_of_errors = "$num_errors errors"
  }
  else {
    $number_of_errors = "$num_errors error"
  }
  $T->param(file_errors => $reported_errors);
   $T->param(number_of_errors => $number_of_errors);
  if ($File->{'Is Valid'}) {
    $T->param(VALID => TRUE);
    $T->param(valid_status => 'Valid');
    &report_valid($File, $T);
  } else {
    $T->param(VALID => FALSE);
    $T->param(valid_status => 'Invalid');
  }
}

#
# Output "This page is Valid" report.
sub report_valid {
  my $File = shift;
  my $T    = shift;

  unless ($File->{Version} eq 'unknown' or defined $File->{Tentative}) {

    if (exists $CFG->{Types}->{$File->{DOCTYPE}}->{Badge}) {
      my $cfg = $CFG->{Types}->{$File->{DOCTYPE}};
      $T->param(have_badge => TRUE);
      $T->param(badge_uri  => $cfg->{Badge}->{URI});
      $T->param(badge_alt  => $cfg->{Badge}->{Alt});
      $T->param(badge_h    => $cfg->{Badge}->{Height});
      $T->param(badge_w    => $cfg->{Badge}->{Width});
      $T->param(badge_tagc => ($cfg->{'Parse Mode'} eq 'XML' ? ' /' : ''));
    }
  } elsif (defined $File->{Tentative}) {
    $T->param(is_tentative => TRUE);
  }
  my $thispage = $File->{Env}->{'Self URI'};
  my $escaped_uri = uri_escape($File->{URI});
  $thispage .= qq(?uri=$escaped_uri);
  $thispage .= ';ss=1'      if $File->{Opt}->{'Show Source'};
  $thispage .= ';sp=1'      if $File->{Opt}->{'Show Parsetree'};
  $thispage .= ';noatt=1'   if $File->{Opt}->{'No Attributes'};
  $thispage .= ';outline=1' if $File->{Opt}->{'Outline'};
  $T->param(file_thispage => $thispage);
}

#
# Add a waring message to the output.
sub add_warning ($$) {
  my $WID    = shift;
  my $params = shift;

  $File->{T}->param($WID => TRUE, %{$params});
  $File->{T}->param(have_warnings => TRUE);
  $File->{E}->param($WID => TRUE, %{$params});
  $File->{E}->param(have_warnings => TRUE);
  $File->{S}->param($WID => TRUE, %{$params});
  $File->{S}->param(have_warnings => TRUE);

}

#
# Proxy authentication requests.
# Note: expects the third argument to be a hash ref (see HTTP::Headers::Auth).
sub authenticate {
  my $File       = shift;
  my $resource   = shift;
  my $authHeader = shift || {};

  my $realm = $resource;
  $realm =~ s([^\w\d.-]*){}g;

  for my $scheme (keys(%$authHeader)) {
    my $origrealm = $authHeader->{$scheme}->{realm};
    if (not defined $origrealm or lc($scheme) !~ /^(?:basic|digest)$/) {
      delete($authHeader->{$scheme});
      next;
    }
    $authHeader->{$scheme}->{realm} = "$realm-$origrealm";
  }

  my $headers = HTTP::Headers->new(Connection => 'close');
  $headers->www_authenticate(%$authHeader);
  $headers = $headers->as_string();
  chomp($headers);

  $File->{H}->param(http_401_headers => $headers);
  $File->{H}->param(http_401_url     => $resource);

  print $File->{H}->output;

  exit; # Further interaction will be a new HTTP request.
}

#
# Fetch an URL and return the content and selected meta-info.
sub handle_uri {
  my $q    = shift; # The CGI object.
  my $File = shift; # The master datastructure.

  my $uri = new URI (ref $q ? $q->param('uri') : $q)->canonical();
  $uri->fragment(undef);

  my $ua = new W3C::Validator::UserAgent ($CFG, $File);
  $ua->env_proxy();
  $ua->agent("W3C_Validator/$VERSION");
  $ua->parse_head(0);  # Don't parse the http-equiv stuff.

  $ua->protocols_allowed($CFG->{Protocols}->{Allow} || ['http', 'https']);

  unless ($ua->is_protocol_supported($uri)) {
    $File->{'Error Flagged'} = TRUE;
    if (($uri->canonical() eq "1") )
    #if uri param is empty (also for empty direct or upload), it's been set to TRUE in sub prepCGI()
    {
      $File->{E}->param(fatal_no_content  => TRUE);
    }
    else {
      $File->{E}->param(fatal_uri_error  => TRUE);
      $File->{E}->param(fatal_uri_scheme => $uri->scheme());
    }
    return $File;
  }

  return $File unless $ua->uri_ok($uri);

  my $req = new HTTP::Request(GET => $uri);

  # If we got a Authorization header, the client is back at it after being
  # prompted for a password so we insert the header as is in the request.
  if($ENV{HTTP_AUTHORIZATION}){
    $req->headers->header(Authorization => $ENV{HTTP_AUTHORIZATION});
  }

  my $res = $ua->request($req);

  return $File if $File->{'Error Flagged'}; # Redirect IP rejected?

  unless ($res->code == 200 or $File->{Opt}->{'No200'}) {
    if ($res->code == 401) {
      my %auth = $res->www_authenticate(); # HTTP::Headers::Auth
      &authenticate($File, $res->request->uri, \%auth);
    } else {
      $File->{'Error Flagged'} = TRUE;

      $File->{E}->param(fatal_http_error => TRUE);
      $File->{E}->param(fatal_http_uri   => $uri->as_string);
      $File->{E}->param(fatal_http_code  => $res->code);
      $File->{E}->param(fatal_http_msg   => $res->message);
      $File->{E}->param(fatal_http_dns   => TRUE) if $res->code == 500;
    }
    return $File;
  }

  #
  # Enforce Max Recursion level.
  &check_recursion($File, $res);

  my ($mode, $ct, $charset)
    = &parse_content_type(
                          $File,
                          $res->header('Content-Type'),
                          scalar($res->request->uri),
                         );

  my $lastmod = undef;
  if ( $res->last_modified ) {
    $lastmod = scalar(gmtime($res->last_modified));
  }

  my $content = $res->can('decoded_content') ?
    $res->decoded_content(charset => 'none') : $res->content;

  $File->{Bytes}           = $content;
  $File->{Mode}            = $mode;
  $File->{ContentType}     = $ct;
  $File->{ContentEnc}      = $res->content_encoding;
  $File->{TransferEnc}     = $res->header('Client-Transfer-Encoding');
  $File->{Charset}->{HTTP} = lc $charset;
  $File->{Modified}        = $lastmod;
  $File->{Server}          = scalar $res->server;
  $File->{Size}            = scalar $res->content_length;
  $File->{URI}             = scalar $res->request->uri->canonical;
  $File->{'Is Upload'}     = FALSE;
  $File->{'Direct Input'}  = FALSE;

  return $File;
}

#
# Handle uploaded file and return the content and selected meta-info.
sub handle_file {
  my $q    = shift; # The CGI object.
  my $File = shift; # The master datastructure.

  my $f = $q->param('uploaded_file');
  my $h = $q->uploadInfo($f);
  my $file;

  local $/ = undef; # set line delimiter so that <> reads rest of file
  $file = <$f>;

  my ($mode, $ct, $charset) = &parse_content_type($File, $h->{'Content-Type'});

  $File->{Bytes}           = $file;
  $File->{Mode}            = $mode;
  $File->{ContentType}     = $ct;
  $File->{Charset}->{HTTP} = lc $charset;
  $File->{Modified}        = $q->http('Last-Modified');
  $File->{Server}          = $q->http('User-Agent'); # Fake a "server". :-)
  $File->{Size}            = $q->http('Content-Length');
  $File->{URI}             = "$f"; # Need to stringify because we want ref
                                   # to return false later in add_table.  This
                                   # is also a file handle... see man CGI.
  $File->{'Is Upload'}     = TRUE;
  $File->{'Direct Input'}  = FALSE;

  return $File;
}

#
# Handle uploaded file and return the content and selected meta-info.
sub handle_frag {
  my $q    = shift; # The CGI object.
  my $File = shift; # The master datastructure.

  $File->{Bytes}       = $q->param('fragment');
  $File->{Mode}        = MODE_TBD;
  $File->{Modified}    = '';
  $File->{Server}      = '';
  $File->{Size}        = '';
  $File->{ContentType} = ''; # @@TODO?
  $File->{URI}         = 'upload://Form Submission';
  $File->{'Is Upload'} = TRUE;
  $File->{'Direct Input'} = TRUE;
  $File->{Charset}->{HTTP} = "utf-8"; # by default, the form accepts utf-8 chars

  return $File;
}

#
# Parse a Content-Type and parameters. Return document type and charset.
sub parse_content_type {
  my $File         = shift;
  my $Content_Type = shift;
  my $url          = shift;
  my $charset      = '';
  my $mode         = '';

  my ($ct, @param) = split /\s*;\s*/, lc $Content_Type;
  $ct =~ s/^\s+//; $ct =~ s/\s+$//;

  $mode = $CFG->{MIME}->{$ct} || $ct;

  foreach my $param (@param) {
    my ($p, $v) = split /\s*=\s*/, $param;
    next unless $p =~ m(charset)i;
    if ($v =~ m/([\'\"]?)(\S+)\1/i) {
      $charset = lc $2;
      last;
    }
  }

  if ($mode =~ m(/)) { # a "/" means it's unknown or we'd have a mode here.
    if ($ct =~ m(text/css) and defined $url) {
      print redirect
        'http://jigsaw.w3.org/css-validator/validator?uri='
          . uri_escape $url;
      exit;
    } else {
      $File->{'Error Flagged'} = TRUE;
      $File->{E}->param(fatal_mime_error => TRUE);
      $File->{E}->param(fatal_mime_ct    => $ct);
    }
  }

  return $mode, $ct, $charset;
}

#
# Check recursion level and enforce Max Recursion limit.
sub check_recursion ($$) {
  my $File = shift;
  my $res  = shift;

  # Not looking at our own output.
  return unless defined $res->header('X-W3C-Validator-Recursion');

  my $lvl = $res->header('X-W3C-Validator-Recursion');
  return unless $lvl =~ m(^\d+$); # Non-digit, i.e. garbage, ignore.

  if ($lvl >= $CFG->{'Max Recursion'}) {
    print redirect $CFG->{'Home Page'};
  } else {
    $T->param(depth => $lvl++); # Increase recursion level in output.
  }
}

#
# Normalize newline forms (CRLF/CR/LF) to native newline.
sub normalize_newlines {
  my $file = shift;
  local $_ = shift;  #charset
  my $pattern = '';

  # don't use backreference parentheses!
  $pattern = '\x00\x0D(?:\x00\x0A)?|\x00\x0A' if /^utf-16be$/;
  $pattern = '\x0D\x00(?:\x0A\x00)?|\x0A\x00' if /^utf-16le$/;
  # $pattern = '\x00\x00\x00\x0D(?:\x00\x00\x00\x0A)?|\x00\x00\x00\x0A' if /^UCS-4be$/;
  # $pattern = '\x0D\x00\x00\x00(?:\x0A\x00\x00\x00)?|\x0A\x00\x00\x00' if /^UCS-4le$/;
  # insert other special cases here, such as EBCDIC
  $pattern = '\x0D(?:\x0A)?|\x0A' if !$pattern;    # all other cases

  return [split /$pattern/, $file];
}

#
# find exact charset from general one (utf-16)
#
# needed for per-line conversion and line splitting
# (BE is default, but this will apply only to HTML)
sub exact_charset {
  my $File = shift;
  my $general_charset = shift;
  my $exact_charset = $general_charset;

  if ($general_charset eq 'utf-16') {
    if ($File->{Charset}->{Auto} =~ m/^utf-16[bl]e$/) {
      $exact_charset = $File->{Charset}->{Auto};
    } else { $exact_charset = 'utf-16be'; }
  }
  # add same code for ucs-4 here
  return $exact_charset;
}

#
# Return $_[0] encoded for HTML entities (cribbed from merlyn).
#
# Note that this is used both for HTML and XML escaping.
#
sub ent {
  local $_ = shift;
  return '' unless defined; # Eliminate warnings
  s(["<&>"]){'&#' . ord($&) . ';'}ge;  # should switch to hex sooner or later
  return $_;
}

#
# Truncate source lines for report.
#
# This *really* wants Perl 5.8.0 and it's improved UNICODE support.
# Byte semantics are in effect on all length(), substr(), etc. calls,
# so offsets will be wrong if there are multi-byte sequences prior to
# the column where the error is detected.
#
sub truncate_line {
  my $line  = shift;
  my $col   = shift;

  my $start = $col;
  my $end   = $col;

  for (1..40) {
    $start-- if ($start - 1 >= 0);            # in/de-crement until...
    $end++   if ($end   + 1 <= length $line); # ...we hit end of line.
  }

  unless ($end - $start == 80) {
    if ($start == 0) { # Hit start of line, maybe grab more at end.
      my $diff = 40 - $col;
      for (1..$diff) {
        $end++ if ($end + 1 <= length $line);
      }
    } elsif ($end == length $line) { # Hit end of line, maybe grab more at beginning.
      my $diff = 80 - $col;
      for (1..$diff) {
        $start-- if ($start - 1 >= 0);
      }
    }
  }

  #
  # Add elipsis at end if necessary.
  unless ($end   == length $line) {substr $line, -3, 3, '...'};

  $col = $col - $start; # New offset is diff from $col to $start.
  $line = substr $line, $start, $end - $start; # Truncate.

  #
  # Add elipsis at start if necessary.
  unless ($start == 0)            {substr $line,  0, 3, '...'};

  return $line, $col;
}

#
# Suppress any existing DOCTYPE by commenting it out.
sub override_doctype {
  no strict 'vars';
  my $File = shift;
  my ($dt) =
    grep { $_->{Display} eq $File->{Opt}->{DOCTYPE} } values %{$CFG->{Types}};

  # @@TODO: abort/whine about unrecognized doctype if $dt is undef.;
  my $pubid = $dt->{PubID};
  my $sysid = $dt->{SysID};
  my $name  = $dt->{Name};
  local $dtd = qq(<!DOCTYPE $name PUBLIC "$pubid");
  $dtd .= qq( "$sysid") if $sysid; # We don't have one for all types.
  $dtd .= '>';

  local $org_dtd = '';
  local $HTML    = '';
  local $seen    = FALSE;

  my $declaration = sub {
    $seen = TRUE;

    # No Override if Fallback was requested.
    if ($File->{Opt}->{FB}->{DOCTYPE}) {
      $HTML .= $_[0]; # Stash it as is...
    } else {
      # Comment it out and insert the new one...
      $HTML .= "$dtd\n" . '<!-- ' . $_[0] . ' -->';
      $org_dtd = &ent($_[0]);
    }
  };

  HTML::Parser->new(default_h     => [sub {$HTML .= shift}, 'text'],
                    declaration_h => [$declaration, 'text']
                   )->parse(join "\n", @{$File->{Content}})->eof();

  $File->{Content} = [split /\n/, $HTML];

  if ($seen) {
    unless (($File->{Opt}->{FB}->{DOCTYPE}) or ($File->{Opt}->{DOCTYPE} eq $CFG->{Types}->{$File->{DOCTYPE}}->{Display} )) {
      &add_warning('W13', {
        W13_org => $org_dtd,
        W13_new => $File->{Opt}->{DOCTYPE},
      });
      $File->{Tentative} |= T_ERROR; # Tag it as Invalid.
    }
  } else {
    unshift @{$File->{Content}}, $dtd;

    if ($File->{Opt}->{FB}->{DOCTYPE}) {
      &add_warning('W16', {W16_dtd => $File->{Opt}->{DOCTYPE}});
      $File->{Tentative} |= T_ERROR; # Tag it as Invalid.
    } else {
      &add_warning('W15', {W15_dtd => $File->{Opt}->{DOCTYPE}});
      $File->{Tentative} |= T_ERROR; # Tag it as Invalid.
    }
  }

  return $File;
}

#
# Parse errors reported by SP.
sub parse_errors ($$) {
  my $File = shift;
  my $fh   = shift;

  $File->{Errors} = []; # Initialize to an (empty) anonymous array ref.
  for (<$fh>) {

    # remove SGML Parser path if it contains colons
    s/^\Q$CFG->{Paths}->{SGML}->{Parser}\E// if
         $CFG->{Paths}->{SGML}->{Parser} =~ /:/;

    push @{$File->{DEBUG}->{Errors}}, $_;
    chomp;
    my ($err, @errors);
    next if /^<OSFD>0:[0-9]+:[0-9]+:[^A-Z]/;
    next if /numbers exceeding 65535 not supported/;
    next if /URL Redirected to/;

    my (@_err) = split /:/;
    next unless $_err[1] eq '<OSFD>0'; #@@FIXME: This is a polite fiction!;
    if ($_err[1] =~ m(^<URL>)) {
      @errors = ($_err[0], join(':', $_err[1], $_err[2]), @_err[3..$#_err]);
    } else {
      @errors = @_err;
    }
    $err->{src}  = $errors[1];
    $err->{line} = $errors[2];
    $err->{char} = $errors[3];

    # Workaround for onsgmls 1.5 sometimes reporting errors beyond EOL.
    if ((my $l = length($File->{Content}->[$err->{line}-1])) < $err->{char}) {
      $err->{char} = $l;
    }
    $err->{num}  = $errors[4] || '';
    
    # the following is necessary for openSP 1.5.2+, 
    # which seems to have introduced a change in error messagenumbering
    # syntax appears to be session_id.error_id - We keep only the latter part
    $err->{num}  =~ s/(.*)\.//;
    
    $err->{type} = $errors[5] || '';
    if ($err->{type} eq 'E' or $err->{type} eq 'X' or $err->{type} eq 'Q') {
      $err->{msg}  = join ':', @errors[6 .. $#errors];
    } elsif ($err->{type} eq 'W') {

      #@@FIXME: This is borked after templatification.
      # &add_warning($File, 'fake', 'Warning:',
      #  "Line $err->{line}, column $err->{char}: " . &ent($errors[6]));
      #@@FIXME;
      $err->{msg}  = join ':', @errors[6 .. $#errors];
    } else {
      $err->{type} = 'I';
      $err->{num}  = '';
      $err->{msg}  = join ':', @errors[4 .. $#errors];
    }

    # No or unknown FPI and a relative SI.
    if ($err->{msg} =~ m(cannot (open|find))) {
      $File->{'Error Flagged'} = TRUE;
      $File->{E}->param(fatal_parse_extid_error => TRUE);
      $File->{E}->param(fatal_parse_extid_msg   => $err->{msg});
    }

    # No DOCTYPE.
    if ($err->{msg} =~ m(prolog can\'t be omitted)) {
      my $dtd = ($File->{Mode} == MODE_SGML ?
                   'HTML 4.01 Transitional' : 'XHTML 1.0 Transitional');
      &add_warning('W09', {W09_dtd => $dtd});
      next; # Don't report this as a normal error.
    }

    &abort_if_error_flagged($File, O_DOCTYPE);
    $err->{msg} =~ s/^\s*//;
    push @{$File->{Errors}}, $err;
  }
  undef $fh;
  return $File;
}

#
# Generate a HTML report of detected errors.
sub report_errors ($) {
  my $File = shift;
  my $Errors = [];
  my $number_of_errors = 0;
  my $number_of_warnings = 0;
  my $number_of_info = 0;

  # Hash to keep track of how many of each error is reported.
  my %Msgs; # Used to generate a UID for explanations.

  if (scalar @{$File->{Errors}}) {
    foreach my $err (@{$File->{Errors}}) {
      my ($line, $col) = &truncate_line($File->{Content}->[$err->{line}-1], $err->{char});

      $line = &mark_error($line, $col);

      my $explanation;
      if ($err->{num}) {
        my $num = $err->{num};
        if (exists $Msgs{$num}) { # We've already seen this message...
          if ($File->{Opt}->{Verbose}) { # ...so only repeat it in Verbose mode.
            $explanation = qq(\n    <div class="hidden mid-$num"></div>\n);
          }
        } else {
          $Msgs{$num} = 1;
          $explanation .= "\n    $RSRC->{msg}->{$num}->{verbose}\n"
            if exists $RSRC->{msg}->{$num}
            && exists $RSRC->{msg}->{$num}->{verbose};
        }
        my $_msg = $RSRC->{msg}->{nomsg}->{verbose};
        $_msg =~ s/<!--MID-->/$num/g;
        if ($File->{'Is Upload'})
        {
          $_msg =~ s/<!--URI-->//g
        }
        else
        {
          my $escaped_uri = uri_escape($File->{URI});
          $_msg =~ s/<!--URI-->/$escaped_uri/g;
        }
        $explanation .= "    $_msg\n"; # The send feedback plea.
      }

      $err->{src} = $line;
      $err->{col} = ' ' x $col;
      $err->{expl} = $explanation;
      if ($err->{type} eq 'I')
      {
        $err->{class} = 'msg_info';
        $err->{err_type_err} = 0;
        $err->{err_type_warn} = 0;
        $err->{err_type_info} = 1;
        $number_of_info += 1;
      }
      elsif ($err->{type} eq 'E')
      {
        $err->{class} = 'msg_err';
        $err->{err_type_err} = 1;
        $err->{err_type_warn} = 0;
        $err->{err_type_info} = 0;
        $number_of_errors += 1;
      }
      elsif (($err->{type} eq 'W') or ($err->{type} eq 'X') )
      {
        $err->{class} = 'msg_warn';
        $err->{err_type_err} = 0;
        $err->{err_type_warn} = 1;
        $err->{err_type_info} = 0;
        $number_of_warnings += 1;
      }
      # TODO other classes for "X" etc? FIXME find all types of message.

      push @{$Errors}, $err;
    }
  }
  return $number_of_errors, $number_of_warnings, $number_of_info, $Errors;
}


#
# Chop the source line into 3 pieces; the character at which the error
# was detected, and everything to the left and right of that position.
# That way we can add markup to the relevant char without breaking &ent().
sub mark_error (\$\$) {
  my $line = shift;
  my $col  = shift;

  #
  # Left side...
  my $left;
  {
    my $offset = 0; # Left side allways starts at 0.
    my $length;

    if ($col - 1 < 0) { # If error is at start of line...
      $length = 0; # ...floor to 0 (no negative offset).
    } elsif ($col == length $line) { # If error is at EOL...
      $length = $col - 1; # ...leave last char to indicate position.
    } else { # Otherwise grab everything up to pos of error.
      $length = $col;
    }
    $left = substr $line, $offset, $length;
    $left = &ent($left);
  }

  #
  # The character where the error was detected.
  my $char;
  {
    my $offset;
    my $length = 1; # Length is always 1; the char where error was found.

    if ($col == length $line) { # If err is at EOL...
      $offset = $col - 1; # ...then grab last char on line instead.
    } else {
      $offset = $col; # Otherwise just grab the char.
    }
    $char = substr $line, $offset, $length;
    $char = &ent($char);
  }

  #
  # The right side up to the end of the line...
  my $right;
  {
    my $offset;
    my $length;

    # Offset...
    if ($col == length $line) { # If at EOL...
      $offset = 0; # Don't bother as there is nothing left to grab.
    } else {
      $offset = $col + 1; # Otherwise get everything from char-after-error.
    }

    # Length...
    if ($col == length $line) { # If at end of line...
      $length = 0; # ...then don't grab anything.
    } else {
      $length = length($line) - ($col - 1); # Otherwise get the rest of the line.
    }
    $right = substr $line, $offset, $length;
    $right = &ent($right);
  }

  $char = qq(<strong title="Position where error was detected.">$char</strong>);
  $line = $left . $char . $right;

  return $line;
}

#
# Produce an outline of the document based on Hn elements from the ESIS.
sub outline {
  my $File = shift;

  my $outline = '';

  my $prevlevel = 0;
  my $level     = 0;

  for (1..$#{$File->{ESIS}}) {
    my $line = $File->{ESIS}->[$_];
    next unless ($line && $line =~ /^\(H([1-6])$/i);

    $prevlevel = $level;
    $level     = $1;

    my $TAB = $level + 2;

    if ($prevlevel == 0) {
      $outline .= "    <ul>\n";
    } else {
      if ($level < $prevlevel) {
        $outline .= "</li>\n";
        for (my $i = $prevlevel; $i > $level; $i--) {
          $outline .= "  " x ($i + 2)       . "</ul>\n";
          $outline .= "  " x (($i + 2) - 1) . "</li>\n";
        }
      } elsif ($level == $prevlevel) {
        $outline .= "</li>\n";
      } elsif ($level > $prevlevel) {
        if ($level - $prevlevel > 1) {
          foreach my $i (($prevlevel + 1) .. ($level - 1)) {
            $outline .= "\n". "  " x ($i + 2) . "<ul>\n" . "  " x ($i + 2);
            $outline .= qq(<li class="warning">A level $i heading is missing!);
          }
          $outline .= "\n" . "  " x $TAB . "<ul>\n";
        } else {
          $outline .= "\n" . "  " x $TAB;
          $outline .= "<ul>\n";
        }
      }
    }

    $line       = '';
    my $heading = '';
    until (substr($line, 0, 3) =~ /^\)H$level/i) {
      $line = $File->{ESIS}->[$_++];
      if ($line =~ /^-/) {
        my $headcont = $line;
        substr($headcont, 0, 1) = " ";
        $heading .= $headcont;
      } elsif ($line =~ /^AALT CDATA( .+)/i) {
        my $headcont = $1;
        $heading .= $headcont;
      }
    }

    $heading =~ s/\\011/ /g;
    $heading =~ s/\\012/ /g;
    $heading =~ s/\\n/ /g;
    $heading =~ s/\s+/ /g;
    $heading =~ s/^[- ]//;
    $heading = &ent($heading);
    $outline .= "    <li>$heading";
  }
  $outline .= "    </li></ul>\n" x $level;
  return $outline;
}

#
# Create a HTML representation of the document.
sub source {
  my $File = shift;

  # Remove any BOM since we're not at BOT anymore...
  $File->{Content}->[0] =
    substr $File->{Content}->[0], ($File->{BOM} ? 3 : 0); # remove BOM

  my @source = map({file_source_line => $_}, @{$File->{Content}});
  return \@source;
}

#
# Create a HTML Parse Tree of the document for validation report.
sub parsetree {
  my $File = shift;
  my $tree = '';

  $T->param(file_parsetree_noatt => TRUE) if $File->{Opt}->{'No Attributes'};

  my $indent   = 0;
  my $prevdata = '';

  foreach my $line (@{$File->{ESIS}}) {

    next if ($File->{Opt}->{'No Attributes'} && $line =~ /^A/);

    $line =~ s/\\n/ /g;
    $line =~ s/\\011/ /g;
    $line =~ s/\\012/ /g;
    $line =~ s/\s+/ /g;
    next if $line =~ /^-\s*$/;

    if ($line =~ /^-/) {
      substr($line, 0, 1) = ' ';
      $prevdata .= $line;
      next;
    } elsif ($prevdata) {
      $prevdata =~ s/\s+/ /g;
      local($Text::Wrap::huge) = 'overflow'; # bug 2623
      $tree .= &ent(wrap(' ' x $indent, ' ' x $indent, $prevdata)) . "\n";
      undef $prevdata;
    }

    $line = &ent($line);
    if ($line =~ /^\)/) {
      $indent -= 2;
    }

    my $printme;
    chomp($printme = $line);
    if (my ($close, $elem) = $printme =~ /^([()])(.+)/) {
      # reformat and add links on HTML elements
      $close = ($close eq ')') ? '/' : ''; # ")" -> close-tag
      if (my $u = $CFG->{Elements}->{lc($elem)}) {
        $elem = '<a href="' . $CFG->{'Element Ref URI'} . "$u\">$elem</a>";
      }
      $printme = "&lt;$close$elem&gt;";
    } else {
      $printme =~ s,^A,  A,; # indent attributes a bit
    }

    $tree .= ' ' x $indent . $printme . "\n";

    if ($line =~ /^\(/) {
      $indent += 2;
    }
  }
  return $tree;
}

#
# Do an initial parse of the Document Entity to extract FPI.
sub preparse_doctype {
  my $File = shift;

  #
  # Reset DOCTYPE, Root (for second invocation, probably not needed anymore).
  $File->{DOCTYPE}         = '';
  $File->{Root}            = '';

  my $dtd = sub {
    return if $File->{Root};
    ($File->{Root}, $File->{DOCTYPE}) = shift =~ m(<!DOCTYPE\s+(\w+)\s+(?:PUBLIC|SYSTEM)\s+(?:[\'\"])([^\"\']+)(?:[\"\']).*>)si;
  };

  my $start = sub {
    my $tag  = shift;
    my $attr = shift;
    my %attr = map {lc($_) => $attr->{$_}} keys %{$attr};

    if ($File->{Root}) {
      return unless $tag eq $File->{Root};
    } else {
      $File->{Root} = $tag;
    }
    if ($attr->{xmlns}) {$File->{Namespace} = $attr->{xmlns}};
  };

  my $p = HTML::Parser->new(api_version => 3);
  $p->xml_mode(TRUE);
  $p->ignore_elements('BODY');
  $p->ignore_elements('body');
  $p->handler(declaration => $dtd, 'text');
  $p->handler(start => $start, 'tag,attr');
  $p->parse(join "\n", @{$File->{Content}});

  $File->{DOCTYPE} = '' unless defined $File->{DOCTYPE};
  $File->{DOCTYPE} =~ s(^\s+){ }g;
  $File->{DOCTYPE} =~ s(\s+$){ }g;
  $File->{DOCTYPE} =~ s(\s+) { }g;

  return $File;
}

#
# Print out the raw ESIS output for debugging.
sub show_esis ($) {
  my $file_esis = "";
  for (@{shift->{ESIS}}) {
    s/\\012//g;
    s/\\n/\n/g;
    $file_esis .= ent $_;
    $file_esis .= "\n";
  }
  return  $file_esis;
}

#
# Print out the raw error output for debugging.
sub show_errors ($) {
  my $file_raw_errors = "";
  for (@{shift->{DEBUG}->{Errors}}) {
  $file_raw_errors .= ent $_
  };
  return $file_raw_errors;
}

#
# Preprocess CGI parameters.
sub prepCGI {
  my $File = shift;
  my $q    = shift;

  # Avoid CGI.pm's "exists but undef" behaviour.
  if (scalar $q->param) {
    foreach my $param ($q->param) {
      next if $param eq 'uploaded_file'; # 'uploaded_file' contains data.
      next if $param eq 'fragment';      # Ditto 'fragment'.
      next if $q->param($param) eq '0';  # Keep false-but-set params.

      #
      # Parameters that are given to us without specifying a value get
      # set to "1" (the "TRUE" constant). This is so we can test for the
      # boolean value of a parameter instead of first checking whether
      # the param was given and then testing it's value. Needed because
      # CGI.pm sets ";param" and ";param=" to a boolean false value
      # (undef() or a null string, respectively).
      $q->param($param, TRUE) unless $q->param($param);
    }
  }

  # Futz the URL so "/referer" works.
  if ($q->path_info) {
    if ($q->path_info eq '/referer' or $q->path_info eq '/referrer') {
      if ($q->referer) {
        $q->param('uri', $q->referer);
        print redirect &self_url_q($q, $File);
        exit;
      } else {
        print redirect $q->url() . '?uri=' . 'referer';
        exit;
      }
    } else {
      print redirect &self_url_q($q, $File);
      exit;
    }
  }

  # Use "url" unless a "uri" was also given.
  if ($q->param('url') and not $q->param('uri')) {
    $q->param('uri', $q->param('url'));
  }

  # Munge the URL to include commonly omitted prefix.
  my $u = $q->param('uri');
  $q->param('uri', "http://$u") if $u && $u =~ m(^www)i;

  # Issue a redirect for uri=referer.
  if ($q->param('uri') and $q->param('uri') eq 'referer') {
    if ($q->referer) {
      $q->param('uri', $q->referer);
      print redirect &self_url_q($q, $File);
      exit;
    } else {

      # Redirected from /check/referer to /check?uri=referer because
      # the browser didn't send a Referer header, or the request was
      # for /check?uri=referer but no Referer header was found.
      $File->{'Error Flagged'} = TRUE;

      $File->{E}->param(fatal_referer_error => TRUE);
    }
  }

  # Supersede URL with an uploaded file.
  if ($q->param('uploaded_file')) {
    $q->param('uri', 'upload://' . $q->param('uploaded_file'));
    $File->{'Is Upload'} = TRUE; # Tag it for later use.
  }

  # Supersede URL with an uploaded fragment.
  if ($q->param('fragment')) {
    $q->param('uri', 'upload://Form Submission');
    $File->{'Is Upload'} = TRUE; # Tag it for later use.
  }

  # Redirect to a GETable URL if method is POST without a file upload.
  if ($q->request_method eq 'POST' and not $File->{'Is Upload'}) {
    my $thispage = &self_url_q($q, $File);
    print redirect $thispage;
    exit;
  }

  #
  # Flag an error if we didn't get a file to validate.
  unless ($q->param('uri')) {
    $File->{'Error Flagged'} = TRUE;
    $File->{E}->param(fatal_uri_error  => TRUE);
    $File->{E}->param(fatal_uri_scheme => 'undefined');
  }

  return $q;
}

#
# Preprocess SSI files.
sub prepSSI {
  my $opt = shift;

  my $fh = new IO::File "< $opt->{File}"
    or croak "open($opt->{File}) returned: $!\n";
  my $ssi = join '', <$fh>;
  close $fh or carp "close($opt->{File}) returned: $!\n";

  $ssi =~ s/<!--\#echo var="title" -->/$opt->{Title}/g
    if defined $opt->{Title};

  $ssi =~ s/<!--\#echo var="date" -->/$opt->{Date}/g
    if defined $opt->{Date};

  $ssi =~ s/<!--\#echo\s+var="revision"\s+-->/$opt->{Revision}/g
    if defined $opt->{Revision};

  # No need to parametrize this one, it's always "./" in this context.
  $ssi =~ s|<!--\#echo\s+var="relroot"\s+-->|./|g;

  return $ssi;
}

#
# Utility sub to tell if mode "is" XML.
sub is_xml {shift->{Mode} == MODE_XML};

#
# Do an initial parse of the Document Entity to extract charset from HTML <meta>.
# (still also extracts FPI, at least to some extent)
sub preparse_meta {
  my $File = shift;

  my $dtd = sub {
    return if $File->{Root};
    ($File->{Root}, $File->{DOCTYPE}) = shift =~ m(<!DOCTYPE\s+(\w+)\s+PUBLIC\s+(?:[\'\"])([^\"\']+)(?:[\"\']).*>)si;
  };

  my $start = sub {
    my $tag  = shift;
    my $attr = shift;
    my %attr = map {lc($_) => $attr->{$_}} keys %{$attr};

    if ($File->{Root}) {
      if (lc $tag eq 'meta') {
        if (lc $attr{'http-equiv'} eq 'content-type') {
          if ($attr{content} =~ m(charset\s*=[\s\"\']*([^\s;\"\'>]*))si) {
            $File->{Charset}->{META} = lc $1;
          }
        }
      }
      return unless $tag eq $File->{Root};
    } else {
      $File->{Root} = $tag;
    }
  };

  my $p = HTML::Parser->new(api_version => 3);
  $p->xml_mode(TRUE);
  $p->ignore_elements('BODY');
  $p->ignore_elements('body');
  $p->handler(declaration => $dtd, 'text');
  $p->handler(start => $start, 'tag,attr');
  $p->parse(join "\n", @{$File->{Content}});

  return $File;
}

#
# Check charset conflicts and add any warnings necessary.
sub charset_conflicts {
  my $File = shift;

  #
  # Handle the case where there was no charset to be found.
  unless ($File->{Charset}->{Use}) {
    &add_warning('W17', {});
    $File->{Tentative} |= T_WARN;
  }

  my $cs_use  = $File->{Charset}->{Use}  ? &ent($File->{Charset}->{Use})  : '';
  my $cs_opt  = $File->{Opt}->{Charset}  ? &ent($File->{Opt}->{Charset})  : '';
  my $cs_http = $File->{Charset}->{HTTP} ? &ent($File->{Charset}->{HTTP}) : '';
  my $cs_xml  = $File->{Charset}->{XML}  ? &ent($File->{Charset}->{XML})  : '';
  my $cs_meta = $File->{Charset}->{META} ? &ent($File->{Charset}->{META}) : '';

  #
  # Add a warning if there was charset info conflict (HTTP header,
  # XML declaration, or <meta> element).
  if (&conflict($File->{Charset}->{HTTP}, $File->{Charset}->{XML}) and not ($File->{'Direct Input'})) {
    &add_warning('W18', {
      W18_http => $cs_http,
      W18_xml  => $cs_xml,
      W18_use  => $cs_use,
    });
  } elsif (&conflict($File->{Charset}->{HTTP}, $File->{Charset}->{META}) and not ($File->{'Direct Input'})) {
    &add_warning('W19', {
      W19_http => $cs_http,
      W19_meta => $cs_meta,
      W19_use  => $cs_use,
    });
  } elsif (&conflict($File->{Charset}->{XML}, $File->{Charset}->{META})) {
    &add_warning('W20', {
      W20_http => $cs_xml,
      W20_xml  => $cs_meta,
    });
    $File->{Tentative} |= T_WARN;
  }

  return $File;
}

#
# Transcode to UTF-8
sub transcode {
  my $File = shift;

  my $cs = $File->{Charset}->{Use};
  my ($command, $result_charset) = ('', '');
  if ($CFG->{Charsets}->{$cs}) {
    ($command, $result_charset) = split(" ", $CFG->{Charsets}->{$cs}, 2);
  }

  my $c;
  $result_charset = exact_charset($File, $result_charset);

  if ($command eq 'I') {
    # test if given charset is available
    eval { $c = Text::Iconv->new($result_charset, 'utf-8') };
    $command = '' if $@;
  } elsif ($command eq 'X') {
    $@ = "$File->{Charset}->{Use} undefined; replace by $result_charset";
  }

  if ($command ne 'I') {
    $File->{'Error Flagged'} = TRUE;
    $File->{E}->param(fatal_transcode_error   => TRUE);
    $File->{E}->param(fatal_transcode_charset => $cs);
    $File->{E}->param(fatal_transcode_errmsg  => ($@ || ''));
    return $File;
  }

  my $line = 0;
  for (@{$File->{Content}}) {
    my $in = $_;
    $line++;
    $_ = $c->convert($_); # $_ is local!!
    if ($in ne "" and (!defined($_) || $_ eq "")) {
      push @{$File->{Lines}}, $line;

      # try to decoded as much as possible of the line
      my $short = 0;                # longest okay
      my $long  = (length $in) - 1; # longest unknown

      while ($long > $short) {
        # binary search
        my $try = int (($long+$short+1) / 2);
        my $converted = $c->convert(substr($in, 0, $try));
        if (!defined($converted) || $converted eq "") {
          $long  = $try-1;
        } else {
          $short = $try;
        }
      }
      my $remain = (length $in) - $short;
      my $converted = $c->convert(substr($in, 0, $short));
      $_ = defined($converted) ? $converted : ''
           . "#### $remain byte(s) unconvertible ####";
    }
  }
  return $File;
}

#
# Check correctness of UTF-8 both for UTF-8 input and for conversion results
sub check_utf8 {
  my $File = shift;

  for (my $i = 0; $i < $#{$File->{Content}}; $i++) {
    # substitution needed for very long lines (>32K), to avoid backtrack
    # stack overflow. Handily, this also happens to count characters.
    local $_ = $File->{Content}->[$i];
    my $count =
    s/  [\x00-\x7F]                           # ASCII
      | [\xC2-\xDF]        [\x80-\xBF]        # non-overlong 2-byte sequences
      |  \xE0[\xA0-\xBF]   [\x80-\xBF]        # excluding overlongs
      | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}     # straight 3-byte sequences
      |  \xED[\x80-\x9F]   [\x80-\xBF]        # excluding surrogates
      |  \xF0[\x90-\xBF]   [\x80-\xBF]{2}     # planes 1-3
      | [\xF1-\xF3]        [\x80-\xBF]{3}     # planes 4-15
      |  \xF4[\x80-\x8F][\x80-\xBF]{2}        # plane 16
     //xg;
    if (length) {
      push @{$File->{Lines}}, ($i+1);
      $File->{Content}->[$i] = "#### encoding problem on this line, not shown ####";
      $count = 50; # length of above text
    }
    $count += 0; # Force numeric.
    $File->{Offsets}->[$i + 1] = [$count, $File->{Offsets}->[$i]->[1] + $count];
  }

  # Add a warning if doc is UTF-8 and contains a BOM.
  if ($File->{Charset}->{Use} eq 'utf-8' &&
        $File->{Content}->[0] =~ m(^\xEF\xBB\xBF)) {
    &add_warning('W21', {});
  }
  return $File;
}

#
# byte error analysis
sub byte_error {
  my $File = shift;
  my @lines = @{$File->{Lines}};
  if (scalar @lines) {
    $File->{'Error Flagged'} = TRUE;
    my $lines = join ', ', split ',', Set::IntSpan->new(\@lines)->run_list;
    $File->{E}->param(fatal_byte_error   => TRUE);
    $File->{E}->param(fatal_byte_lines   => $lines);
    $File->{E}->param(fatal_byte_charset => $File->{Charset}->{Use});
  }
  return $File;
}

#
# Autodetection as in Appendix F of the XML 1.0 Recommendation.
# <http://www.w3.org/TR/2000/REC-xml-20001006#sec-guessing>
#
# return values are: (base_encoding, BOMSize, Size, Pattern)
sub find_base_encoding {
  local $_ = shift;

  # With a Byte Order Mark:
  return ('ucs-4be',  4, 4, '\0\0\0(.)')
    if /^\x00\x00\xFE\xFF/; # UCS-4, big-endian machine (1234)
  return ('ucs-4le',  4, 4, '(.)\0\0\0')
    if /^\xFF\xFE\x00\x00/; # UCS-4, little-endian machine (4321)
  return ('utf-16be', 2, 2, '\0(.)')
    if /^\xFE\xFF/;         # UTF-16, big-endian.
  return ('utf-16le', 2, 2, '(.)\0')
    if /^\xFF\xFE/;         # UTF-16, little-endian.
  return ('utf-8',    3, 1, '')
    if /^\xEF\xBB\xBF/; # UTF-8.

  # Without a Byte Order Mark:
  return ('ucs-4be',  0, 4, '\0\0\0(.)')
    if /^\x00\x00\x00\x3C/; # UCS-4 or 32bit; big-endian machine (1234 order).
  return ('ucs-4le',  0, 4, '(.)\0\0\0')
    if /^\x3C\x00\x00\x00/; # UCS-4 or 32bit; little-endian machine (4321 order).
  return ('utf-16be', 0, 2, '\0(.)')
    if /^\x00\x3C\x00\x3F/; # UCS-2, UTF-16, or 16bit; big-endian.
  return ('utf-16le', 0, 2, '(.)\0')
    if /^\x3C\x00\x3F\x00/; # UCS-2, UTF-16, or 16bit; little-endian.
  return ('utf-8',    0, 1, '')
    if /^\x3C\x3F\x78\x6D/; # UTF-8, ISO-646, ASCII, ISO-8859-*, Shift-JIS, EUC, etc.
  return ('ebcdic',   0, 1, '')
    if /^\x4C\x6F\xA7\x94/; # EBCDIC
  return ('',         0, 1, '');
                            # nothing in particular
}

#
# Find encoding in document according to XML rules
# Only meaningful if file contains a BOM, or for well-formed XML!
sub find_xml_encoding {
  my $File = shift;
  my ($CodeUnitSize, $Pattern);

  ($File->{Charset}->{Auto}, $File->{BOM}, $CodeUnitSize, $Pattern)
    = &find_base_encoding($File->{Bytes});

  # 100 arbitrary, but enough in any case
  my $someBytes = substr $File->{Bytes}, $File->{BOM}, ($CodeUnitSize * 100);
  my $someText  = '';

  # translate from guessed encoding to ascii-compatible
  if ($File->{Charset}->{Auto} eq 'ebcdic') {

    # special treatment for EBCDIC, maybe use tr///
    # work on this later
  }
  elsif (!$Pattern) {
    $someText = $someBytes; # efficiency shortcut
  }
  else {
    # generic code for UTF-16/UCS-4
    $someBytes =~ /^(($Pattern)*)/s;
    $someText = $1;                   # get initial piece without chars >255
    $someText =~ s/$Pattern/$1/sg;    # select the relevant bytes
  }

  # try to find encoding pseudo-attribute
  my $s = '[\ \t\n\r]';
  $someText =~ m(^<\?xml $s+ version $s* = $s* ([\'\"]) [-._:a-zA-Z0-9]+ \1 $s+
                  encoding $s* = $s* ([\'\"]) ([A-Za-z][-._A-Za-z0-9]*) \2
                )xso;

  $File->{Charset}->{XML} = lc $3;
  return $File;
}

#
# Abort with a message if an error was flagged at point.
sub abort_if_error_flagged {
  my $File  = shift;
  my $Flags = shift;

  return unless $File->{'Error Flagged'};
  return if     $File->{'Error Handled'}; # Previous error, keep going.

  if ($File->{Opt}->{Output} eq 'html') {
    &prep_template($File, $E);
    print $E->output;
    exit;
  } else {

    #@@FIXME: This is borked after templatification.
    # &add_warning($File, 'fatal', 'Fatal Error', <<".EOF.");
    # A fatal error has occurred while processing the requested document. Processing
    # has continued but any later output will be of dubious quality. Limitations of
    # this output mode prevent the full error message from being returned; please
    # retry this operation in interactive mode using the web interface to see the
    # actual error message.
    # .EOF.
    #@@FIXME;
    $File->{'Error Handled'} = TRUE;
  }
}

#
# conflicting encodings
sub conflict {
  my $encodingA = shift;
  my $encodingB = shift;
  return $encodingA && $encodingB && ($encodingA ne $encodingB);
}

#
# Construct a self-referential URL from a CGI.pm $q object.
sub self_url_q {
  my ($q, $File) = @_;
  my $thispage = $File->{Env}->{'Self URI'};
  $thispage .= '?uri='       . uri_escape($q->param('uri'));
  $thispage .= ';ss=1'      if $q->param('ss');
  $thispage .= ';sp=1'      if $q->param('sp');
  $thispage .= ';noatt=1'   if $q->param('noatt');
  $thispage .= ';outline=1' if $q->param('outline');
  $thispage .= ';No200=1'   if $q->param('No200');
  $thispage .= ';verbose=1' if $q->param('verbose');
  if ($q->param('doctype')
      and not $q->param('doctype') =~ /(Inline|detect)/i) {
    $thispage .= ';doctype=' . uri_escape($q->param('doctype'));
  }
  if ($q->param('charset') and not $q->param('charset') =~ /detect/i) {
    $thispage .= ';charset=' . uri_escape($q->param('charset'));
  }
  return $thispage;
}

#
# Return random Tip with it's URL.
sub get_tip {
  my @tipAddrs = keys %{$CFG->{Tips}};
  my $tipAddr  = $tipAddrs[rand scalar @tipAddrs];
  my $tipSlug  = $CFG->{Tips}->{$tipAddr};

  return [$tipAddr, $tipSlug];
}

#
# Construct a self-referential URL from a $File object.
sub self_url_file {
  my $File = shift;

  my $thispage = $File->{Env}->{'Self URI'};
  my $escaped_uri = uri_escape($File->{URI});
  $thispage .= qq(?uri=$escaped_uri);
  $thispage .= ';ss=1'      if $File->{Opt}->{'Show Source'};
  $thispage .= ';sp=1'      if $File->{Opt}->{'Show Parsetree'};
  $thispage .= ';noatt=1'   if $File->{Opt}->{'No Attributes'};
  $thispage .= ';outline=1' if $File->{Opt}->{'Outline'};
  $thispage .= ';verbose=1' if $File->{Opt}->{'Verbose'};
  $thispage .= ';No200=1'   if $File->{Opt}->{'No200'};

  return $thispage;
}

#####

package W3C::Validator::UserAgent;

use LWP::UserAgent  1.90 qw(); # Need 1.90 for protocols_(allowed|forbidden)
use Net::hostent         qw(gethostbyname);
use Net::IP              qw();
use Socket               qw(inet_ntoa);

use base qw(LWP::UserAgent);

sub new {
  my ($proto, $CFG, $File, @rest) = @_;
  my $class = ref($proto) || $proto;
  my $self = $class->SUPER::new(@rest);
  $self->{'W3C::Validator::CFG'}  = $CFG;
  $self->{'W3C::Validator::File'} = $File;
  return $self;
}

sub redirect_ok {
  my ($self, $req, $res) = @_;
  return $self->SUPER::redirect_ok($req, $res) && $self->uri_ok($req->uri());
}

sub uri_ok {
  my ($self, $uri) = @_;
  return 1 if ($self->{'W3C::Validator::CFG'}->{'Allow Private IPs'} or
               !$uri->can('host'));

  my $addr = my $iptype = undef;
  if (my $host = gethostbyname($uri->host())) {
    $addr = inet_ntoa($host->addr()) if $host->addr();
    if ($addr && (my $ip = Net::IP->new($addr))) {
      $iptype = $ip->iptype();
    }
  }
  if ($iptype && $iptype ne 'PUBLIC') {
    my $File = $self->{'W3C::Validator::File'};
    $File->{'Error Flagged'} = 1;
    $File->{E}->param(fatal_ip_error    => 1);
    $File->{E}->param(fatal_ip_hostname => 1)
      if $addr and $uri->host() ne $addr;
    $File->{E}->param(fatal_ip_host => ($uri->host() || 'undefined'));
    return 0;
  }
  return 1;
}

# Local Variables:
# mode: perl
# indent-tabs-mode: nil
# tab-width: 2
# perl-indent-level: 2
# End:
