#!/usr/bin/perl -w

# Copyright (C) 2013,2014 Ole Tange, Mike DeGiorgio, Anna-Sapfo
# Malaspinas, Jose Victor Moreno-Mayar, Yong Wang and Free Software
# Foundation, Inc.
# 
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License,
# or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

use Getopt::Long;

Getopt::Long::Configure("bundling");

my @retval = GetOptions
    ("debug|D" => \$opt::debug,
     "verbose|v" => \$opt::verbose,
#     "help|h" => \$opt::help,
     "triallelic_before_sampling|tribefore" => \$opt::triallelic_before_sampling,
    ) or pod2usage(2);

my $summary_file = shift || "tmp/summary";

srand(1);

my $individual_header = <>;
if(not defined($individual_header)) {
    # Empty input
    exit();
}
my @individuals = split /\s+/, $individual_header;
my $num_individuals = $#individuals + 1;
my $population_header = <>;
my @populations = split /\s+/, $population_header;
my $num_populations = $#populations + 1;
if($num_individuals != $num_populations) {
  error("Individuals @individuals\n");
  error("Populations @populations\n");
  error("There are $num_individuals individuals but $num_populations matching populations\n");
  exit();
}

my ($chr, @cols, $count_type, %sum, @available, @alleles_taken,
    @different_alleles_seen, %ref, $indv, @number_of_snps);
my $biallelic_sites = 0;
my $triallelic_sites = 0;
my $chrid = "ID";

# Always translate undef = -9, 0 = 0, 1 = 1
$ref{undef} = -9;
$ref{-9} = -9;
$ref{0} = 0;
$ref{1} = 1;

while(<>) {
  # Split input line on \0
  @nulls = split /\0/, $_;
  $chr = shift @nulls;
  $pos = shift @nulls;
  @cols = ();
  # Each chunk represents a different format.
  while(@nulls) {
    $input_type = shift @nulls;
    if($input_type eq "PU") {
      # Pileup line
      push @cols, full_pileup_line(shift @nulls);
    } elsif($input_type eq "TPED") {
      # TPED line
      push @cols, full_tped_line(shift @nulls);
    } elsif($input_type eq "Mike") {
      # Mike format
      push @cols, "Mike", split /\s+/, shift @nulls;
    } else {
      die("Unknown input format: $input_type");
    }
  }

  @sum{'A','C','G','T'} = (0,0,0,0);
  $ref_allele1 = undef;
  $ref_allele2 = undef;
  @alleles_taken = ();
  $indv = 0;
  
  while(@cols) {
    $count_type = shift @cols;
    $indv++;
    if($count_type eq "AC") {
      # Allele count format: ["AC",#a,#c,#g,#t]
      # Grab the next 4 columns into $nt{"A"}, $nt{"C"}, $nt{"G"}, $nt{"T"},
      @nt{"A","C","G","T"} = (shift @cols,shift @cols,shift @cols,shift @cols);
      # Sample allele at random
      @available = grep { $nt{$_} > 0 } keys %nt;
      # Select one of the available alleles at random (undef if none found)
      $selected = $available[rand($#available+1)];
      if(defined $selected) {
	push @alleles_taken, $selected;
	$defined_snp_count[$indv]++;
      } else {
	push @alleles_taken, -9;
      }
      # Keep a sum to determine if this is triallic
      if($opt::triallelic_before_sampling) {
	# Check if triallelic before sampling
	map { $sum{$_} += $nt{$_} } keys %nt;
      } else {
	# Check if triallelic after sampling
	if(defined $selected) {
	  map { $sum{$_} += $nt{$_} } $selected;
	}
      }
    } elsif ($count_type eq "Mike") {
      # Mike format allele count
      # ["Mike", chr, chrid, pos, allele1 allele2 1 0 0 1 0 0 0 0 0 0 0 -9]
      # Grab the next 5 columns
      ($chr, $chrid, $pos, $ref_allele1, $ref_allele2) =
	(shift @cols, shift @cols, shift @cols, shift @cols, shift @cols);
      # Change this to push to get the Ancient first
      push @alleles_taken, @cols;
      $ref{$ref_allele1} = 0;
      $ref{$ref_allele2} = 1;
      # Keep a sum to determine if this is triallic
      $sum{$ref_allele1}++; $sum{$ref_allele2}++;
      last;
    } else {
      die_bug("Format unknown (line $-): @cols");
    }
  }
  # Is triallelic or quadallelic? => ignore site
  @different_alleles_seen = grep { ($sum{$_} || 0) > 0 } qw(A C G T);
  if($#different_alleles_seen >= 2) {
    # Triallelic or more (perl counts from 0)
    # Ignored
    my $sel = $selected || "-";
    if($opt::verbose) {
      print STDERR "$chr $pos is triallelic (@different_alleles_seen) $sel ", join("/",%nt)," ::: ",join("/",%sum),"\n";
    }
    $triallelic_sites++;
  } else {
    # Site is biallelic or monoallelic
    if(not $ref_allele1) {
      # If the encoding is not decided from the Mike format:
      # Invent the 0,1 encoding for the alleles
      @ref{@different_alleles_seen} = (0,1);
      ($ref_allele1,$ref_allele2) = @different_alleles_seen;
      # If we see only one allele here: Use the same for both
      $ref_allele2 ||= $ref_allele1;
    }
    print $chr, " ", $chrid, " ", $pos, " ", $ref_allele1, " ", $ref_allele2;
    # Convert A G G A G undef => 1 0 0 1 0 -9
    print " @ref{@alleles_taken}";
    print "\n";
    $biallelic_sites++;
  }
}

if($opt::verbose) {
  # Perl counts from 0, but we only initialized 1 and up, so skip the first.
  shift @defined_snp_count;
  print STDERR "Number of snps: ",map { ("\t",undef_as_zero($_)) } @defined_snp_count;
  print STDERR "\n";
  $triallelic_sites = undef_as_zero($triallelic_sites);
  printf STDERR "Number of bi/monoallelic sites: %d (%.2f%%)\n",$biallelic_sites,$biallelic_sites/($biallelic_sites+$triallelic_sites)*100;
  printf STDERR "Number of (ignored) triallelic sites: %d (%.2f%%)\n",$triallelic_sites,$triallelic_sites/($biallelic_sites+$triallelic_sites)*100;
}

sub full_pileup_line {
  my $line = shift;
  my ($chr, $pos, $ref, @PUline) = split(/\t/, $line);
  # mpileup format:
  # chr pos             Ref     Depth1  Pileup1 Qual1   Depth2  Pileup2 Qual2
  # 22  32781587        N       1       ^FG     G       1       ^FG     G
  # 22  32781722        N       1       c       E       1       c       E
  # 22  32904927        N       2       t$t     HB      0       *       *
  # 22  42815298        N       0                       0       *       *
  # 22  42815298        N       1       c       H       0       ^~A$    J
  my @out;
  while(@PUline > 3) {
    my($depth1, $pileup, $qual) = (shift @PUline, shift @PUline, shift @PUline);
    if(defined $pileup) {
      # Allele count (AC) for $individual
      push @out, "AC", parse_pileup($pileup);
    }
  }
  return @out;
}

sub full_tped_line {
  my $tped_line = shift;
  #   push @out, (a,c,g,t)
  my @out;
  for my $set (parse_tped($tped_line)) {
    # Allele counts (AC) for individuals in tped
    push @out, "AC", @$set;
  }
  return @out;
}


sub parse_tped {
  # Read a tped line and return a list of alleles
  # Input:
  #   $tped_line = line from file.tped
  # Output:
  #   @output = ([#a,#c,#g,#t], [#a,#c,#g,#t], [#a,#c,#g,#t], [#a,#c,#g,#t], ...);
  my ($tped_line) = shift;
  # chr,pos_name, ?, pos, indv-1-parent-1, indv-1-parent-2, indv-2-parent-1, indv-2-parent-2, ...
  # 21 rs12627229 0 10913441 T T T T T T T T T T T T T T C C T T T T T T T T T T T T T T T T C C T T T T T T T T T T T T T T T T 0 0 T T C C T T C T T T T T T T C C T T C C T T C T 0 0 T T T T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 C T T T T T T T T T T T C T T T C T T T T T T T C T T T T T T T T T T T T T C T T T T T T T T T T T C C T T T T C T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T C T C T T T T T C T C T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T C T T T C T T T T T T T T T T T T T T T C T T T T T C T T T T T C T C T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T T
  if(not %Global::allele_pairs) {
    %Global::allele_pairs =
      (
       "00" => [0,0,0,0],
       "0A" => [1,0,0,0],
       "0C" => [0,1,0,0],
       "0G" => [0,0,1,0],
       "0T" => [0,0,0,1],
       
       "A0" => [1,0,0,0],
       "AA" => [2,0,0,0],
       "AC" => [1,1,0,0],
       "AG" => [1,0,1,0],
       "AT" => [1,0,0,1],
       
       "C0" => [0,1,0,0],
       "CA" => [1,1,0,0],
       "CC" => [0,2,0,0],
       "CG" => [0,1,1,0],
       "CT" => [0,1,0,1],
       
       "G0" => [0,0,1,0],
       "GA" => [1,0,1,0],
       "GC" => [0,1,1,0],
       "GG" => [0,0,2,0],
       "GT" => [0,0,1,1],
       
       "T0" => [0,0,0,1],
       "TA" => [1,0,0,1],
       "TC" => [0,1,0,1],
       "TG" => [0,0,1,1],
       "TT" => [0,0,0,2],
      );
  }
  chomp($tped_line);
  # Ignore the first 4 columns
  my ($i1,$i2,$i3,$i4,@col) = split /\s+/, $tped_line;
  my @out;
  while(@col) {
    my $parent1 = shift @col;
    my $parent2 = shift @col;
    if($Global::allele_pairs{$parent1.$parent2}) {
      push @out, $Global::allele_pairs{$parent1.$parent2};
    } else {
      die($parent1.$parent2);
    }
  }
  # Return lists with [#A, #C, #G, #T]
  return @out;
}


sub parse_pileup {
  # Input:
  #   Pileup column (e.g: '^FG' 'c' 't$t');
  # Output:
  #   (number_of_A,number_of_C,number_of_G,number_of_T)
  my $PU = shift;
  $PU = uc($PU);
  my %alleles=();
  $alleles{"A"}=0;
  $alleles{"C"}=0;
  $alleles{"G"}=0;
  $alleles{"T"}=0;
  # A pileup line may look like this G^FG^Fg^FG^FG^FG+1GC-2NN*GA$G
  while($PU) {
    #Always get the first operation into $1 and the rest of the operations into $2
    if($PU =~ s/^(\w)//) {
      # E.g PU="G"
      # Whenever there is a match, add that match to the corresponding allele count
      $alleles{$1}++;
      next;
    } elsif($PU =~ s/^\^(.)//) {
      # E.g PU="^F"
      # When a read ends, there is a hat followed by the read mapping quality. Get rid of this info.
      next;
    } elsif($PU =~ s/^\$//) {
      # E.g PU='$'
      # A dollar means a read starts here, get rid of it.
      next;
    } elsif($PU =~ s/^\*//) {
      # E.g PU='*'
      # * represents a deleted base in a previous read - ignore that
      next;
    } elsif($PU =~ s/^[-+](\d+)(.*)//) {
      # E.g. -2NN
      # This one deals with indels. +2GG would mean that 2 Gs were inserted between this position and the ref in one read
      my $nInDel = $1;
      # Skip the appropriate number of positions in the remaining pileup line
      $PU =~ s/^\w{$nInDel}//;
    } else {
      die_bug("PU contains unmatched chars: $PU");
    }
  }
  # Return the array of allele counts
  return @alleles{"A","C","G","T"};
}


sub die_bug {
  my $bugid = shift;
  $Global::progname ||= $0;
  $Global::version ||= 0.1;
  print STDERR
    ("$Global::progname: This should not happen. You have found a bug.\n",
     "Please contact <parallel\@gnu.org> and include:\n",
     "* The version number: $Global::version\n",
     "* The bugid: $bugid\n",
     "* The command line being run\n",
     "* The files being read (put the files on a webserver if they are big)\n",
     "\n",
     "If you get the error on smaller/fewer files, please include those instead.\n");
  ::wait_and_exit(255);
}


sub my_dump {
    # Returns:
    #   ascii expression of object if Data::Dump(er) is installed
    #   error code otherwise
    my @dump_this = (@_);
    eval "use Data::Dump qw(dump);";
    if ($@) {
        # Data::Dump not installed
        eval "use Data::Dumper;";
        if ($@) {
            my $err =  "Neither Data::Dump nor Data::Dumper is installed\n".
                "Not dumping output\n";
            print STDERR $err;
            return $err;
        } else {
            return Dumper(@dump_this);
        }
    } else {
	# Create a dummy Data::Dump:dump as Hans Schou sometimes has
	# it undefined
	eval "sub Data::Dump:dump {}";
        eval "use Data::Dump qw(dump);";
        return (Data::Dump::dump(@dump_this));
    }
}


sub error {
    my @w = @_;
    my $fh = $Global::original_stderr || *STDERR;
    my $prog = $Global::progname || "bammds_intersect";
    print $fh $prog, ": Error: ", @w;
}


sub warning {
    my @w = @_;
    my $fh = $Global::original_stderr || *STDERR;
    my $prog = $Global::progname || "bammds_intersect";
    print $fh $prog, ": Warning: ", @w;
}


sub debug {
    # Returns: N/A
    $opt::debug or return;
    @_ = grep { defined $_ ? $_ : "" } @_;
    if($Global::original_stdout) {
        print $Global::original_stdout @_;
    } else {
        print @_;
    }
}

sub undef_as_zero {
    return($_[0] || 0);
}

