#!/usr/bin/perl

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License 
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.

# This software is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

# You should have received a copy of the GNU General Public License 
# along with this software; if not, write to the 
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.

# grab data from the economagic web site and get it into a suitable
# form for creating a binary database

# arg: economagic source (e.g. fedbog, beana, fedstl)

# use strict;
use Socket;
use Date::Calc qw(Delta_Days Day_of_Week);
require LWP::UserAgent;
require "getopts.pl";

my(@fields, @obs, @yr, @subper, @subsubper, @sernames, @dirs, @daily, @business);
my($s, $n, $full_n, $i, $url, $line, $pdstr, $varname, $pd, $pdcheck);
my($remote, $port, $iaddr, $paddr, $proto, $msg, $title, $grab);
my($ua, $request, $response);
my $nseries = 0;
my $interactive = 1;
my $verbose = 0;
my $tester = 0;
my $getdaily = 0;

sub check_series_name {
    if ($_[0] =~ m@[^a-zA-Z0-9\+-_/]@) { return 1; }
    return 0;
}

sub series_is_daily {
    if ($_[0] =~ m+/day+) { return 1; }
    if ($_[0] =~ /^day-/) { return 1; }
    return 0;
}

sub business_days {
    # param: daily series name
    my $series = $_[0];
    my $busdays = 1;
    my $wkendcnt = 0;
    my $dow;

    if ($source eq "sp") {
	$grab = "sp/" . "$series";
    } else {
	$grab = "$series";
    }

    $msg = "GET /em-cgi/data.exe/$grab \n\n";
    if ($verbose) {
	print STDERR "Testing frequency of $grab\n";
    }
    socket(SOCK, PF_INET, SOCK_STREAM, $proto) or die "Error: $!";
    connect(SOCK, $paddr) or die "Error: $!";
    send(SOCK, $msg, 0) or die "Cannot send query: $!";

    $n = 0;
    while (<SOCK>) {
	if (/Server Error/) {
	    print;
	    die "Can't get info from server";
	}
	chomp;

	if (/^ 19/ || /^ 20/) {
	    s/\r//;
	    # remove economagic obfuscation
	    s#>[^<]*</#><#g;
	    s#<[^>]*># #g;

	    @fields = split(/ +/);
	    $dow = Day_of_Week($fields[1], $fields[2], $fields[3]);
	    if ($dow == 6 || $dow == 7) {
		$wkendcnt++;
	    }
	    $n++;
	}
	if ($n > 22) { last; }
    }

    if ($wkendcnt >= 6) { 
	$busdays = 0; 
	if ($verbose) { print STDERR " apparently 7-day data\n"; }
    } elsif ($verbose) {
	print STDERR " apparently 5-day data\n"; 
    }

    return $busdays;
}

sub get_gap {
    # params: delta_days, delta_days_expected, last yr, mo, day
    my $dd = $_[0];
    my $dd_exp = $_[1];
    my $gap = $dd - $dd_exp;

    if ($gap > 2) {
	my $start_dow = Day_of_Week($_[2], $_[3], $_[4]);
	my $days_till_sat = 6 - $start_dow;
	my $wkends = 0;

	if ($dd > $days_till_sat) {
	    $wkends = 1;
	    $dd -= $days_till_sat;
	    $wkends += int($dd / 7);
	    $gap -= ($wkends * 2);
	}
    }
    return $gap;
}

sub usage
{
    die <<"EndUsage";
usage: magicget [-b] data_source_identifier

magicget -- A program for grabbing data from www.economagic.com and
            compiling it into a gretl database.
Options: 
     -h  Help -- just display this message and quit.
     -b  Run in batch mode -- do not prompt for proceeding to make
         database after retrieving list of variable names.
     -v  Verbose operation.
     -t  Test mode: print out raw and parsed data to .tst file
     -d  Include daily data series (if any) in database

The "data_source_identifier" (e.g. fedbog, ecb) is the abbreviation used
by economagic for a given data source.

EndUsage
}

# Start of main program

# Get command-line options (b for batch, h for help, etc.);

if (!(&Getopts('bvhtd')) || $opt_h) { &usage; }
if ($opt_b) { $interactive = 0; }
if ($opt_v) { $verbose = 1; }
if ($opt_t) { $tester = 1; }
if ($opt_d) { $getdaily = 1; }

if (@ARGV == 0) { &usage; }
my $source = $ARGV[0];

open (BIN, ">$source.bin") || die "Can't open $source.bin";
open (IDX, ">$source.idx") || die "Can't open $source.idx";

if ($tester) {
    open (TST, ">$source.tst") || die "Can't open $source.tst";
}

$remote = "www.economagic.com";
$port = "80";

# First pass: get series names

print "Trying to connect to $remote...\n";

$title = "http://" . $remote . "/" . $source . ".htm";
print "$title\n";
$ua = LWP::UserAgent->new;
$request = HTTP::Request->new('GET', $title);
$response = $ua->request($request);

if ($response->is_success) {
    foreach (split(/\n/, $response->content)) {
	if (/Server Error/) {
	    print;
	    die "Can't get info from server";
	}
	chomp;
	if (/<TITLE>/) {
	    $i = index($_, "<TITLE>") + 26;
	    $line = substr($_, $i, rindex($_, "</TITLE>") - $i);
	    print "$line\n";
	    print IDX "# $line\n";
	}
	if (/data.exe/) {
	    $i = index($_, "href=") + 6;
	    $url = substr($_, $i);
	    $url = substr($url, 0, index($url, "\>") - 1);
	    $varname = substr($url, rindex($url, "/", 18) + 1);
	    print "series $varname\n";
	    if (check_series_name($varname)) {
		print STDERR "Got bad series name, $varname\n";
	    } else {
		if (series_is_daily($varname)) {
		    print STDERR "$varname: seems to be daily series\n";
		    $daily[$nseries] = 1;
		} else {
		    $daily[$nseries] = 0;
		}
		$sernames[$nseries] = $varname;
		# $dir = $url;
		$nseries++;
	    }
	}
    }
} else {
    print "Couldn't get document\n";
    exit;
}

if ($nseries == 0) {
    print "$source: got no good series names\n";
    exit;
} else {
    print "$source: found $nseries series names\n"; 
}

if ($interactive) {
    print "Proceed to download data and make database? (Y/n) ";
    $response = <STDIN>;
    if ($response =~ /n/) {
	print "OK, exiting\n";
	exit;
    }
}

# Now get data for the series we want:

$iaddr = inet_aton($remote) or die "Error: $!";
$paddr = sockaddr_in($port, $iaddr) or die "Error: $!";
$proto = getprotobyname('tcp') or die "Error: $!";

for ($s = 0; $s < $nseries; $s++) {
    if ($daily[$s]) { 
	push(@business, business_days($sernames[$s]));
    } else {
	push(@business, 0);
    }
}

for ($s = 0; $s < $nseries; $s++) {

    # if ($sernames[$s] !~ /day-tcm3y/) { next; }

    if (($daily[$s] || $business[$s]) && $getdaily == 0) {
	next;
    }

    if ($source eq "sp") {
	$grab = "sp/" . "$sernames[$s]";
    } else {
	$grab = "$sernames[$s]";
    }

    $msg = "GET /em-cgi/data.exe/$grab \n\n";
    if ($verbose) {
	print STDERR "Trying to get /em-cgi/data.exe/$grab\n";
    }
    socket(SOCK, PF_INET, SOCK_STREAM, $proto) or die "Error: $!";
    connect(SOCK, $paddr) or die "Error: $!";
    send(SOCK, $msg, 0) or die "Cannot send query: $!";

    @yr = ();
    @subper = ();
    @subsubper = ();
    @obs = ();
    $n = 0;
    $pd = 1;
    $title = "";
    my $baddate;
    
    if ($daily[$s]) { $pd = 7; } 
    if ($business[$s]) { $pd = 5; }

    while (<SOCK>) {
	if (/Server Error/) {
	    print;
	    die "Can't get info from server";
	}
	chomp;

	if (/Series Title:/) {
	    $title = <SOCK>;
	    chop($title);
	    if ($verbose) { print STDERR "title: $title\n"; }
	    if ($title =~ /Weekly/) { last; }
	    next;
	}

	if (/^ 19/ || /^ 20/) {
	    s/\r//;
	    if ($tester) { print TST "$_\n"; }
	    # remove economagic obfuscation
	    s#>[^<]*</#><#g;
	    s#<[^>]*># #g;
	    if ($tester) { print TST "$_\n"; }
	    @fields = split(/ +/);
	    push(@yr, $fields[1]);
	    push(@subper, $fields[2]);
	    if ($daily[$s]) {
		push(@subsubper, $fields[3]);
		push(@obs, $fields[4]);
	    } else {
		push(@obs, $fields[3]);
		$pdcheck = $fields[2];
		$pdcheck =~ s/^0//;
		if ($pdcheck > $pd) {
		    $pd = $pdcheck;
		}
	    }
	    # check that dates make sense 
	    if ($n > 14 && $pd != 4 && $pd != 12 && $pd != 1 && $pd != 5 && $pd != 7) { 
		print STDERR "Unrecognized frequency $pd for $sernames[$s]\n"; 
		print STDERR "Skipping this series\n";
		$pd = -1;
		last;
	    } else {
	        print STDERR "Frequency of series = $pd\n";
	    }
	    $n++;
	}
    }

    if ($pd != 4 && $pd != 12 && $pd != 1 && $pd != 5 && $pd != 7) { 
	print STDERR "Unrecognized frequency $pd for $sernames[$s]\n"; 
	print STDERR "Skipping this series\n";
	$pd = -1;
    }
    
    if ($pd == -1) { next; }
    if ($n == 0) {
	print "Got no observations on this series\n";
	next;
    }

    # print series name and description
    print "Processing $sernames[$s]...\n";
    $title =~ s/\r//;
    $title =~ s/Billions of/Bn./g;
    $title =~ s/dollars/\$/ig;
    $title =~ s/chained/ch./ig;
    $title =~ s/Government/Gov./g;
    $title =~ s/Nonfinancial/Nonfin./g;
    $title =~ s/Market/Mkt./;
    $title =~ s/ the / /;
    $varname = $sernames[$s];
    $varname = substr($varname, index($varname, "/") + 1);
    $varname =~ s/\+.*//;
    $varname =~ s/-/_/g;
    $varname =~ s/day_/d/;
    print IDX "$varname  $title\n";

    # print the dates/obs line
    if ($pd == 1) {
	print IDX "A  $yr[0] - $yr[$n-1]  n = $n\n";
    } elsif ($pd == 4 || $pd == 12) {
	if ($pd == 4) {
	    $pdstr = "Q";
	    $subper[0] =~ s/0//;
	    $subper[$n-1] =~ s/0//;
	} elsif ($pd == 12) {
	    $pdstr = "M";
	}
	print IDX "$pdstr  $yr[0].$subper[0] - $yr[$n-1].$subper[$n-1]  n = $n\n";
    } 

    # print the data values
    $full_n = $n;
    for ($i = 1; $i <= $n; $i++) {
	$baddate = 0;
	if (($pd == 5 || $pd == 7 ) && $i > 1) { # special treatment for daily data
	    my $dd = Delta_Days($yr[$i-2], $subper[$i-2], $subsubper[$i-2],
				$yr[$i-1], $subper[$i-1], $subsubper[$i-1]);
	    my $dow = Day_of_Week($yr[$i-1], $subper[$i-1], $subsubper[$i-1]);
	    my $dd_exp = ($dow == 1 && $pd == 5)? 3 : 1;
	    my $gap = 0;

	    if ($dd > $dd_exp) {
		if ($pd == 5) {
		    $gap = get_gap($dd, $dd_exp,
				   $yr[$i-2], $subper[$i-2], $subsubper[$i-2]);
		} else {
		    $gap = $dd - $dd_exp;
		}
	    } 

	    if ($pd == 5 && ($dow == 6 || $dow == 7)) {
		print STDERR "\nWarning: $varname: found data at bad date ";
		print STDERR "$yr[$i-1]/$subper[$i-1]/$subsubper[$i-1]\n";
		$baddate = 1;
		$full_n--;
	    }

	    # print STDERR "dd=$dd, dow=$dow, dd_exp=$dd_exp, gap=$gap\n";

	    for (my $t = 0; $t < $gap; $t++) {
		if ($verbose) {	print STDERR "NA\n"; }
		print BIN pack("f", "-999.0");
		$full_n++;
	    }
	}
	if ($baddate) { next; }
	chomp ($obs[$i-1]);
	if ($verbose) { 
	    if ($pd == 5 || $pd == 7) {
		print STDERR "$yr[$i-1]/$subper[$i-1]/$subsubper[$i-1]: $obs[$i-1]\n";
	    } else {
		print STDERR "$yr[$i-1]:$subper[$i-1]  $obs[$i-1]\n";
	    }
	}
	print BIN pack("f", $obs[$i-1]);
	if ($verbose && !($i % $pd)) { print STDERR "\n"; }
    }

    # daily data: print dates/obs once we know how many calendar days there are
    if ($pd == 5 || $pd == 7) {
	printf IDX "%s  $yr[0]/$subper[0]/$subsubper[0] - ", ($pd == 5)? "D" : "B";
	print IDX "$yr[$n-1]/$subper[$n-1]/$subsubper[$n-1]  n = $full_n\n";
    }


    if ($verbose) { print STDERR "\n"; }
    close (SOCK);
}

close (BIN);
close (IDX);
if ($tester) {
    close (TST);
}











