#!/usr/bin/perl

# 
# Yahoo!NEWS into PortaBase & TTextReader.
# Version : 0.0.3+20040123
# 
# It requires perl, nkf, and portabase.
# It fetchs web page with "wget", so maybe environment 
# variable 'HTTP_PROXY' is affective.
# 
# written by Makio Tsukamoto <walrus@digit.que.ne.jp>
# TTextReader support written by Kentaro Shirakata <argrath@ub32.org>
# this script is under Artistic License as perl is.
#
# 0.0.3+20040123 (2004/01/23)
#  * Add TTextReader support
# 
# 0.0.3 (2003/04/10)
#  * bug fix. sorry, 0.0.2 was not executable.
# 
# 0.0.2 (2003/04/08)
#  * command line option added and is set default
#  * configuration item 'limit_news' added
# 
# 0.0.1 (2003/04/07)
#  * first version
# 

# default settings, you can specify these in config file.
my %config = (
	'top_url'    => '',
	'last_url'   => '',
	'limit_days' => 0,
	'limit_news' => 0,
	'data_dir'   => '',
	'temp_file'  => 'YahooNEWS.temp',
	'csv_file'   => 'YahooNEWS.csv',
	'pob_file'   => '',
);

# executable path
my $WGET        = '/usr/bin/wget';
my $NKF         = '/home/QtPalmtop/bin/nkf';
my $PORTABASE   = '/home/QtPalmtop/bin/portabase';

# global variables
my @news_urls   = ();
my @records     = ();
my @index_urls  = ();
my %path;
my %url;
my %cmd = (
#	'fetch'  => 'curl -s "__URL__" -o "__FILE__"',
	'fetch'  => qq($WGET -o /dev/null -O "__FILE__" "__URL__"),
	'utf8'   => qq($NKF -w -i "__FROM__" > "__TO__"),
	'sjis'   => qq($NKF -s -i "__FROM__" > "__TO__"),
	'import' => qq($PORTABASE fromcsv "__CSV__" "__POB__"),
);

my ($outputproc) = \&output_portabase;

# commandline option
&end_usage() if ($ARGV[0] eq '-h' or $ARGV[0] eq '--help');
$path{'conf_file'} = shift(@ARGV) if (@ARGV and substr($ARGV[0], 0, 1) ne '-');
my %opt            = @ARGV;

# main process
&main;

sub main {
	&initialize;
	print "[LAST] $config{'last_url'}\n";
	# get index urls
	my @urls = &get_index_urls($url{'top'});
	@index_urls = @urls;
	@index_urls = splice(@index_urls, 0, $config{'limit_days'}) if ($config{'limit_days'});
	# get news urls
	while (my $index_url = shift(@index_urls)) {
		my ($next_url, @urls) = &get_news_urls($index_url);
		unshift(@index_urls, $next_url) if ($next_url);
		foreach my $url (@urls) {
			next unless ($url);
			if ($config{'last_url'} and $url eq $config{'last_url'}) {
				splice(@index_urls);
				last;
			}
			push(@news_urls, $url);
		}
	}
	@news_urls = splice(@news_urls, -($config{'limit_news'})) if ($config{'limit_news'});
	&$outputproc(@news_urls);
	&finish_script(1);
}

sub output_portabase {
	open(OUT, ">$path{'csv'}");
	foreach my $news_url (reverse(@news_urls)) {
		my $news = &get_news($news_url);
		my @fields = map {$news->{$_}} qw(title date time description link read);
		my $csv_data = join ',', map {(s/"/""/g or /[\r\n,]/) ? qq("$_") : $_} @fields; #"
		print OUT $csv_data,"\n";
		$config{'last_url'} = $news_url;
	}
	close OUT;
	# convert and inport
	&import;
}

sub output_ttextreader {
	open(OUT, ">$path{'csv'}");
	print OUT ".YahooNEWS\n";
	foreach my $news_url (reverse(@news_urls)) {
		my $news = &get_news($news_url);
	    $news->{'title'} = '..' . $news->{'title'};
	    $news->{'time'} .= "\n";
		my @fields = map {$news->{$_}} qw(title date time description link read);
		my $csv_data = join ',', map {(s/"/""/g or /[\r\n,]/) ? qq("$_") : $_} @fields; #"
		print OUT $csv_data,"\n";
		$config{'last_url'} = $news_url;
	}
	close OUT;
}

sub initialize {
# minimam condition : 
#  * data directory accessible.
#  * url of category top page is specified
#  * .pob file is exist
	# define config file path
	# (1) file specfied with first argument.
	# (2) file specfied with arguments "-conf_file FILE"
	# (3) script name which replaced ext with 'cfg'
	$path{'conf_file'} = $opt{'-conf_file'} if ($opt{'-conf_file'});
	if (not $path{'conf_file'}) {
		$path{'conf_file'} = $0;
		$path{'conf_file'} =~ s/(\.[^\.\/\\]+)?$/.cfg/i;
	}
	# read configuration
	if (-f $path{'conf_file'}) {
		if (open(IN, $path{'conf_file'})) {
			print "[READ CONFIG] $path{'conf_file'}\n";
			while (<IN>) {
				next if (/^\s*#/);
				my ($key, $val) = split(/\s*=\s*/, $_, 2);
				next unless (length($key) and length($val));
				$val =~ s/\s+$//;
				$val =~ s/__(.+?)__/$ENV{$1}/g;
				$config{$key} = $val;
			}
			close IN;
		} else {
			&end_usage(qq(Can't read configuration file "$path{'conf_file'}".)); #'
		}
	} else {
		$path{'conf_file'} = undef;
	}
	# define data directory
	# (1) directory specfied with arguments "-data_dir DIRECTORY"
	# (2) directory specfied in configuration file
	# (3) current directory
	if    ($opt{'-data_dir'})   { $path{'data_dir'} = $opt{'-data_dir'}; }
	elsif ($config{'data_dir'}) { $path{'data_dir'} = $config{'data_dir'}; }
	else {
		$path{'data_dir'} = $0;
		$path{'data_dir'} =~ s/\/[^\/]+$//;
	}
	unless (-d $path{'data_dir'} or mkdir $path{'data_dir'}, 0777) {
		&end_usage(qq(Can't open data diretory "$path{'data_dir'}".)); #'
	}
	$path{'temp'} = "$path{'data_dir'}/$config{'temp_file'}";
	$path{'csv'}  = "$path{'data_dir'}/$config{'csv_file'}";
	# define pob file path
	# (1) file in data directory which filename specfied with arguments "-pob_file FILENAME"
	# (2) file in data directory which filename specfied in configuration file
	# (3) script name which replaced ext with 'cfg'
	if    ($opt{'-pob_file'})   { $path{'pob'} = $opt{'-pob_file'}; }
	elsif ($config{'pob_file'}) { $path{'pob'} = "$path{'data_dir'}/$config{'pob_file'}"; }
	else {
		$path{'pob_file'} = $0;
		$path{'pob_file'} =~ s/(\.[^\\\/\.]+)?$/.pob/i;
	}
	unless (-f $path{'pob'})    { &end_usage(qq(PortaBase file "$path{'pob'}" does not exist.)); }
	# define top page url
	# (1) url specfied with arguments "-top_url URL"
	# (2) url specfied in configuration file
	$url{'top'} = ($opt{'-top_url'}) ? $opt{'-top_url'} : $config{'top_url'};
	unless ($url{'top'}) { &end_usage(qq(URL of category top page does not specified.)); }
	# define last page url
	# (1) url specfied with arguments "-last_url URL"
	# (2) url specfied in configuration file
	$url{'last'} = ($opt{'-last_url'}) ? $opt{'-last_url'} : $config{'last_url'};
	if($config{'format'} eq 'text'){$outputproc = \&output_ttextreader;}
	return;
}

sub finish_script {
	my ($result, $message) = @_;
	print $message if ($message);
	exit $result unless ($path{'conf_file'});
	exit $result unless (open(OUT, ">$path{'temp'}"));
	print "[WRITE CONFIG] $path{'conf_file'}\n";
	if (open(IN, "$path{'conf_file'}")) {
		while (my $line = <IN>) {
			if ($line =~ /^\s*#/) { print OUT $line; }
			elsif ($line !~ /=/) { print OUT $line; }
			else {
				my ($key, $val) = split(/\s*=\s*/, $line, 2);
				print OUT "$key\t= $config{$key}\n";
			}
		}
		close IN;
	} else {
		print OUT <<EOF;
# Site Information
top_url    = $config{'top_url'}
last_url   = $config{'last_url'}
limit_days = $config{'limit_days'}
limit_news = $config{'limit_news'}

# File and Directory Information
data_dir   = $config{'data_dir'}
temp_file  = $config{'temp_file'}
csv_file   = $config{'csv_file'}
pob_file   = $config{'pob_file'}
EOF
	}
	close OUT;
	rename $path{'temp'}, $path{'conf_file'};
}

sub end_usage {
	my ($error) = @_;
	print "error : $error\n" if ($error);
	print <<EOF;
usage : YahooNEWS.pl [CONFIG_FILE | -conf_file CONFIG_FILE]
                     [-data_dir DATA_DIRECTORY]
                     [-pob_file POB_FILE]
                     [-top_url TOP_URL]
                     [-last_url LAST_URL]
EOF
	exit(0);
}

sub fetch {
	my ($url, $file) = @_;
	$file            = $path{'temp'} unless ($file);
	my $fetch_cmd    = $cmd{'fetch'};
	$fetch_cmd       =~ s/__URL__/$url/g;
	$fetch_cmd       =~ s/__FILE__/$file/g;
	system($fetch_cmd);
}

sub import {
	# convert to utf-8
	my $utf8_cmd = $cmd{'utf8'};
	$utf8_cmd =~ s/__FROM__/$path{'csv'}/g;
	$utf8_cmd =~ s/__TO__/$path{'temp'}/g;
	print "[CONVERT] $utf8_cmd\n";
	system($utf8_cmd);
	rename $path{'temp'}, $path{'csv'};
	# import
	my $import_cmd = $cmd{'import'};
	$import_cmd =~ s/__CSV__/$path{'csv'}/g;
	$import_cmd =~ s/__POB__/$path{'pob'}/g;
	print "[import] $import_cmd\n";
	system($import_cmd);
}

sub get_index_urls {
	print "[TOP] $url{'top'}";
	# fetch category top page
	&fetch($url{'top'}, $path{'temp'});
	open(IN, $path{'temp'});
	my $html = join('', <IN>);
	close(IN);
	unlink $path{'temp'};
	# parse html
	my $form_url = 'http://headlines.yahoo.co.jp/hl';
	my %opt;
	$html = &trim($html, qq(<form action="$form_url" method="get">), '</form>');
	unless ($html) {
		print " -> (void)\n";
		return 
	}
	$html =~ s/\s+/ /g;
	$opt{$1} = $2 while ($html =~ s/<input type="hidden" name="(.+?)" value="(.+?)">//);
	my @days = ($html =~ /<option value="(.+?)"[^<>]*>/g);
	my @opts = map { $_ = "$_=$opt{$_}"; } keys(%opt);
	$form_url .= '?';
	$form_url .= join('&', @opts, '') if (@opts);
	my @urls = map { $_ = $form_url."d=$_"; } @days;
	print " -> (".scalar(@urls).")\n";
	return @urls;
}

sub get_news_urls {
	my ($index_url) = @_;
	print "[INDEX] $index_url";
	# fetch index page
	&fetch($index_url, $path{'temp'});
	open(IN, $path{'temp'});
	my $html = join('', <IN>);
	close(IN);
	unlink $path{'temp'};
	# parse html
	$html =~ s/\s+/ /g;
	my $next_url = $1 if ($html =~ /<a href="([^"]+)">次のページ<\/a>/); #"
	$html = &trim($html, '<ul>', '</ul>');
	my @urls = ($html =~ /<a href="(.+?)">/g);
	print " -> (".scalar(@urls).", $next_url)\n";
	return $next_url, @urls;
}

sub get_news {
	my ($news_url) = @_;
	print "[NEWS] $news_url\n";
	# fetch news page
	&fetch($news_url, $path{'temp'});
	open(IN, $path{'temp'});
	my $html = join('', <IN>);
	close(IN);
	unlink $path{'temp'};
	# parse html
	$html = &trim($html, '<font size=5>', '</div>');
	return unless ($html);
	$html =~ s/\n//g;
	my $title   = &unescape_html($1) if ($html =~ s/<font size=5><b>(.+?)<\/b><\/font><br><br>//);
	my ($date, $time, $dc_date);
	if ($news_url =~ /(2\d{3})(\d{2})(\d{2})/) {
		$date    = sprintf('%04d-%02d-%02d', $1, $2, $3);
		$dc_date = sprintf('%04d%02d%02d', $1, $2, $3);
	}
	if ($html =~ s/<div align=right>.+?(\d+)時(\d+)分.+?<\/div>//) {
		$dc_date .= sprintf('T%02d:%02d', $1, $2);
		$time     = sprintf('%02d%02d', $1, $2);
	}
	my $desc  = &unescape_html($html);
	my $news  = {
		'link'        => $news_url,
		'title'       => $title,
		'dc:date'     => $dc_date,
		'date'        => $date,
		'time'        => $time,
		'description' => $desc,
		'read'        => 0,
	};
	return $news;
}

sub trim {
	my ($text, $from, $to) = @_;
	if ($from) {
		my $pos = index($text, $from);
		return undef if ($pos == -1);
		$text = substr($text, $pos);
	}
	if ($to) {
		my $pos = index($text, $to);
		return undef if ($pos == -1);
		$text = substr($text, 0, $pos + length($to));
	}
	return $text;
}

sub unescape_html {
	my ($html) = @_;
	my %unescaped = ('&lt;' => '>', '&gt;' => '<', '&amp;' => '&', '&quot;' => '"');
	my $escaped   = join('|', keys(%unescaped));
	$html =~ s/<a .*?href="(.+?)">(.+?)<\/a>/($2) ? $1 : "$2($1)"/ge;
	$html =~ s/\s+//g;
	$html =~ s/<p>/\n\n/g;
	$html =~ s/<br>/\n/g;
	$html =~ s/<.*?>//g;
	$html =~ s/($escaped)/$unescaped{$1}/g;
	return $html;
}

1;
