#!/usr/bin/perl

# Copyright (c) 2004 by Charles A. Loomis, Jr. and Le Centre National de
# la Recherche Scientifique (CNRS).  All rights reserved.

# The software was distributed with and is covered by the European
# DataGrid License.  A copy of this license can be found in the included
# LICENSE file.  This license can also be obtained from
# http://www.eu-datagrid.org/license.

use strict;

use Getopt::Long;
use LWP::UserAgent;
use HTML::Parser;

# Setup of the options.
my %options = ();
&GetOptions(\%options, "help", "name=s", "url=s");

# Print help if necessary. 
if ($options{help}) {
    usage();
    exit;
}

# Check that options are defined. 
if (!defined($options{name})) {
    print "Must supply value for --name option.\n";
    usage();
    exit(1);
}
if (!defined($options{url})) {
    print "Must supply value for --url option.\n";
    usage();
    exit(1);
}


# Global to hold all of the entries for the rpms.
my @rpms;

# Fetch the HTML at the given URL.
my $ua = LWP::UserAgent->new;
my $request = HTTP::Request->new(GET => $options{url});
my $response = $ua->request($request);
if (! $response->is_success) {
    print $response->status_line . "\n";
    exit(1);
}

# Create the HTML parser and setup to process only the start elements
# of anchors.
my $parser = HTML::Parser->new(api_version => 3);
die "Can't create HTML parser.\n" unless ($parser);
$parser->handler(start => \&startHandler, "tagname,attr");
my @tags = ('a');
$parser->report_tags(@tags);

# Parse the file. 
$parser->parse($response->content) || die $!;

# Collect information. 
my $contents = join("\n",@rpms);
my $date = localtime();

# Actually print out the file. 
open OUT, ">pro_$options{name}_updates.tpl";
print OUT << "EOF";
#
# PAN updates template generated by html2updates.pl on $date.
#
 
template pro_$options{name}_updates;

$contents

EOF
; 
close OUT;

exit;

# This handles the start for HTML anchors and pulls out the value of
# the href attribute.  A line is added for each rpm found. 
sub startHandler {

    my ($name, $href) = @_;

    my %hash = %$href;

    my $href = $hash{'href'};

    if ($href =~ /\s*(.+)-((?:[^-]+)-(?:[^-]+))\.([^\.]+)\.rpm\s*$/) {
        my $name = $1;
        my $version = $2;
        my $arch = $3;
        my $tag = "$1-$2-$3";

	# Treat the kernel specially and allow multiple installations.
	if (($name =~ /^kernel/) && !($name =~ /^kernel-utils/)) {
	    push @rpms, "\"/software/packages\"=pkg_add(\"$name\",\"$version\",\"$arch\",\"true\");";
	} else {
	    push @rpms, "\"/software/packages\"=pkg_ronly(\"$name\",\"$version\",\"$arch\");";
	}
    }

}

# Print the usage instructions for this script.
sub usage {

    print <<'EOF'

html2updates.pl [--help] --name=osname \
                         --url http://location/dir

This command retrieve the html file from the given URL and parses any
embedded anchors for references to RPMs.  The script produces a valid
PAN template for use with quattor which contains pkg_ronly calls for
each rpm.  This is appropriate for an "updates" template for an OS.

NOTE: Kernel packages are treated specially.  They are added with a
pkg_add with the "multi" option specified.  You may want to edit the 
resulting template to reduce the number of redundant kernel versions.

--help       print out this usage statement

--name       name of the OS; used for the template name. This will
             create a template named pro_<OS>_updates.

--url        The http address to use to generate the template. 

All options but the 'help' option are required.

EOF
;

}
