#!/usr/bin/perl

# Copyright (c) 2004 by Charles A. Loomis, Jr. and Le Centre National de
# la Recherche Scientifique (CNRS).  All rights reserved.

# The software was distributed with and is covered by the European
# DataGrid License.  A copy of this license can be found in the included
# LICENSE file.  This license can also be obtained from
# http://www.eu-datagrid.org/license.

use strict;

use Getopt::Long;
use LWP::UserAgent;
use HTML::Parser;

# Setup of the options.
my %options = ();
&GetOptions(\%options, "help", "name=s", "owner=s", "url=s");

# Print help if necessary. 
if ($options{help}) {
    usage();
    exit;
}

# Check that options are defined. 
if (!defined($options{name})) {
    print "Must supply value for --name option.\n";
    usage();
    exit(1);
}
if (!defined($options{owner})) {
    print "Must supply value for --owner option.\n";
    usage();
    exit(1);
}
if (!defined($options{url})) {
    print "Must supply value for --url option.\n";
    usage();
    exit(1);
}


# Global to hold all of the entries for the rpms.
my @rpms;

# Fetch the HTML at the given URL.
my $ua = LWP::UserAgent->new;
my $request = HTTP::Request->new(GET => $options{url});
my $response = $ua->request($request);
if (! $response->is_success) {
    print $response->status_line . "\n";
    exit(1);
}

# Create the HTML parser and setup to process only the start elements
# of anchors.
my $parser = HTML::Parser->new(api_version => 3);
die "Can't create HTML parser.\n" unless ($parser);
$parser->handler(start => \&startHandler, "tagname,attr");
my @tags = ('a');
$parser->report_tags(@tags);

# Parse the file. 
$parser->parse($response->content) || die $!;

# Collect information. 
my $contents = join(",\n",@rpms);
my $date = localtime();

# Actually print out the file. 
open OUT, ">repository_$options{name}.tpl";
print OUT << "EOF";
#
# PAN repository template generated by html2pan.pl on $date.
#
 
structure template repository_$options{name};

"name" = "$options{name}";
"owner" = "$options{owner}";
"protocols" = list(
nlist("name","http",
      "url","$options{url}")
);

"contents" = nlist(
$contents
);

EOF
; 
close OUT;

exit;

# This handles the start for HTML anchors and pulls out the value of
# the href attribute.  A line is added for each rpm found. 
sub startHandler {

    my ($name, $href) = @_;

    my %hash = %$href;

    my $href = $hash{'href'};

    if ($href =~ /\s*(.+)-((?:[^-]+)-(?:[^-]+))\.([^\.]+)\.rpm\s*$/) {
        my $name = $1;
        my $version = $2;
        my $arch = $3;
        my $tag = "$1-$2-$3";

        push @rpms, "escape(\"$tag\"),nlist(\"name\",\"$name\",\"version\",\"$version\",\"arch\",\"$arch\")";
    }

}

# Print the usage instructions for this script.
sub usage {

    print <<'EOF'

html2pan.pl [--help] --name=repname \
                     --owner=owner_email 
                     --url http://location/dir

This command retrieve the html file from the given URL and parses any
embedded anchors for references to RPMs.  The script produces a valid
repository PAN template for use with quattor. 

--help       print out this usage statement

--name       name of the repository; used for the template name. This
             should be a lowercase string with no whitespace. 

--owner      The email address of the owner of the repository. 

--url        The http address to use to generate the template. 

All options but the 'help' option are required.

EOF
;

}
