# $Id$
package WWW::Crawler::RobotRules;

use strict;

use vars qw($VERSION @ISA);

require WWW::Crawler;
use WWW::RobotRules;
use Data::Dumper;

$VERSION="0.1";
@ISA=qw(WWW::Crawler);

sub DEBUG () {0}


###############################################################
sub new
{
    my $package=shift @_;
    my $self=$package->SUPER::new(@_);
    $self->{RULES}={};
    $self->{PENDING}={};
    return $self;
}

###############################################################
sub include
{
    my($self, $page)=@_;

    my $u=URI::URL->new($page->{uri});
    my $scheme=$u->scheme();
    unless($scheme eq 'http' or $scheme eq 'https') {
        DEBUG and warn "Scheme isn't http or https\n";
        return $self->SUPER::include($page);
    }
    my $host||=lc $u->host();

    if($self->{RULES}{$host}) {
        if($self->{RULES}{$host}->allowed($page->{uri})) {
            DEBUG and warn "$page->{uri} is allowed by http://$host/robots.txt\n";
            return $self->SUPER::include($page->{uri});
        }
        DEBUG and warn "$page->{uri} is denied by http://$host/robots.txt\n";
        return;
    }

    # OK, we don't have a RobotRules, so we need to remember what we are 
    # fetching and then fetch() robots.txt
    # This way we piggyback on whatever fetch() method was defined previously
    # note that this by-passes include and cannonical, but i can live with that

    my $do_fetch=0;
    # only fetch robots.txt once
    unless(@{$self->{PENDING}{$host}||[]}) {
        DEBUG and warn "Fetching rules for $host\n";
        $self->{PENDING}{$host}=[];
        $do_fetch=1;
    }
    push @{$self->{PENDING}{$host}}, $page;

    # fetch must be done after we added to {PENDING}{$host}
    $self->fetch({uri=>"http://$host/robots.txt", robots_host=>$host})
        if $do_fetch;

    ## URI is denied until we have robots.txt (or an error on that fetch)
    return;
}

###############################################################
sub fetched
{
    my($self, $page)=@_;
    return $self->SUPER::fetched($page) unless $page->{robots_host};

    my $host=$page->{robots_host};
    DEBUG and warn "Parsing rules for $host\n";

    my $name=$self->{UA} ? $self->{UA}->agent() : ref($self);
    my $rules=WWW::RobotRules->new($name);
    eval {
        local $SIG{__WARN__}=sub {
            print "WOAH!";
            print STDERR $_[0];
            print STDERR "Parsing '$page->{document}'\n";
        };
        $rules->parse($page->{uri}, $page->{document});    
    };
    $self->{RULES}{$host}=$rules;
#    die Dumper $name, $page, $rules;

    $self->schedule_pending($host);
}

###############################################################
sub error
{
    my($self, $page, $response)=@_;
    return $self->SUPER::error($page) unless $page->{robots_host};

    my $host=$page->{robots_host};
    
#    if($resonse->code == 404) {     # 404 means no robots.txt, so we allow everything
        DEBUG and warn "Response for $page->{uri}: ".$response->code;
        $self->{RULES}{$host}=WWW::Crawler::RobotRules::Agreeable->new;
        return $self->schedule_pending($host);
#    }
        
}


###############################################################
sub schedule_pending
{
    my($self, $host)=@_;
    my @pending=@{delete $self->{PENDING}{$host}};
    die Dumper $self unless @pending;
    DEBUG and warn "Unblocking ".@pending." URIs for $host\n";
    foreach my $npage (@pending) {
        DEBUG and warn "    $npage->{uri}\n";
        $self->schedule_link($npage);
    }
}


###############################################################
sub next_link
{
    my($self)=@_;
    my $ret=$self->SUPER::next_link();
    return $ret if defined $ret;
    if(keys %{$self->{PENDING}}) {      # make sure we don't die out if
                                        # we have some pending things to do
        DEBUG and 
            warn "Don't die yet, we have URIs that are waiting for robots.txt\n";
        return '';
    }
    return;
}

###############################################################
## A fake robotrules that agrees to everything
package WWW::Crawler::RobotRules::Agreeable;
use strict;

sub new { return bless \'', shift @_; }
sub allowed {1};

1;

__END__
# Below is the stub of documentation for your module. You better edit it!

=head1 NAME

WWW::Crawler::RobotRules - Deals with robot rules (robots.txt) for WWW::Crawler

=head1 SYNOPSIS

    package My::Crawler;
    use WWW::Crawler::LWP;
    use WWW::Crawler::RobotRules;
    use vars qw(@ISA);

    # We want enheritance to be as follows 
    # WWW::Cralwer::RobotRules -> WWW::Cralwer::LWP -> WWW::Cralwer
    # so that fetched() is called properly.  YUCK!
    BEGIN {@WWW::Crawler::RobotRules::ISA=qw(WWW::Crawler::LWP);}

    @ISA=qw(WWW:Crawler::RobotRules);
    
    sub fetched 
    {
        my($self, $page)=@_;
        # only need this if we need to overload fetched()... which an
        # application rarely wants to anyway
        return $self->SUPER::fetched($page) if $page->{robots_host};
        # ....
    }
    sub error
    {
        my($self, $page, $error)=@_;
        warn "BOO HOO!  Can't fetch $page->{uri}\n";
        $self->SUPER::fetched($page);
    }

    sub include 
    {
        my($self, $page)=@_;
        return unless $::LINK eq lc substr($page->{uri}, 0, length($::LINK));
        return $self->SUPER::include($page);
    }

    package main;

    use vars qw($LINK);
    $LINK="http://www.yahoo.com/";

    my $crawler=My::Crawler->new();
    $crawler->schedule_link($LINK);
    $crawler->run();


=head1 DESCRIPTION

WWW::Crawler::Robot is a subclass of WWW::Crawler that adds robots.txt
processing to your crawler.  It "piggybacks" on your object hierarchy's
fetch()/fetched() functions, so you might have to play with various @ISA
variables to get inheritance just right.

=head1 METHODS

=head2 error($self, $page, $error)

If an error occured while fetching robots.txt, a bogus RobotRules
(WWW::Crawler::RobotRules::YesMan) is defined that allows all URIs to be
fetched and all pending URIs for that host are then rescheduled.  If you
want better handling (like drop all URIs for a host if robots.txt is 500 or
off the air), you should overload this method.  NB: check for
$page->{checked_host} to make sure you are dealing with a fetched
robots.txt, not with a regular fetch.

=head2 fetched($self, $page)

Parses robots.txt with WWW::RobotRules, using $self->{UA}->agent() as the
name (if it can).  Then reschedules any pending URLs for a given page now
that we have have valid rules.

=head2 include($self, $uri)

If we have rules for a host, the URI is accepted.  If not, the URI is put in
a pending list and a subrequest is sent for the robots.txt of that host.

=head2 new($package)

Default constructor requires no parameters.

Creates the following members:

=over 4

=item PENDING
    
Hashref of host=>arrayref of URIs on a given host.

=item RULES

Hashref of hosts=>objects that encapsulate a robots.txt for that host.  Note
that only allowed() is called on these objects, so you don't have to use
WWW::RobotRules.

=back

=head2 next_link

Makes sure we keep going if we have pending URIs.

=head2 schedule_pending($self, $host)

Reschedules all the pending URIs for $host.  Pending URIs are set in
include().  Generally called from fetched() or error().

=head1 AUTHOR

Philip Gwyn <perl@pied.nu>

=head1 SEE ALSO

WWW::Crawler, 
perl(1).

=cut

$Log$
