#$Id$
package WWW::Crawler::POE;

use strict;

use vars qw($VERSION @ISA);

use WWW::Crawler;
use WWW::Crawler::LWP;
use URI;
use HTTP::Request;
use Data::Dumper;
use Carp;

$VERSION="0.1";

sub DEBUG () {1}

use POE qw(Component::Client::HTTP Session);


###############################################################
sub spawn
{
    my($package, $alias, $crawler)=@_;

    
    POE::Session->new(
        __PACKAGE__, [qw(_start response 
                    one_loop
                    schedule_link next_link 
                    cannonical include seen 
                    fetch fetched parse extract_links 
                    error process)],
        [$crawler, $alias]
    );
}



###############################################################
sub _start
{
    my($kernel, $heap, $crawler, $alias)=@_[KERNEL, HEAP, ARG0, ARG1];

    DEBUG and warn __PACKAGE__, "::_start\n";
    $crawler||=WWW::Crawler::POE::Delegate->new($kernel, $alias);

    $kernel->alias_set($alias);

    $heap->{CRAWLER}=$crawler;
    $heap->{KERNEL}=$kernel;
    $heap->{ALIAS}=$alias;

    WWW::Crawler::POE::Transport->spawn();  # create a transport (in case)

    return;
}


###############################################################
sub one_loop
{
    my ($kernel, $heap) = @_[KERNEL, HEAP];

    my $page=$kernel->call($heap->{ALIAS}, 'next_link');
    $kernel->post($heap->{ALIAS}, 'fetch', $page) if $page;
    return $page;
}

###############################################################
sub schedule_link
{
    my ($kernel, $caller, $heap, $page) = 
                            @_[KERNEL, SENDER, HEAP, ARG0];

    $page={uri=>$page} if ref $page ne 'HASH';

    DEBUG and warn "Scheduling $page->{uri}\n";

    $heap->{CRAWLER}->_schedule_link($page);
    $kernel->post($heap->{ALIAS}, 'one_loop');          # maybe send out another request
    return;
}

###############################################################
sub next_link
{
    my($kernel, $heap)=@_[KERNEL, HEAP];

    if($kernel->call('crawler_transport', 'full')) {
        return '';
    }
    return $heap->{CRAWLER}->_next_link;
}

###############################################################
sub fetch
{
    my($kernel, $heap, $page) = @_[KERNEL, HEAP, ARG0];

    DEBUG and warn __PACKAGE__, " Fetching $page->{uri}\n";

    $page->{request}=HTTP::Request->new('GET', $page->{uri});

    $heap->{0+$page->{request}}=$page;

    DEBUG and warn "Posting request to crawler_transport for $page->{uri}\n";
    $kernel->post('crawler_transport', 'request', 'response', $page->{request});

    return;
}
    
###############################################################
## This is the other half of fetch
sub response
{
    my ($kernel, $heap, $ignore, $args) = 
            @_[KERNEL, HEAP, ARG0, ARG1];

    my $page=delete $heap->{0+$args->[0]};
    die "No page for ".$args->[0]->uri unless $page;

    my $response=$args->[1];
    DEBUG and warn "Received response (".$response->code.") from useragent for $page->{uri}\n";

    $kernel->post($heap->{ALIAS}, '_one_loop');  # maybe send out another request

    if($response->is_success) {                 # we are in luck
        $page->{header}=$response->header;
        $page->{document}=$response->content;
        $kernel->post($heap->{ALIAS}, 'fetched', $page);
    } 
    elsif($response->code == 302 or $response->code == 303) {
        # POE::Component::Client::HTTP doesn't follow HTTP redirections
        # so we do it "by hand"

        my $location=$response->header('Location');
        die "No Location in ", Dumper $response unless $location;

        if($location !~ m(https?://)) {     # partial redirect
            DEBUG and warn "************ Partial redirect : $location\n";
            my $uri=URI->new_abs($location, $page->{uri});
            $location=$uri->as_string;
        }

        DEBUG and warn "***** $page->{uri} redirected to $location\n";
        $page->{original_uri}||=$page->{uri};

        $page->{uri}=$location;
        $kernel->post($heap->{ALIAS}, 'schedule_link', $page);
    } 
    else {                                      # something bad
        $kernel->post($heap->{ALIAS}, 'error', $page, $response);
    }
}

###############################################################
sub fetched
{
    my($kernel, $heap, $page)=@_[KERNEL, HEAP, ARG0];

    if($page->{robot_host}) {           # it's in fact a robots.txt
                                        # so we have to delegate it right away
        $heap->{CRAWLER}->fetched($page);
        return;
    }

    $page->{parsed}=$kernel->call($heap->{ALIAS}, 'parse', $page);
    $kernel->post($heap->{ALIAS}, 'process', $page);
    $kernel->post($heap->{ALIAS}, 'seen', $page);

    # TODO this loop should be unrolled
    foreach my $link ($kernel->call($heap->{ALIAS}, 'extract_links', $page)) {
        $kernel->post($heap->{ALIAS}, 'schedule_link', $link);
    }
}


###############################################################
sub _mk_delegate
{
    my($type, $event)=@_;
    return sub {
        my($heap)=$_[HEAP];
        DEBUG and warn __PACKAGE__, "/$event\n";
        $heap->{CRAWLER}->can("_$event")->($heap->{CRAWLER}, @_[ARG0..$#_]);
    };
}

*cannonical=    _mk_delegate('call', 'cannonical');
*include=       _mk_delegate('call', 'include');
*seen=          _mk_delegate('post', 'seen');

*extract_links= _mk_delegate('call', 'extract_links');


*parse=         _mk_delegate('call', 'parse');
*process=       _mk_delegate('post', 'process');
*error=         _mk_delegate('post', 'error');




















###########################################################################
## WWW::Crawler::POE::Transport;
## We can't derive from POE::Component::Client::HTTP, so we need this 
## helper session.  It encapsulates the QUEUE size and stuff.

package WWW::Crawler::POE::Transport;
use strict;

use vars qw(@ISA);
use POE qw(Component::Client::HTTP);
use Carp;

sub DEBUG {1}

sub spawn
{
    my($package, $queue, $name)=@_;

    # there can be only one
    if($poe_kernel->alias_resolve('crawler_transport')) {
        DEBUG and warn "There can be only one\n";
        return;
    }

    unless($name) {
        $name=$package;
        $name=~s/[_\W]+/_/g;
    }

    my $alias='crawler_transport';
    $queue||=10;

    POE::Session->new(
        $package=> [qw(_start request response full)],
        [$alias, $queue]
    );
    
    POE::Component::Client::HTTP->spawn(
           Agent    => "$name/$WWW::Crawler::VERSION",
           Alias    => $alias.'_agent',
         );
}

###############################################################
sub _start 
{
    my($kernel, $heap, $alias, $queue)=@_[KERNEL, HEAP, ARG0, ARG1];

    DEBUG and warn "Maxqueue for $alias : $queue\n";
    $kernel->alias_set($alias);
    $heap->{MAXQUEUE}=$queue;
    $heap->{QUEUE}=0;
    $heap->{ALIAS}=$alias;
    $heap->{POSTBACK}={};
}

###############################################################
sub full
{
    my($heap)=$_[HEAP];
    DEBUG and warn "Queue is ", 
                   ($heap->{QUEUE} == $heap->{MAXQUEUE} ? '' : 
                    ($heap->{QUEUE} > $heap->{MAXQUEUE} ? 'over' : 'not ')),
                   "full.  QUEUE=$heap->{QUEUE} MAX=$heap->{MAXQUEUE}\n";

    return $heap->{QUEUE} >= $heap->{MAXQUEUE};
}

###############################################################
sub request
{
    my($kernel, $heap, $sender, $event, $request)=
                    @_[KERNEL, HEAP, SENDER, ARG0, ARG1];

    $heap->{QUEUE}++;
    $heap->{POSTBACK}{0+$request}=$sender->postback($event, "ignore me");

    DEBUG and warn "Requesting ...\n";
    $kernel->post($heap->{ALIAS}.'_agent', 'request', 'response', $request);
}

###############################################################
sub response
{
    my($kernel, $heap, $request_o, $response_o)=
                    @_[KERNEL, HEAP, ARG0, ARG1];

    DEBUG and warn "Responds ...\n";
    $heap->{QUEUE}--;
    my $pb=delete $heap->{POSTBACK}{0+$request_o->[0]};
    $pb->($request_o->[0], $response_o->[0]);
}



















###########################################################################
package WWW::Crawler::POE::Delegate;
use strict;

use vars qw(@ISA);

use WWW::Crawler;
use WWW::Crawler::LWP;
use WWW::Crawler::RobotRules;
use Carp;


# We want inheritance to be as follows 
# WWW::Crawler::POE::Delegate 
#   -> WWW::Crawler::RobotRules::ISA
#       -> WWW::Cralwer::LWP 
#           -> WWW::Cralwer
BEGIN {@WWW::Crawler::RobotRules::ISA=qw(WWW::Crawler::LWP);}

@ISA=qw(WWW::Crawler::LWP);

sub DEBUG () {1}


###############################################################
sub new
{
    my $package=shift @_;
    my $kernel=shift @_;
    my $alias=shift @_;

    croak "Kernel referenece required" unless $kernel;
    croak "Session alias required" unless $alias;
 
    my $self;
    {   # don't want to create a UserAgent
        local @WWW::Crawler::RobotRules::ISA=qw(WWW::Crawler);

        $self=$package->SUPER::new(@_);
    }
    $self->{ALIAS}=$alias;
    $self->{KERNEL}=$kernel;
    return $self;
}

###############################################################
# The following methods are for when the delegate wants to talk to
# the session (though it probably doesn't know it)

sub _mk_thingback {
    my($type, $event)=@_;
    return sub {
        my $self=shift;
        DEBUG and warn ref($self)."->$event\n";
        return $self->{KERNEL}->can($type)->($self->{KERNEL}, $self->{ALIAS}, $event, @_);
    }
}

*cannonical=    _mk_thingback('call', 'cannonical');
*include=       _mk_thingback('call', 'include');
*seen=          _mk_thingback('post', 'seen');

*extract_links= _mk_thingback('call', 'extract_links');
*next_link=     _mk_thingback('call', 'next_link');
*schedule_link= _mk_thingback('post', 'schedule_link');


*one_loop=      _mk_thingback('post', 'one_loop');
*fetch=         _mk_thingback('post', 'fetch');
*fetched=       _mk_thingback('post', 'fetched');
*parse=         _mk_thingback('call', 'parse');
*process=       _mk_thingback('post', 'process');
*error=         _mk_thingback('post', 'error');

###############################################################
# The following methods are for when a session wants to talk to the delegate

*_cannonical=   sub {shift(@_)->SUPER::cannonical(@_)};
*_include=      sub {shift(@_)->SUPER::include(@_)};
*_seen=         sub {shift(@_)->SUPER::seen(@_)};

*_extract_links=sub {shift(@_)->SUPER::extract_links(@_)};
*_next_link=    sub {shift(@_)->SUPER::next_link(@_)};
*_schedule_link=sub {shift(@_)->SUPER::schedule_link(@_)};


*_one_loop=     sub {shift(@_)->SUPER::one_loop(@_)};
*_fetch=        sub {shift(@_)->SUPER::fetch(@_)};
*_fetched=      sub {shift(@_)->SUPER::fetched(@_)};
*_parse=        sub {shift(@_)->SUPER::parse(@_)};
*_process=      sub {shift(@_)->SUPER::process(@_)};
*_error=        sub {shift(@_)->SUPER::error(@_)};

1;

__END__

# Below is the stub of documentation for your module. You better edit it!

=head1 NAME

WWW::Crawler::POE - A web crawler session that delegates to WWW::Crawler::LWP wherever it makes sense

=head1 SYNOPSIS

=head1 DESCRIPTION

=head1 METHODS

=head2 error($self, $page, $response)

=head2 extract_links($self, $page)

=head2 fetch($self, $page)

=head2 new($package)


=head1 AUTHOR

Philip Gwyn <perl AT pied.nu>

=head1 SEE ALSO


WWW::Crawler,
LWP::UserAgent.

=cut

$Log$

