#$Id$
package WWW::Crawler::Parallel;

use strict;

use vars qw($VERSION @ISA);

use WWW::Crawler;
use WWW::Crawler::LWP;
use URI;
use HTTP::Request;
use POE;
use Data::Dumper;
use Carp;

# Release number of this module, and the class whose methods we inherit.
$VERSION = '0.1';
@ISA     = ('WWW::Crawler::LWP');

# Compile-time switch for the diagnostic warn()s in this package (0 = quiet).
use constant DEBUG => 0;

###############################################################
# Constructor:  WWW::Crawler::Parallel->new($queue, @args)
#
#   $queue - maximum number of requests allowed in flight at once; any false
#            value (undef, 0, '') selects the default of 10.
#   @args  - passed through unchanged to the parent constructor.
#
# Also spawns the WWW::Crawler::Parallel::Session POE session, registered
# under the alias 'crawler', which carries the HTTP traffic for this object.
# Returns the new crawler object.
sub new
{
    my $package=shift @_;
    my $queue=shift @_;
    $queue||=10;
    # Diagnostic only: a library constructor should not write to STDERR
    # unless debugging has been switched on.
    DEBUG and warn "Maxqueue = $queue\n";

    my $self;
    {
        # With @ISA temporarily narrowed, SUPER::new resolves through
        # WWW::Crawler instead of WWW::Crawler::LWP:
        # we don't want to create a UserAgent.
        local @ISA=qw(WWW::Crawler);
        $self=$package->SUPER::new(@_);
    }

    $self->{ALIAS}='crawler';       # POE alias of our session
    $self->{MAXQUEUE}=$queue;       # ceiling checked by next_link()
    $self->{CALLBACKS}={};
    $self->{QUEUE}=0;               # requests currently in flight
    WWW::Crawler::Parallel::Session->spawn($self, $self->{ALIAS});

    return $self;
}


###############################################################
# Start the POE event loop by calling run() on the kernel.  Takes no
# arguments of its own; the value of the kernel's run() is passed back.
sub run
{
    return $poe_kernel->run();
}


###############################################################
# Begin fetching $page.  Nothing is downloaded here: a GET request for
# $page->{uri} is stored in $page->{request}, the in-flight counter is
# bumped, and the page is posted as a 'request' event to our POE session.
sub fetch
{
    my $self=shift;
    my $page=shift;

    DEBUG and warn __PACKAGE__, " Fetching $page->{uri}\n";

    my $request=HTTP::Request->new('GET', $page->{uri});
    $page->{request}=$request;

    $self->{QUEUE}+=1;              # one more in QUEUE
    return $poe_kernel->post($self->{ALIAS}, 'request', $page);
}
    
###############################################################
# This is the second half of fetch(), as it were
# Handle the reply to a request started by fetch().
#
#   $request  - the request that was answered (not used here)
#   $response - the reply; must provide is_success(), headers() and content()
#   $page     - the page record the request was issued for
#
# On success the headers and body are stored in $page->{header} and
# $page->{document} and the page goes to fetched(); otherwise the page and
# the response are handed to error().
sub response
{
    my($self, $request, $response, $page)=@_;

    $self->{QUEUE}--;           # one less in QUEUE
    $self->one_loop();          # maybe send out another request

    if($response->is_success) {
        # headers() returns the whole header set.  header() looks up a
        # single named field and must not be called without a field name.
        $page->{header}=$response->headers;
        $page->{document}=$response->content;
        $self->fetched($page);
    }
    else {
        $self->error($page, $response);
    }
}

###############################################################
# Post-download handling of $page: store the result of parse() in
# $page->{parsed}, run process() and seen() on the page, then pass every
# link returned by extract_links() to schedule_link().
sub fetched
{
    my $self=shift;
    my($page)=@_;

    $page->{parsed}=$self->parse($page);
    $self->process($page);
    $self->seen($page);

    DEBUG and warn "Extracting links from $page->{uri}...\n";
    my @links=$self->extract_links($page);
    foreach my $link (@links) {
        $self->schedule_link($link);
    }
    my $count=scalar @links;
    DEBUG and warn "$count links extracted...\n";
}


###############################################################
# Queue a link through the parent class, then give the crawler a chance to
# dispatch it immediately.  Returns what the parent returned when that is
# true; returns nothing (and skips one_loop()) when it is false.
sub schedule_link
{
    my($self, $link)=@_;

    # add to TODO list; a false answer means there is nothing to send
    my $page=$self->SUPER::schedule_link($link) or return;

    $self->one_loop();          # and send it on its way
    return $page;
}

###############################################################
# Return the next link to fetch, honouring the concurrency limit.
#
# While $self->{QUEUE} (requests in flight) has reached $self->{MAXQUEUE},
# an empty string is returned so that no further request is started;
# otherwise the decision is delegated to the parent class's next_link().
sub next_link
{
    my($self)=@_;
    # Diagnostic only; gated on DEBUG like the message below.
    DEBUG and warn "NEXT LINK size=$self->{QUEUE}, max=$self->{MAXQUEUE}\n";
    if($self->{QUEUE} >= $self->{MAXQUEUE}) {
        DEBUG and warn "Request queue is full...\n";
        return '';
    }

    return $self->SUPER::next_link;
}





#############################################################################
package WWW::Crawler::Parallel::Session;

use strict;

use vars qw($VERSION @ISA);

use POE qw(Component::Client::HTTP Session);

# @ISA=qw(WWW::Crawler::POE);

# Compile-time switch for this package's diagnostic warn()s.  The empty
# prototype makes it an inlinable constant, so "DEBUG and warn ..." is
# folded away when it is 0.  Set to 1 to trace session traffic.
sub DEBUG () {0}

###############################################################
# Create the POE machinery for one crawler.
#
#   $package - this class
#   $crawler - the crawler object; kept in the session heap by _start
#   $alias   - POE alias under which the session registers itself
#
# Starts a POE::Component::Client::HTTP aliased 'useragent' whose Agent
# string is the crawler's class name (runs of non-word characters replaced
# by '_') followed by "/" and $WWW::Crawler::VERSION, then creates the
# session that handles the _start, request and response events.
# Returns the new session.
sub spawn
{
    my($package, $crawler, $alias)=@_;

    my $name=ref $crawler;
    $name=~s/\W+/_/g;

    POE::Component::Client::HTTP->spawn(
           Agent    => "$name/$WWW::Crawler::VERSION",
           Alias    => 'useragent',
         );

    # create() is the supported constructor; the older POE::Session->new
    # interface is deprecated and absent from later POE releases.
    return POE::Session->create(
        package_states => [
            __PACKAGE__, [qw(_start request response)],
        ],
        args => [$crawler, $alias],
    );
}

###############################################################
# POE _start handler.  ARG0 is the crawler object and ARG1 the alias, as
# supplied by spawn(): register the alias with the kernel and keep the
# crawler in the heap for the other handlers.
sub _start
{
    my $kernel=$_[KERNEL];
    my $heap=$_[HEAP];
    my($crawler, $alias)=@_[ARG0, ARG1];

    DEBUG and warn __PACKAGE__, "->_start $alias\n";
    $kernel->alias_set($alias);
    $heap->{crawler}=$crawler;
    return;
}

###############################################################
# POE 'request' handler.  ARG0 is the page to fetch.  A GET request for
# $page->{uri} is created unless the page already carries one, the page is
# filed in the heap under the request's numeric address so response() can
# find it again, and the request is posted to the 'useragent' session with
# 'response' as the reply event.
sub request
{
    my $kernel=$_[KERNEL];
    my $heap=$_[HEAP];
    my $page=$_[ARG0];

    my $request=($page->{request}||=HTTP::Request->new('GET', $page->{uri}));
    $heap->{0+$request}=$page;

    DEBUG and warn "Posting request to useragent for $page->{uri}\n";
    $kernel->post('useragent', 'request', 'response', $request);

    return;
}

###############################################################
# POE 'response' handler, invoked by the 'useragent' session.
#
#   ARG0 - array ref whose first element is the request that was sent
#   ARG1 - array ref whose first element is the response received
#
# The page filed by request() is looked up again via the request's numeric
# address.  Redirects that carry a Location header are followed by hand;
# everything else (including a redirect without Location) is passed to the
# crawler's response() method.
sub response
{
    my ($kernel, $heap, $request_p, $response_p) = @_[KERNEL, HEAP, ARG0, ARG1];

    my $request=$request_p->[0];
    my $page=delete $heap->{0+$request};        # get the $page back

    my $response=$response_p->[0];
    DEBUG and warn "Recieved response (".$response->code.") from useragent for $page->{uri}\n";

    my $code=$response->code;
    my $is_redirect=grep { $code == $_ } 301, 302, 303, 307, 308;
    my $location=$response->header('Location');

    # A redirect without a usable Location cannot be followed.  Rather than
    # die inside the event handler, let it fall through so the crawler
    # reports it through its normal error path.
    if($is_redirect and defined $location and length $location) {

        # POE::Component::Client::HTTP doesn't follow HTTP redirections
        # so we do it "by hand"

        # Anchored, so a relative Location that merely contains "http://"
        # (in a query string, say) is still resolved against the page.
        if($location !~ m{\Ahttps?://}i) {  # partial redirect
            DEBUG and warn "************ Partial redirect : $location\n";
            my $uri=URI->new_abs($location, $page->{uri});
            $location=$uri->as_string;
        }
        DEBUG and warn "***** $page->{uri} redirected to $location\n";
        $page->{original_uri}||=$page->{uri};
        $page->{uri}=$location;

        # fetch() counted this request as in flight, and the crawler's
        # response() -- which normally releases the slot -- is not called on
        # this path.  Release it here, otherwise every redirect permanently
        # uses up one of the MAXQUEUE slots.
        my $crawler=$heap->{crawler};
        $crawler->{QUEUE}--;

        # schedule_link() only kicks the crawler when it accepts the page;
        # if it declines, still let the freed slot be used.
        $crawler->schedule_link($page) or $crawler->one_loop();
        return;
    }

    $heap->{crawler}->response($request, $response, $page);
    return;
}


1;

__END__

# Below is the stub of documentation for your module. You better edit it!

=head1 NAME

WWW::Crawler::Parallel - A web crawler that uses POE for parallel crawling

=head1 SYNOPSIS

=head1 DESCRIPTION

=head1 METHODS

=head2 error($self, $page, $response)

=head2 extract_links($self, $page)

=head2 fetch($self, $page)

=head2 new($package, $queue, @args)


=head1 AUTHOR

Philip Gwyn <perl AT pied.nu>

=head1 SEE ALSO


WWW::Crawler,
LWP::UserAgent.

=cut

$Log$
