#$Id$
package WWW::Crawler::LWP;

use strict;

use vars qw($VERSION @ISA);

use WWW::Crawler;
use LWP::UserAgent;
use URI;
use HTML::LinkExtor;
use HTTP::Request;

$VERSION="0.1";
@ISA=qw(WWW::Crawler);

sub DEBUG () {0}

###############################################################
sub new
{
    my $package=shift @_;
    my $self=$package->SUPER::new(@_);
    $self->{UA}=LWP::UserAgent->new();
    return $self;
}

###############################################################
sub cannonical
{
    my($self, $page)=@_;
    my $uri=$page->{uri};
    $uri=URI->new($uri) unless ref $uri;

    $uri->scheme('http') unless $uri->scheme;
    $uri->fragment(undef());
    $page->{uri}=$uri->as_string;
    return $page;
}

###############################################################
sub fetch
{
    my($self, $page)=@_;
    DEBUG and warn "Fetching $page->{uri}\n";

    $page->{request}=HTTP::Request->new('GET', $page->{uri});
    my $res=$self->{UA}->request($page->{request});

    if($res->is_success) {
        $page->{header}=$res->header;
        $page->{document}=$res->content;
        $self->fetched($page);
    } else {
        $self->error($page, $res);
    }
}
    
###############################################################
sub extract_links
{
    my($self, $page)=@_;
    my @links;
    my $extor=HTML::LinkExtor->new(sub {
            my($tag, %attr)=@_;
            return unless $tag eq 'a' and $attr{href};
            push @links, {uri=>$attr{href}};
        }, $page->{uri});
    $extor->parse($page->{document});
    DEBUG and warn "Links in $page->{uri}\n    ".join("\n    ", 
                                            map {$_->{uri}} @links), "\n";
    return @links;
}

###############################################################
sub error
{
    my($self, $page, $response)=@_;
    DEBUG and warn "$page->{uri} wasn't fetched: ".$response->code."\n";
}


1;
__END__
# Below is the stub of documentation for your module. You better edit it!

=head1 NAME

WWW::Crawler::LWP - A web crawler that uses LWP and HTML::LinkExtor

=head1 SYNOPSIS

    package My::Crawler;
    
    use WWW::Crawler::LWP;
    my @ISA=qw(WWW::Crawler::LWP);

    sub new
    {
        my $package=shift @_;
        my $self=$package->SUPER::new(@_);
        $self->{UA}->agent("My::Crawler 0.1 (Mozila;1;Linux)");
        return $self;
    }

    sub parse 
    {
        my($self, $page)=@_;
        my $data=$self->SUPER::parse($page);
        $data->{title}=$1 if $page->{document} =~ m(<title>(.+?)</title>)i;
        return $data;
    }

    sub error
    {
        my($self, $page, $response)=@_;
        print "$page->{uri} wasn't fetched: ".$response->code."\n";
    }

    sub process 
    {
        my($self, $page)=@_;
        print "Doing something to $page->{parsed}{title}\n";
    }

    sub include 
    {
        my($self, $page)=@_;
        return unless $::LINK eq lc substr($page->{uri}, 0, length($::LINK));
        return $self->SUPER::include($page);
    }

    package main;

    use vars qw($LINK);
    $LINK="http://www.yahoo.com/";

    my $crawler=My::Crawler->new();
    $crawler->schedule_link({uri=>$LINK});
    $crawler->run();

=head1 DESCRIPTION

WWW::Crawler::LWP is a barebones sub class of WWW::Crawler.  It should be
subclassed so for each application.

NOTE : it does not respect robots.txt, nor does it restrict it's activity to
one server, nor does act kindly on a given server.

=head1 METHODS

=head2 cannonical($self, $page)

Turns $page->{uri} an absolute HTTP URI, without a fragement 
(the bit after the #).

=head2 error($self, $page, $response)

Called when an error occurs while fetching an URI.  $response is the
HTTP::Response object.  Default is do nothing.  You should overload this if
you want to report errors somewhere.

=head2 extract_links($self, $page)

Uses HTML::LinkExtor to extract all the links from a page.

=head2 fetch($self, $page)

Uses LWP::UserAgent to fetch a page.  Deals with error conditions and calls
fetched() if there wasn't an error.

=head2 new($package)

Constructor requires no parameters.

Creates the following members:

=over 4

=item UA 
    
A LWP::UserAgent.  You can call methods on the member to get/set any
parameters you need, like from(), cookie_jar() and credentials().

=back


=head1 AUTHOR

Philip Gwyn <perl@pied.nu>

=head1 SEE ALSO


WWW::Crawler,
LWP::UserAgent.

=cut

$Log$
