summary refs log tree commit diff
diff options
context:
space:
mode:
authorfranck cuny <franck@lumberjaph.net>2009-06-24 17:46:58 +0200
committerfranck cuny <franck@lumberjaph.net>2009-06-24 17:46:58 +0200
commit9728a14de3c2bbb83786ce1b7ee5e19c20ad89e1 (patch)
treeeb1320d7443a13f05d50874da86bc41d97fa3ecd
parentinitial commit (diff)
downloadmoosex-useragent-9728a14de3c2bbb83786ce1b7ee5e19c20ad89e1.tar.gz
basic useragent
-rw-r--r--lib/MooseX/UserAgent.pm167
1 files changed, 155 insertions, 12 deletions
diff --git a/lib/MooseX/UserAgent.pm b/lib/MooseX/UserAgent.pm
index 30686cf..72c2fa1 100644
--- a/lib/MooseX/UserAgent.pm
+++ b/lib/MooseX/UserAgent.pm
@@ -1,32 +1,175 @@
 package MooseX::UserAgent;
 
-use Moose;
-our $VERSION = '0.01';
+use Moose::Role;
+
+our $VERSION = '0.2.0';
+
+use Encode;
+use HTTP::Response;
+use LWPx::ParanoidAgent;
+use HTML::Encoding 'encoding_from_http_message';
+use Compress::Zlib;
+
+has 'agent' => (
+    isa     => 'Object',
+    is      => 'rw',
+    lazy    => 1,
+    default => sub {
+        my $self = shift;
+        my $ua   = LWPx::ParanoidAgent->new;
+
+        my $conf = $self->useragent_conf;
+        $ua->agent( $conf->{name} ) if $conf->{name};
+        $ua->from( $conf->{mail} )  if $conf->{mail};
+        $ua->max_size( $conf->{max_size} || 3000000 );
+        $ua->timeout( $conf->{timeout}   || 30 );
+        $ua;
+    }
+);
+
+sub fetch {
+    my ( $self, $url ) = @_;
+
+    my $req = HTTP::Request->new( GET => URI->new( $url ) );
+
+    $req->header('Accept-Encoding', 'gzip');
+
+    if ( $self->context->{ useragent }->{ use_cache } ) {
+        my $ref = $self->cache->get( $url );
+        if ( defined $ref && $ref->{ LastModified } ne '' ) {
+            $req->header( 'If-Modified-Since' => $ref->{ LastModified } );
+        }
+    }
+
+    my $res = $self->agent->request( $req );
+
+    if ( $self->context->{ useragent }->{ use_cache } ) {
+        $self->cache->set(
+            $url,
+            {   ETag         => $res->header( 'Etag' )          || '',
+                LastModified => $res->header( 'Last-Modified' ) || ''
+            }
+        );
+    }
+
+    $res;
+}
+
+sub get_content {
+    my ( $self, $res ) = @_;
+
+    my $enc = encoding_from_http_message($res);
+
+    my $content = $res->content;
+    if ( $res->content_encoding && $res->content_encoding eq 'gzip' ) {
+        $content = Compress::Zlib::memGunzip($content);
+    }
+
+    if ( $enc && $enc !~ /utf-8/i ) {
+        $content = $res->decoded_content( raise_error => 1 );
+        if ($@) {
+            $content = Encode::decode( $enc, $content );
+        }
+    }
+    $content;
+}
 
 1;
+
 __END__
 
 =head1 NAME
 
-MooseX::UserAgent -
+RTGI::Role::UserAgent - Fetch an url using LWP as the HTTP library
 
 =head1 SYNOPSIS
 
-  use MooseX::UserAgent;
+    package Foo;
+
+    use Moose;
+    with qw/MooseX::UserAgent/;
+
+    has useragent_conf => (
+        isa     => 'HashRef',
+        default => sub {
+            { name => 'myownbot', };
+        }
+    );
+
+    my $res = $self->fetch($url, $cache);
+    ...
+    my $content = $self->get_content($res);
+
+    --- yaml configuration
+    name: 'Mozilla/5.0 (compatible; RTGI; http://rtgi.fr/)'
+    mail: 'bot@rtgi.fr'
+    max_size: 3000000
+    timeout: 30
+
+    --- kwalify schema
+    "use_cache":
+      name: use_cache
+      desc: use cache
+      required: true
+      type: int
+    "name":
+      name: name
+      desc: useragent string
+      required: true
+      type: str
+    "mail":
+      name: mail
+      desc: mail for the useragent
+      required: true
+      type: str
+    "timeout":
+      name: timeout
+      desc: timeout
+      required: true
+      type: int
+    "max_size":
+      name: max_size
+      desc: max size
+      required: true
+      type: int
 
 =head1 DESCRIPTION
 
-MooseX::UserAgent is
+=head2 METHODS
 
-=head1 AUTHOR
+=over 4
+
+=item B<agent>
+
+The default useragent is a LWPx::ParanoidAgent object. In the
+configuration, the name, mail of the useragent have to be defined. The
+default size of a page manipulated can't excess 3 000 000 octets and the
+timeout is set to 30 seconds.
 
-franck cuny E<lt>franck.cuny {at} rtgi.frE<gt>
+=item B<fetch>
 
-=head1 SEE ALSO
+This method will fetch a given URL. This method handle only the http
+protocol.
+
+If there is a cache configuration, the url will be checked in the cache,
+and if there is a match, the content will be returned.
+
+In the case of scraping search engines, a delay may be given, so we will
+not hammer the server.
+
+=item B<get_content>
+
+This method will return a content in utf8.
+
+=back
+
+=head1 BUGS AND LIMITATIONS
+
+=head1 AUTHOR
 
-=head1 LICENSE
+franck cuny  C<< <franck.cuny@rtgi.fr> >>
 
-This library is free software; you can redistribute it and/or modify
-it under the same terms as Perl itself.
+=head1 LICENCE AND COPYRIGHT
 
-=cut
+Copyright (c) 2009, RTGI
+All rights reserved.