From 9728a14de3c2bbb83786ce1b7ee5e19c20ad89e1 Mon Sep 17 00:00:00 2001 From: franck cuny Date: Wed, 24 Jun 2009 17:46:58 +0200 Subject: basic useragent --- lib/MooseX/UserAgent.pm | 167 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 155 insertions(+), 12 deletions(-) diff --git a/lib/MooseX/UserAgent.pm b/lib/MooseX/UserAgent.pm index 30686cf..72c2fa1 100644 --- a/lib/MooseX/UserAgent.pm +++ b/lib/MooseX/UserAgent.pm @@ -1,32 +1,175 @@ package MooseX::UserAgent; -use Moose; -our $VERSION = '0.01'; +use Moose::Role; + +our $VERSION = '0.2.0'; + +use Encode; +use HTTP::Response; +use LWPx::ParanoidAgent; +use HTML::Encoding 'encoding_from_http_message'; +use Compress::Zlib; + +has 'agent' => ( + isa => 'Object', + is => 'rw', + lazy => 1, + default => sub { + my $self = shift; + my $ua = LWPx::ParanoidAgent->new; + + my $conf = $self->useragent_conf; + $ua->agent( $conf->{name} ) if $conf->{name}; + $ua->from( $conf->{mail} ) if $conf->{mail}; + $ua->max_size( $conf->{max_size} || 3000000 ); + $ua->timeout( $conf->{timeout} || 30 ); + $ua; + } +); + +sub fetch { + my ( $self, $url ) = @_; + + my $req = HTTP::Request->new( GET => URI->new( $url ) ); + + $req->header('Accept-Encoding', 'gzip'); + + if ( $self->context->{ useragent }->{ use_cache } ) { + my $ref = $self->cache->get( $url ); + if ( defined $ref && $ref->{ LastModified } ne '' ) { + $req->header( 'If-Modified-Since' => $ref->{ LastModified } ); + } + } + + my $res = $self->agent->request( $req ); + + if ( $self->context->{ useragent }->{ use_cache } ) { + $self->cache->set( + $url, + { ETag => $res->header( 'Etag' ) || '', + LastModified => $res->header( 'Last-Modified' ) || '' + } + ); + } + + $res; +} + +sub get_content { + my ( $self, $res ) = @_; + + my $enc = encoding_from_http_message($res); + + my $content = $res->content; + if ( $res->content_encoding && $res->content_encoding eq 'gzip' ) { + $content = Compress::Zlib::memGunzip($content); + } + + if ( $enc && $enc !~ /utf-8/i ) { + $content = $res->decoded_content( raise_error => 1 ); + if ($@) { + $content = Encode::decode( $enc, $content ); + } + } + $content; +} 1; + __END__ =head1 NAME -MooseX::UserAgent - +RTGI::Role::UserAgent - Fetch an url using LWP as the HTTP library =head1 SYNOPSIS - use MooseX::UserAgent; + package Foo; + + use Moose; + with qw/MooseX::UserAgent/; + + has useragent_conf => ( + isa => 'HashRef', + default => sub { + { name => 'myownbot', }; + } + ); + + my $res = $self->fetch($url, $cache); + ... + my $content = $self->get_content($res); + + --- yaml configuration + name: 'Mozilla/5.0 (compatible; RTGI; http://rtgi.fr/)' + mail: 'bot@rtgi.fr' + max_size: 3000000 + timeout: 30 + + --- kwalify schema + "use_cache": + name: use_cache + desc: use cache + required: true + type: int + "name": + name: name + desc: useragent string + required: true + type: str + "mail": + name: mail + desc: mail for the useragent + required: true + type: str + "timeout": + name: timeout + desc: timeout + required: true + type: int + "max_size": + name: max_size + desc: max size + required: true + type: int =head1 DESCRIPTION -MooseX::UserAgent is +=head2 METHODS -=head1 AUTHOR +=over 4 + +=item B + +The default useragent is a LWPx::ParanoidAgent object. In the +configuration, the name, mail of the useragent have to be defined. The +default size of a page manipulated can't excess 3 000 000 octets and the +timeout is set to 30 seconds. -franck cuny Efranck.cuny {at} rtgi.frE +=item B -=head1 SEE ALSO +This method will fetch a given URL. This method handle only the http +protocol. + +If there is a cache configuration, the url will be checked in the cache, +and if there is a match, the content will be returned. + +In the case of scraping search engines, a delay may be given, so we will +not hammer the server. + +=item B + +This method will return a content in utf8. + +=back + +=head1 BUGS AND LIMITATIONS + +=head1 AUTHOR -=head1 LICENSE +franck cuny C<< >> -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. +=head1 LICENCE AND COPYRIGHT -=cut +Copyright (c) 2009, RTGI +All rights reserved. -- cgit 1.4.1