1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
|
# Role that adds an HTTP fetch() method to the class consuming it.
package MooseX::UserAgent;
our $VERSION = '0.2.0';
# Moose::Role enables strict and warnings for this package, so they are not
# requested separately.
use Moose::Role;
# Compose the companion roles. fetch() calls agent, get_ua_cache and
# store_ua_cache on $self; presumably these roles provide them -- confirm
# against MooseX::UserAgent::Config and MooseX::UserAgent::Cache.
with qw/MooseX::UserAgent::Config MooseX::UserAgent::Content
MooseX::UserAgent::Cache/;
use URI;
use HTTP::Request;
use HTTP::Response;
use LWP::UserAgent;
# Issue an HTTP GET for $url and return what the agent hands back.
#
# Arguments:
#   $url - address to request; it is passed to URI->new unchanged.
#
# Returns the value of $self->agent->request() (presumably an
# HTTP::Response -- confirm against the agent in use).
#
# The request always carries "Accept-Encoding: gzip". When get_ua_cache()
# yields a true value for the URL, that value is sent as If-Modified-Since,
# which makes the request conditional. Every response is passed to
# store_ua_cache(); this sub does not look at the response status.
sub fetch {
    my $self = shift;
    my $url  = shift;

    my $target  = URI->new($url);
    my $request = HTTP::Request->new( GET => $target );
    $request->header( 'Accept-Encoding' => 'gzip' );

    # Only add the conditional header when the cache returned something true.
    my $cached_stamp = $self->get_ua_cache($url);
    if ($cached_stamp) {
        $request->header( 'If-Modified-Since' => $cached_stamp );
    }

    my $response = $self->agent->request($request);

    # The cache layer is given the response before it goes back to the caller.
    $self->store_ua_cache( $url, $response );

    return $response;
}
1;
__END__
=head1 NAME
MooseX::UserAgent - Fetch a URL using LWP as the HTTP library
=head1 SYNOPSIS
package Foo;
use Moose;
with qw/MooseX::UserAgent/;
has useragent_conf => (
isa => 'HashRef',
default => sub {
{ name => 'myownbot', };
}
);
my $res = $self->fetch($url);
...
my $content = $self->get_content($res);
--- yaml configuration
name: 'Mozilla/5.0 (compatible; RTGI; http://rtgi.fr/)'
mail: 'bot@rtgi.fr'
max_size: 3000000
timeout: 30
cache:
use_cache: 1
root: /tmp
default_expires_in: 5 days
namespace: my::useragent
=head1 DESCRIPTION
This is a role which provides a useragent to a Moose Class.
The role will do the caching for you if you need it, using Cache::*Cache
modules. By default it uses Cache::FileCache, but you can use any Cache
module you want.
=head2 METHODS
=over 4
=item B<agent>
The default useragent is an LWPx::ParanoidAgent object. In the
configuration, the name and mail of the useragent have to be defined. By
default, the size of a fetched page can't exceed 3 000 000 octets and the
timeout is set to 30 seconds.
=item B<fetch>
This method will fetch a given URL. This method handles only the HTTP
protocol.
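If there is a cache configuration, the URL will be looked up in the cache,
and if there is a match, the cached value is sent as an If-Modified-Since
header with the request. The response is then stored in the cache and
returned.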
In the case of scraping search engines, a delay may be given, so we will
not hammer the server.
=item B<get_content>
This method will return the content of a response in UTF-8.
=back
=head1 BUGS AND LIMITATIONS
=head1 AUTHOR
franck cuny C<< <franck@lumberjaph.net> >>
=head1 LICENCE AND COPYRIGHT
Copyright (c) 2009, RTGI
All rights reserved.
|