1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
|
package MooseX::UserAgent;
# Moose role that provides a fetch() method for issuing HTTP GET requests.
our $VERSION = '0.2.0';
use URI;
use HTTP::Request;
use HTTP::Response;
use LWP::UserAgent;
# Moose::Role turns this package into a role and supplies the "with" keyword.
use Moose::Role;
# Compose the helper roles. fetch() calls agent(), get_ua_cache() and
# store_ua_cache(); presumably those come from these roles -- confirm
# against their sources.
with qw/
MooseX::UserAgent::Config
MooseX::UserAgent::Content
MooseX::UserAgent::Cache
/;
# fetch($url)
#
# Issue a GET request for $url through $self->agent and return the
# response object produced by the agent's request() method.
#
# The request always advertises gzip support via Accept-Encoding. When
# get_ua_cache($url) yields a true value, that value is sent as the
# If-Modified-Since header. The response is handed to store_ua_cache()
# together with the URL before being returned.
sub fetch {
    my ( $self, $url ) = @_;

    my $request = HTTP::Request->new( GET => URI->new($url) );
    $request->header( 'Accept-Encoding' => 'gzip' );

    # Only make the request conditional when the cache has a value for $url.
    if ( my $cached_stamp = $self->get_ua_cache($url) ) {
        $request->header( 'If-Modified-Since' => $cached_stamp );
    }

    my $response = $self->agent->request($request);

    # The cache layer sees every response; what it keeps is its own concern.
    $self->store_ua_cache( $url, $response );

    return $response;
}
1;
__END__
=head1 NAME
MooseX::UserAgent - Fetch a URL using LWP as the HTTP library
=head1 SYNOPSIS
package Foo;
use Moose;
with qw/MooseX::UserAgent/;
has useragent_conf => (
isa => 'HashRef',
default => sub {
{ name => 'myownbot', };
}
);
my $res = $self->fetch($url);
...
my $content = $self->get_content($res);
--- yaml configuration
name: 'Mozilla/5.0 (compatible; RTGI; http://rtgi.fr/)'
mail: 'bot@rtgi.fr'
max_size: 3000000
timeout: 30
cache:
use_cache: 1
root: /tmp
default_expires_in: 5 days
namespace: my::useragent
=head1 DESCRIPTION
This is a role which provides a user agent to a Moose class.
The role will do the caching for you if you need it, using Cache::*Cache
modules. By default it uses Cache::FileCache, but you can use any Cache
module you want:
my $cache = Cache::MemoryCache->new(
{
'namespace' => 'mymemorycacheforbot',
'default_expires_in' => 600
}
);
my $class = $MyClassUsingUA->new(
useragent_conf => {
cache => {
use_cache => 1,
namespace => 'testua',
}
},
ua_cache => $cache,
);
=head2 METHODS
=head3 useragent_conf
This is an attribute you need to add to your class. It's a HashRef that
contains all the required configuration for the useragent.
=over 4
=item B<agent>
The default useragent is an LWP::UserAgent object. In the configuration,
the name and mail of the useragent have to be defined. By default, the
size of a fetched page cannot exceed 3,000,000 bytes and the timeout is
set to 30 seconds.
=item B<fetch>
This method fetches a given URL. It handles only the HTTP protocol.
If there is a cache configuration, the URL is looked up in the cache.
When an entry is found, its value is sent in the If-Modified-Since
request header, so the server may answer with a 304 HTTP code.
Returns an HTTP::Response object.
=item B<get_content>
This method returns the content in UTF-8.
=back
=head1 BUGS AND LIMITATIONS
=head1 AUTHOR
franck cuny C<< <franck.cuny@rtgi.fr> >>
=head1 LICENCE AND COPYRIGHT
Copyright (c) 2009, RTGI
All rights reserved.
This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself. See L<perlartistic>.
|