Learn Web Crawling with Perl
来源:程序员人生 发布时间:2015-03-11 08:03:31 阅读次数:2236次
#####
#Overview of web-crawling related modules.
#Note: the code below is for overview purposes only and is not meant to be executed as-is.
#####
#!/usr/bin/perl
#####
#HTTP::Thin
#####
use 5.12.1;
use HTTP::Request::Common;
use HTTP::Thin;

# Build a GET request object, send it through a fresh HTTP::Thin client,
# and print the whole response as a single string.
my $request = GET 'http://example.com';
my $reply   = HTTP::Thin->new()->request($request);
say $reply->as_string;
#####
#HTTP::Tiny
#####
use HTTP::Tiny;

# Fetch a page with a default-configured client. The result is a hash
# reference; this snippet reads its success, status, reason, headers and
# content entries.
my $response = HTTP::Tiny->new->get('http://example.com/');
die "Failed!\n" unless $response->{success};

# Status line, e.g. "200 OK".
print "$response->{status} $response->{reason}\n";

# A header value may be a plain scalar or an array reference (one entry per
# occurrence of the header), so normalise to a list before printing.
while ( my ( $k, $v ) = each %{ $response->{headers} } ) {
    for ( ref $v eq 'ARRAY' ? @$v : $v ) {
        print "$k: $_\n";
    }
}

# Only print the body when there is one.
print $response->{content} if length $response->{content};
#new
# Constructor: attributes are passed as a key/value list in parentheses.
# %attributes is a placeholder for any of the attribute names listed below.
my $http = HTTP::Tiny->new( %attributes );
#valid attributes include:
#-agent
#-cookie_jar
#-default_headers
#-local_address
#-keep_alive
#-max_redirect
#-max_size
#-https_proxy
#-proxy
#-no_proxy
#-timeout
#-verify_SSL
#-SSL_options
#get | head | put | post | delete -- shorthand methods named after the HTTP verb;
#each takes a URL and an optional hash reference of options.
$response = $http->get($url);
$response = $http->get($url, \%options);
$response = $http->head($url);
#post_form -- send $form_data as a form submission
$response = $http->post_form($url, $form_data);
$response = $http->post_form($url, $form_data, \%options);
#request -- generic form; the HTTP method is given as a string
$response = $http->request($method, $url);
$response = $http->request($method, $url, \%options);
#credentials may be embedded in the URL as user:password@host
$http->request('GET', 'http://user:pwd@example.com/');
#or, when the user name or password contains a reserved character such as '@',
#percent-escape it (here '@' becomes %40)
$http->request('GET', 'http://mars%40example.com:pwd@example.com/');
#www_form_urlencode -- turn $data into a query string that can be appended to a URL
$params = $http->www_form_urlencode( $data );
$response = $http->get("http://example.com/query?$params");
#SSL support
# SSL_options is a constructor attribute, so it must be passed to
# HTTP::Tiny->new rather than written as a free-standing pair.
# $file_path is a placeholder for the path to a CA certificate file.
my $https_client = HTTP::Tiny->new(
    SSL_options => {
        SSL_ca_file => $file_path,
    },
);
#proxy support
#####
#WWW::Mechanize
#
#Stateful programmatic web browsing, used for automating interaction with websites.
#####
use WWW::Mechanize;

# Create a browser-like agent and load the starting page.
# $url is a placeholder for the page to start from.
my $mech = WWW::Mechanize->new();
$mech->get( $url );

# Three ways to pick the link to follow: by position on the page,
# by a regex on the link text, or by its exact URL.
$mech->follow_link( n => 3 );
$mech->follow_link( text_regex => qr/download this/i );
$mech->follow_link( url => 'http://host.com/index.html' );

# Fill in and submit a form selected by its position on the page.
$mech->submit_form(
    form_number => 3,
    fields      => {
        username => 'banana',
        password => 'lost-and-alone',
    }
);

# Fill in and submit a form selected by name, pressing a specific button.
$mech->submit_form(
    form_name => 'search',
    fields    => { query => 'pot of gold', },
    button    => 'search now'
);
#testing web applications: assert on the content of the current page
use Test::More;
my $page_content = $mech->content();
my $expected_re  = qr/$expected/;
like( $page_content, $expected_re, "Got expected content" );

#page traversal: go back one page
$mech->back();

#finer control over the page: locate a link, select a form,
#set field values, then click a button
$mech->find_link( n => $number );
$mech->form_number( $number );
$mech->form_name( $name );
$mech->field( $name, $value );
$mech->set_fields( $field_values );
$mech->set_visible( @criteria );
$mech->click( $button );

#WWW::Mechanize is a subclass of LWP::UserAgent; e.g. add a request header:
$mech->add_header( $name => $value );
#page-fetching methods
#status methods
#content-handling methods
#link methods
#image methods
#form methods
#field methods
#miscellaneous methods
#overridden LWP::UserAgent methods
#inherited unchanged LWP::UserAgent methods
#With these modules, it is now easy to implement a spider project for future integration.
Mars
生活不易,码农辛苦
如果您觉得本网站对您的学习有所帮助,可以手机扫描二维码进行捐赠