2010-11-01

LWP::UserAgentでローカルファイル保存

perl

LWP::UserAgent - search.cpan.orgでローカルにファイル保存するためのコード。
更新された場合に限って保存するには$ua->mirror($url, $filename);使う

#!/usr/bin/perl

use strict;
use warnings;
use Encode;
use LWP::UserAgent;
use File::Spec;
our $CachDir = '/path/to/dir';

if (! -e $CachDir ) {
	mkdir $CachDir or die "cannot creat $CachDir : $!";
}

my $url = $ARGV[0];
my $file = $ARGV[1];
my $filename = File::Spec->catfile($CachDir, $file);

my $http_proxy = "http://proxy:port";
my $ua = LWP::UserAgent->new;
$ua->agent('agent');
$ua->proxy('http', $http_proxy);

my $res = $ua->get($url, ':content_file' => $filename);
print $res->status_line;

2010-11-01

Web::Scraper & Spreadsheet::WriteExcelでリスト生成

perl

Spreadsheet::WriteExcel - search.cpan.orgを使ってエクセルファイル生成

#!/usr/bin/perl
use strict;
use warnings;
use Web::Scraper;
use URI;
use Spreadsheet::WriteExcel;

my $url = $ARGV[0];
my $filename = $ARGV[1];
my $scraper = scraper {
		process '', 'list[]' => { 'item1' => 'TEXT',  'item2' => 'TEXT', 'item3' => '@href' };
};
my $workbook = Spreadsheet::WriteExcel->new($filename);
   $workbook->add_format(text_wrap => 1);
my $worksheet = $workbook->add_worksheet('list');

my $uri = URI->new($URL);
my $result = $scraper->scrape($uri);
my $row = 0;
foreach my $data ( @{ $result->{list} } ) {
	my $col = 0;
	$worksheet->write($row, $col++, $data->{item1});
	$worksheet->write($row, $col++, $data->{item2});	
	$worksheet->write($row, $col++, $data->{item3});
	$row++;
}

2010-11-01

Web::Scraper cookie設定

perl

Web::Scraper - search.cpan.orgでcookie設定

#!/usr/bin/perl

use strict;
use warnings;
use Web::Scraper;
use URI;
use HTTP::Cookies;
use Data::Dumper;

my $cookie_file = '/path/to/cookies.txt';
my $cookie_jar = HTTP::Cookies->new(file => $cookie_file, autosave =>1);

my $url = shift;

my $scraper = scraper{
		process '', '' => '';
};
   $scraper->user_agent->cookie_jar($cookie_jar);

my $uri = URI->new($url);
my $result = $scraper->scrape($uri);
print Damper($result);

2010-11-01

Web::Scraper proxy設定

perl

Web::Scraper - search.cpan.orgでproxy設定

#!/usr/bin/perl
use strict;
use warnings;
use Web::Scraper;
use URI;
use Data::Dumper;

my $url = shift;
my $http_proxy = 'http:// proxy : port';

my $scraper = scraper {
		process '', '' => '';
};

   $scraper->user_agent->agent('Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.3) Gecko/20100407 Ubuntu/9.10 (karmic) Firefox/3.6.3');
   
   $scraper->user_agent->proxy('http', $http_proxy);

my $uri = URI->new($URL);
my $result = $scraper->scrape($uri);
print Dumper($result);

2010-10-26

Config::Pitについての覚書

perl

Config::Pit - search.cpan.orgの使い方
ワンライナーで登録するには以下のとおりで。

perl -MConfig::Pit -e'Config::Pit::set("d.hatena.jp", data=>{ username=>"login_account", password=>"login_pass" })'

以上のデータセットで$HOME/.pitにセットされる。
URLで個々に設定できるので便利。URLでなくても設定できる。
取り出しは、以下のように簡単に設定できる。

my $config = pit_get("d.hatena.ne.jp");

あとはusername,passwordは以下のように取り出すことができる。

my $username = $config->{username};
my $password = $config->{password};

mailアカウントなどは以下のようにして登録しておくと便利

perl -MConfig::Pit -e'Config::Pit::set("mail.com", data=>{ gmail_1_user =>"gmail_1_login", gmail_1_pass =>"gmail_1_pass", gmail_2_user => "gmail_2_login", gmail_2_pass => "gmail_2_pass", mobile_address => "mobile_address" })'

上記の場合、以下のように取り出す。

my $config = pit_get("mail.com");

my $gmail_user = $config->{gmail_1_login};
my $gmail_pass = $config->{gmail_2_pass};
my $mobile_add = $config->{mobile_address};

参照先

パスワード設定をコードに書かない(Config::Pit) - モダンなPerl入門 - モダンなPerl入門

2010-10-25

マルコフ連鎖

perl

Yahoo！のテキスト解析Web APIで日本語生成（マルコフ連鎖） - ishiducaの日記 - Hatena::Group::Perlを参考にしました。

#!/usr/local/bin/perl

use strict;
use warnings;
use MeCab;
use Data::Dumper;

my $text = shift || die "You should set argument $!";
my $eos = "EOS\n";
my %markov;
my @data;
my $sentence;

my $mecab = new MeCab::Tagger("-Owakati");
for (my $n = $mecab->parseToNode($text); $n; $n = $n->{next}) {
	push @data, $n->{surface};
}
push @data, $eos;

for (my $i = 0; $i < $#data; $i++) {
	push @{ $markov{$data[$i+1]}{$data[$i+2]} }, $data[$i+3];
}
my @keys = keys %markov;
my $phr1 = $data[1];
my @key = keys %{$markov{$phr1}};
my $phr2 = $key[int rand(@key)];
my $rand = int rand(@{$markov{$phr1}{$phr2}});
my $phr3 = $markov{$phr1}{$phr2}[$rand];
$sentence = "$phr1$phr2$phr3";

for(my $d = 0; $d < 1000; $d++) {	
	$phr1 = $phr2;
	$phr2 = $phr3;
	$rand = int rand(@{$markov{$phr1}{$phr2}});
	$phr3 =  $markov{$phr1}{$phr2}[$rand];
	last if $phr3 eq $eos;
	$sentence .= "$phr3";	
}
print $sentence;

2010-10-25

はてなブックマークタグ集計ランキング

perl

はてなブックマークのタグを集計してランキング表示する。
404 Blog Not Found:最近のはてブで流行している７つのタグをそのまま真似て書いてみた。

#!/usr/bin/perl 
use strict;
use warnings;
use HTML::LinkExtor;
use XML::LibXML;
use Encode;

my $url = shift;

my %entry;
sub get_verbosely{
    use LWP::Simple;
    my $uri = shift;
    my $nonl = shift;
    return get($uri);
}

my $content = get_verbosely($url);
HTML::LinkExtor
    ->new(sub{
              my($tag, %link_of) = @_;
              return if $tag ne 'a';
              for my $attr (keys %link_of){
                  next if $attr ne 'href';
                  next if $link_of{$attr} !~ m{^/entry/};
                  my $uri = $link_of{$attr};
                  $uri =~ s,^/entry/,http://,o;
                  $entry{ $uri }++
              }
          })->parse($content);
my %keywords;
for my $uri (keys %entry){
    my $rss_uri = "http://b.hatena.ne.jp/entry/rss/" . $uri;
	my $con = get_verbosely($rss_uri, 1);
	my $tag = ($con =~ s{ <dc:subject>(.*?)</dc:subject> }
             { $keywords{ encode('utf-8', lc($1)) }++ }egx );
    	print STDERR "$tag keywords found\n";
}
my $order = 0;
for my $k (sort {$keywords{$b} <=> $keywords{$a}} keys %keywords){
    printf "%2d\t%4d\t%s\n", ++$order, $keywords{$k}, $k;
}