HTML::ExtractContent & Lingua::JA::Summarize::Extract #2
記事の最初の2行を別途抜き出し、サマリー生成後に追加で表示する
#!/usr/bin/perl
# Fetch a web page, extract its main article text, and print the first
# two non-blank lines of the article followed by a generated summary.
#
# Usage: pass the page URL as the first command-line argument.
#
# Output (UTF-8 encoded bytes on STDOUT):
#   first-sentence
#   <lead line 1>
#   <lead line 2>
#   summarize
#   <summary text>
#
# Dies with the HTTP status line when the request is not successful.
use strict;
use warnings;
use LWP::UserAgent;
use URI;
use HTML::ExtractContent;
use Encode;
use Lingua::JA::Summarize::Extract;
use Data::Dumper;

# Number of leading article lines printed ahead of the summary.
my $LEAD_LINE_COUNT = 2;

my $url = shift;
die "Usage: $0 URL\n" unless defined $url && length $url;

my $ua  = LWP::UserAgent->new;
my $res = $ua->get($url);
die $res->status_line unless $res->is_success;

# Guard against an undecodable body so the extractor is never handed undef.
# NOTE(review): relies on decoded_content returning undef on decode failure
# -- confirm against the HTTP::Message documentation.
my $html = $res->decoded_content;
die "Could not decode the response body of $url\n" unless defined $html;

my $ext = HTML::ExtractContent->new;
$ext->extract($html);
my $cont = $ext->as_text;

# Take the lead lines from a separate list so $cont stays intact for the
# summarizer. Whitespace-only lines are skipped so that they do not use up
# the lead-line slots.
my @text     = grep { /\S/ } split /\n/, $cont;
my @sentence = splice(@text, 0, $LEAD_LINE_COUNT);

# The summary is generated from the full extracted text, lead lines included.
my $summarize = Lingua::JA::Summarize::Extract->new({ rate => 5, 'length' => 300 });
my $res_summari = $summarize->extract($cont)->as_string;

print "first-sentence\n";
# split() removed the line terminators; put one back after each lead line so
# the lines do not run into each other or into the "summarize" label.
print map { encode('utf-8', $_) . "\n" } @sentence;
print "summarize\n";
print encode('utf-8', $res_summari);