HTML::ExtractContent&Lingua::JA::Summarize::Extract #2

記事の最初の2行を別に抜き出しておき、サマリーを生成した後、その2行とサマリーをあわせて表示する。


#!/usr/bin/perl

use strict;
use warnings;
use LWP::UserAgent;
use URI;
use HTML::ExtractContent;
use Encode;
use Lingua::JA::Summarize::Extract;
use Data::Dumper;

# Fetch the page at the URL given as the first command-line argument, extract
# its main content with HTML::ExtractContent, and print:
#   1. the first two lines of the extracted text, under "first-sentence"
#   2. a summary produced by Lingua::JA::Summarize::Extract, under "summarize"
# All text written to STDOUT is encoded as UTF-8.
#
# Dies with a usage message when no URL is given, with the HTTP status line
# when the request fails, and with a diagnostic when the body cannot be decoded.

my $url = shift;

# Fail early with a usage hint instead of issuing a request for an undefined URL.
die "usage: $0 URL\n" unless defined $url && length $url;

my $ua  = LWP::UserAgent->new;
my $res = $ua->get($url);

# Guard clause: an unsuccessful response aborts with its status line.
die $res->status_line unless $res->is_success;

# decoded_content returns undef when the charset or content-encoding of the
# response is unknown, so check before handing it to the extractor.
my $html = $res->decoded_content;
die "could not decode response body from $url\n" unless defined $html;

my $ext = HTML::ExtractContent->new;
$ext->extract($html);
my $cont = $ext->as_text;

# Take (up to) the first two lines without modifying the line list; the range
# is empty when the extracted text has no lines at all.
my @lines       = split /\n/, $cont;
my $last_index  = $#lines < 1 ? $#lines : 1;
my @first_lines = @lines[ 0 .. $last_index ];

# The summary is built from the full extracted text, including the first two lines.
my $summarize   = Lingua::JA::Summarize::Extract->new({ rate => 5, 'length' => 300 });
my $res_summari = $summarize->extract($cont)->as_string;

# split consumed the "\n" separators, so each line gets its newline back here;
# otherwise the two lines and the following label would run together.
print "first-sentence\n";
print encode('utf-8', "$_\n") for @first_lines;

print "summarize\n";
# Terminate the output with a newline unless the summary already ends in one.
$res_summari .= "\n" unless $res_summari =~ /\n\z/;
print encode('utf-8', $res_summari);