#!/usr/bin/perl
# Find keywords for RPM packages using tf-idf
# http://en.wikipedia.org/wiki/Tf-idf
use strict;

# Directory holding the *.rpm files that make up the corpus.
my $dir = shift;
defined $dir && -d $dir or die "Usage: $0 DIR\n";

# %TF:  package file path => { word => occurrences of the word in that package }
# %IDF: word => number of packages whose text contains the word.  This is the
#       raw document frequency; the scoring loop turns it into the inverse.
my %TF;
my %IDF;

# Read the directory listing directly instead of using <$dir/*.rpm>: glob()
# splits its pattern on whitespace and interprets metacharacters, so a
# directory name containing spaces or characters like '[' would match the
# wrong files (or nothing).  Like the '*.rpm' pattern, hidden files are skipped.
opendir(my $dh, $dir) or die "$dir: $!\n";
my @names = grep { /\.rpm\z/ && !/\A\./ } readdir $dh;
closedir $dh;
my @rpms = map { $dir . '/' . $_ } sort @names;

for my $f (@rpms) {
	use RPM::Header;
	my $rpm = RPM::Header->new($f) or die $RPM::err;

	# The text of a package is its name, summary and description.
	my $text = $$rpm{NAME}."\n".$$rpm{SUMMARY}."\n".$$rpm{DESCRIPTION};

	# A word is any run of two or more word characters, compared case-insensitively.
	my @words = $text =~ /(\w{2,})/g;

	#use Text::English qw(stem);
	#@words = map { stem(lc($_)) } @words;
	@words = map lc, @words;

	# Per-package word counts.
	my %vec;
	$vec{$_}++ for @words;
	$TF{$f} = \%vec;

	# Each distinct word counts once per package towards its document frequency.
	$IDF{$_}++ for keys %vec;
}

use List::Util qw(sum);

# Rank the words of one document by tf-idf and return the keywords as
# "word=score" strings, highest score first (ties in alphabetical order).
#   $vec       - hashref: word => occurrences of the word in this document
#   $df        - hashref: word => number of documents containing the word
#   $ndocs     - total number of documents in the corpus
#   $min_score - optional cut-off; words scoring at or below it are dropped
#                (defaults to 0.04)
sub rank_keywords {
	my ($vec, $df, $ndocs, $min_score) = @_;
	$min_score = 0.04 unless defined $min_score;

	# sum() yields undef for an empty list; a document without words has no keywords.
	my $total = sum(values %$vec) or return;

	my %score;
	for my $word (keys %$vec) {
		# A word with no recorded document frequency cannot be scored.
		my $containing = $df->{$word} or next;
		# tf  = share of this document's words that are $word
		# idf = log(corpus size / number of documents containing $word)
		$score{$word} = ($vec->{$word} / $total) * log($ndocs / $containing);
	}

	return map  { sprintf('%s=%.2f', $_, $score{$_}) }
	       sort { $score{$b} <=> $score{$a} or $a cmp $b }
	       grep { $score{$_} > $min_score } keys %score;
}

# The idf numerator is the number of documents in the whole corpus, not the
# number of distinct words in the document being scored.
my $ndocs = scalar keys %TF;

for my $f (sort keys %TF) {
	my @keywords = rank_keywords($TF{$f}, \%IDF, $ndocs);
	print "$f @keywords\n";
}
