Most common word - PERL

Submitted by admin on Tue, 01/04/2011 - 20:24

This script determines the most common word of a fixed length ("X" characters; 5 in this version) in a file.

#!/usr/bin/perl
 
#Name: text_most_common_str_v1.pl
#
#Written by: Balazs, Lendvay (ITFanatic :) )
#
#Purpose: This script reads a file and determines the most common string which has
#the length of 5 characters. Only word characters (\w) are counted; the filename is passed as an argument.
#
#The script can be downloaded from "ITFanatic.com" and it is absolutely FREE,
#you can redistribute it and/or modify it under the same terms as Perl itself. (I think:) )
#This script comes with NO WARRANTY of any kind. (I am sure! :) )
#I cannot guarantee that this works in your environment, and I am not responsible
#for any harm it may cause on your computer. (I hope it won't cause harm at all :) )
 
use strict;
use warnings;

# Run the report only when this file is executed as a script. When it is
# loaded with require/do (for example from a test), only the subs are defined.
main(@ARGV) unless caller;

# main($filename)
#
# Entry point: reads $filename, prints line/paragraph/word/character totals
# and then the most common word of exactly $thelen word characters.
# Dies with a usage message when no filename is given, and with the
# original "Cannot open ..." message when the file cannot be opened.
sub main {
    my ($filename) = @_;

    # Length of the words being tallied. The final message always
    # interpolated $thelen, but the variable was never defined.
    my $thelen = 5;

    die "Usage: $0 <filename>\n" unless defined $filename;

    # Three-argument open with a lexical handle: the filename is never
    # interpreted as a mode or a pipe.
    open my $fh, '<', $filename or die "Cannot open $filename for read :$!";
    my $stats = _scan_text($fh, $thelen);
    close $fh;

    #display results
    print "Lines     : \"$stats->{lines}\" \n";
    print "Paragraphs: \"$stats->{paragraphs}\" \n";
    print "Words     : \"$stats->{words}\" \n";
    print "Chars     : \"$stats->{chars}\" \n";

    my ($max_s, $max_n) = _most_common($stats->{count_of}, $stats->{first_seen});

    #print results
    print "\nThe most common \"$thelen\" long word is \"$max_s\" and the number of occurance is \"$max_n\".\n";

    return;
}

# _scan_text($fh, $length)
#
# Reads $fh line by line and returns a hash reference with:
#   lines      - number of lines read
#   paragraphs - 1 plus the number of empty lines
#   words      - number of whitespace-separated tokens
#   chars      - number of characters that are not a literal space
#   count_of   - hash ref: word => occurrences, for words that are exactly
#                $length characters long after non-\w characters are removed
#   first_seen - array ref of those words in order of first appearance
sub _scan_text {
    my ($fh, $length) = @_;

    my %stats = (
        lines      => 0,
        paragraphs => 1,    # the count starts at one, as in the original script
        words      => 0,
        chars      => 0,
    );
    my %count_of;
    my @first_seen;

    while (my $line = <$fh>) {
        chomp $line;
        $stats{lines}++;
        $stats{paragraphs}++ if $line eq '';

        # split ' ' skips leading whitespace, so an indented line does not
        # contribute an empty first "word" the way split(/\s+/) does.
        my @tokens = split ' ', $line;
        $stats{words} += @tokens;

        # tr/ //c counts every character except the space itself
        # (tabs and punctuation are included) without modifying $line.
        $stats{chars} += ($line =~ tr/ //c);

        for my $token (@tokens) {
            # Strip everything that is not \w before measuring the length.
            (my $word = $token) =~ s/\W//g;
            next if length($word) != $length;

            # Remember the order of first appearance so that ties are
            # resolved deterministically (hash order is not stable).
            push @first_seen, $word unless exists $count_of{$word};
            $count_of{$word}++;
        }
    }

    return {
        %stats,
        count_of   => \%count_of,
        first_seen => \@first_seen,
    };
}

# _most_common(\%count_of, \@first_seen)
#
# Returns (word, count) for the highest count in %count_of. Words are
# examined in @first_seen order with a strict ">" comparison, so on a tie
# the word that appeared first wins. Returns ('', 0) when there are no words.
sub _most_common {
    my ($count_of, $first_seen) = @_;

    my ($best_word, $best_count) = ('', 0);
    for my $word (@{$first_seen}) {
        if ($count_of->{$word} > $best_count) {
            ($best_word, $best_count) = ($word, $count_of->{$word});
        }
    }

    return ($best_word, $best_count);
}