This script determines the most common string of length "X" in a file.
#!/usr/bin/perl
# Name: text_most_common_str_v1.pl
#
# Written by: Balazs, Lendvay (ITFanatic :) )
#
# Purpose: Reads a text file, prints its line, paragraph, word and character
# counts, and reports the most common word of a given length. Before a word's
# length is measured, every character that is not a word character (\w) is
# stripped from it.
#
# Usage: text_most_common_str_v1.pl FILE [LENGTH]
#   FILE    file to analyse
#   LENGTH  word length to look for (optional, defaults to 5)
#
# The script can be downloaded from "ITFanatic.com" and it is absolutely FREE,
# you can redistribute it and/or modify it under the same terms as Perl itself. (I think:) )
# This script comes with NO WARRANTY of any kind. (I am sure! :) )
# I cannot guarantee that this works in your environment, and I am not responsible
# for any harm it may cause on your computer. (I hope it won't cause harm at all :) )

use strict;
use warnings;

# Word length used when no LENGTH argument is given on the command line.
my $DEFAULT_LENGTH = 5;

# analyse_text($in, $length)
#
# Reads every line from the open filehandle $in and gathers statistics.
#
# Parameters:
#   $in     - filehandle opened for reading
#   $length - length (in \w characters) of the words to tally
#
# Returns a hash reference with the keys:
#   lines      - number of lines read
#   paragraphs - 1 plus the number of empty lines
#   words      - number of whitespace-separated tokens
#   chars      - number of characters that are not a space (newline excluded)
#   best_word  - most frequent word of the requested length ('' if none)
#   best_count - how many times best_word occurred (0 if none)
sub analyse_text {
    my ($in, $length) = @_;

    my %stats = (
        lines      => 0,
        paragraphs => 1,
        words      => 0,
        chars      => 0,
        best_word  => '',
        best_count => 0,
    );

    my %count_of;      # word => number of occurrences
    my @first_seen;    # distinct words in the order they first appeared

    while (my $line = <$in>) {
        chomp $line;

        $stats{lines}++;
        $stats{paragraphs}++ if $line eq '';

        # split ' ' skips leading whitespace, so an indented line does not
        # contribute an empty "word" to the count (split /\s+/ would).
        my @tokens = split ' ', $line;
        $stats{words} += @tokens;

        # tr with the /c flag and no replacement only counts: here, every
        # character that is not a space.
        $stats{chars} += ($line =~ tr/ //c);

        for my $token (@tokens) {
            (my $word = $token) =~ s/\W//g;
            next unless length($word) == $length;

            push @first_seen, $word unless exists $count_of{$word};
            $count_of{$word}++;
        }
    }

    # Scan in first-seen order with a strict '>' so that, among words with
    # the same count, the one that appeared first in the file wins.
    for my $word (@first_seen) {
        if ($count_of{$word} > $stats{best_count}) {
            $stats{best_word}  = $word;
            $stats{best_count} = $count_of{$word};
        }
    }

    return \%stats;
}

# main($filename, $thelen)
#
# Command-line entry point: validates the arguments, analyses the file and
# prints the report to STDOUT. Dies with a message if the arguments are
# invalid or the file cannot be opened. Returns 0 on success.
sub main {
    my ($filename, $thelen) = @_;

    die "Usage: $0 FILE [LENGTH]\n" unless defined $filename;

    $thelen = $DEFAULT_LENGTH unless defined $thelen;
    die "LENGTH must be a positive integer, got \"$thelen\"\n"
        unless $thelen =~ /\A[1-9][0-9]*\z/;

    # Three-argument open: the filename can never be interpreted as a mode
    # or a pipe, whatever characters it contains.
    open my $in, '<', $filename
        or die "Cannot open $filename for read :$!";
    my $stats = analyse_text($in, $thelen);
    close $in;

    print "Lines : \"$stats->{lines}\" \n";
    print "Paragraphs: \"$stats->{paragraphs}\" \n";
    print "Words : \"$stats->{words}\" \n";
    print "Chars : \"$stats->{chars}\" \n";

    my $max_s = $stats->{best_word};
    my $max_n = $stats->{best_count};
    print "\nThe most common \"$thelen\" long word is \"$max_s\" and the number of occurance is \"$max_n\".\n";

    return 0;
}

# Run only when executed as a script, so the subs above can be loaded
# (e.g. with require) without side effects.
exit( main(@ARGV) ) unless caller;

1;