Tuesday, 21 September 2010

WordFrequency in java collections

This program reads files of words and lists their frequencies. A file of words to ignore can also be supplied. This example illustrates the use of generic data structures (Sets, Maps, and ArrayList), regular expressions (as used by Scanner), and Comparators.

class WordFrequencyCmd {
//===================================================================== main
public static void main(String[] unused) {
Scanner in = new Scanner(System.in);

try {
//... Read two file names from the input.
System.out.println("Name of file containing text to analyze:");
File inputFile = new File(in.nextLine());

System.out.println("Name of file containing words to ignore:");
File ignoreFile = new File(in.nextLine());

//... Supply two files to WordCounter.
WordCounter counter = new WordCounter();
counter.ignore(ignoreFile);
counter.countWords(inputFile);

//... Get the results.
String[] wrds = counter.getWords(WordCounter.SortOrder.BY_FREQUENCY);
int[] frequency = counter.getFrequencies(WordCounter.SortOrder.BY_FREQUENCY);

//... Display the results.
int n = counter.getEntryCount();
for (int i=0; i
 
The model (the logic of the program, without any user interface) is
implemented primarily in the WordCounter class. It relies on three
utility classes: ComparatorFrequency and ComparatorAlphabetic to sort the
results, and Int to hold the mutable frequency counts.
import java.io.*;
import java.util.*;
import java.util.regex.*;

/** Computes word frequency in source file; ignores words in ignore file.
* Uses generic Sets, Maps, ArrayLists, regular expressions, Scanner.
* @author Fred Swartz
* @version 2007-05-06
*/
public class WordCounter {
//================================================================ constants
private static final Comparator> SORT_BY_FREQUENCY =
new ComparatorFrequency();
private static final Comparator> SORT_ALPHABETICALLY =
new ComparatorAlphabetic();
public enum SortOrder {ALPHABETICALLY, BY_FREQUENCY}

//=================================================================== fields
Set _ignoreWords; // Words to ignore.
Map _wordFrequency; // Words -> frequency
int _totalWords; // Total source words.

//============================================================== constructor
/** Constructor */
public WordCounter() {
_ignoreWords = new HashSet();
_wordFrequency = new HashMap();
_totalWords = 0;
}

//=================================================================== ignore
/**
* Reads file of words to ignore. Ignore words are added to a Set.
* The IOException is passed to caller because we certinaly don't
* know what the user interface issue is.
*
* @param ignoreFile File of words to ignore.
*/
public void ignore(File ignoreFile) throws IOException {
Scanner ignoreScanner = new Scanner(ignoreFile);
ignoreScanner.useDelimiter("[^A-Za-z]+");

while (ignoreScanner.hasNext()) {
_ignoreWords.add(ignoreScanner.next());
}
ignoreScanner.close(); // Close underlying file.
}

//=================================================================== ignore
/**
* Takes String of words to ignore. Ignore words are added to a Set.
*
* @param ignore String of words to ignore.
*/
public void ignore(String ignoreStr) {
Scanner ignoreScanner = new Scanner(ignoreStr);
ignoreScanner.useDelimiter("[^A-Za-z]+");

while (ignoreScanner.hasNext()) {
_ignoreWords.add(ignoreScanner.next());
}
}


//=============================================================== countWords
/** Record the frequency of words in the source file.
* May be called more than once.
* IOException is passed to caller, who might know what to do with it.
*@param File of words to process.
*/
public void countWords(File sourceFile) throws IOException {
Scanner wordScanner = new Scanner(sourceFile);
wordScanner.useDelimiter("[^A-Za-z]+");

while (wordScanner.hasNext()) {
String word = wordScanner.next();
_totalWords++;

//... Add word if not in map, otherewise increment count.
if (!_ignoreWords.contains(word)) {
Int count = _wordFrequency.get(word);
if (count == null) { // Create new entry with count of 1.
_wordFrequency.put(word, new Int(1));
} else { // Increment existing count by 1.
count.value++;
}
}
}
wordScanner.close(); // Close underlying file.
}


//=============================================================== countWords
/** Record the frequency of words in a String.
* May be called more than once.
*@param String of words to process.
*/
public void countWords(String source) {
Scanner wordScanner = new Scanner(source);
wordScanner.useDelimiter("[^A-Za-z]+");

while (wordScanner.hasNext()) {
String word = wordScanner.next();
_totalWords++;

//... Add word if not in map, otherewise increment count.
if (!_ignoreWords.contains(word)) {
Int count = _wordFrequency.get(word);
if (count == null) { // Create new entry with count of 1.
_wordFrequency.put(word, new Int(1));
} else { // Increment existing count by 1.
count.value++;
}
}
}
}

//============================================================= getWordCount
/** Returns number of words in all source file(s).
*@return Total number of words proccessed in all source files.
*/
public int getWordCount() {
return _totalWords;
}

//============================================================ getEntryCount
/** Returns the number of unique, non-ignored words, in the source file(s).
* This number should be used to for the size of the arrays that are
* passed to getWordFrequency.
*@return Number of unique non-ignored source words.
*/
public int getEntryCount() {
return _wordFrequency.size();
}

//========================================================= getWordFrequency
/** Stores words and their corresponding frequencies in parallel array lists
* parameters. The frequencies are sorted from low to high.
* @param words Unique words that were found in the source file(s).
* @param counts Frequency of words at corresponding index in words array.
*/
public void getWordFrequency(ArrayList out_words,
ArrayList out_counts) {
//... Put in ArrayList so sort entries by frequency
ArrayList> entries =
new ArrayList>(_wordFrequency.entrySet());
Collections.sort(entries, new ComparatorFrequency());

//... Add word and frequency to parallel output ArrayLists.
for (Map.Entry ent : entries) {
out_words.add(ent.getKey());
out_counts.add(ent.getValue().value);
}
}

//================================================================= getWords
/** Return array of unique words, in the order specified.
* @return An array of the words in the currently selected order.
*/
public String[] getWords(SortOrder sortBy) {
String[] result = new String[_wordFrequency.size()];
ArrayList> entries =
new ArrayList>(_wordFrequency.entrySet());
if (sortBy == SortOrder.ALPHABETICALLY) {
Collections.sort(entries, SORT_ALPHABETICALLY);
} else {
Collections.sort(entries, SORT_BY_FREQUENCY);
}

//... Add words to the String array.
int i = 0;
for (Map.Entry ent : entries) {
result[i++] = ent.getKey();
}
return result;
}

//=========================================================== getFrequencies
/** Return array of frequencies, in the order specified.
* @return An array of the frequencies in the specified order.
*/
public int[] getFrequencies(SortOrder sortBy) {
int[] result = new int[_wordFrequency.size()];
ArrayList> entries =
new ArrayList>(_wordFrequency.entrySet());
if (sortBy == SortOrder.ALPHABETICALLY) {
Collections.sort(entries, SORT_ALPHABETICALLY);
} else {
Collections.sort(entries, SORT_BY_FREQUENCY);
}

//... Add words to the String array.
int i = 0;
for (Map.Entry ent : entries) {
result[i++] = ent.getValue().value;
}
return result;
}
}

No comments:

Post a Comment