#!/usr/bin/python
# Tim Finin, finin@umbc.edu
"""Print the frequency of words found in standard input.

Words are reported most frequent first, after stripping surrounding
punctuation, lower-casing, and removing stop words (read from
stop_words.txt in the current working directory).

Command line arguments:
  -n, --number N      number of words to report (default: all)
  -t, --threshold T   report words where frequency > T (default: 0)
"""

import sys
from operator import itemgetter
from optparse import OptionParser

# Characters stripped from both ends of every whitespace-separated token.
punct = """'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'"""


def load_stop_words(path="stop_words.txt"):
    """Return the set of stop words listed one per line in *path*.

    Each line is stripped of surrounding whitespace.  The empty string is
    always included so that tokens consisting only of punctuation (which
    become '' after stripping) are ignored by the counter.

    Raises IOError/OSError if *path* cannot be opened.
    """
    stop_words = set([''])
    # 'with' guarantees the file is closed even if reading fails.
    with open(path) as handle:
        for line in handle:
            stop_words.add(line.strip())
    return stop_words


def count_words(lines, stop_words):
    """Return a dict mapping each word in *lines* to its frequency.

    lines      -- iterable of strings (e.g. sys.stdin)
    stop_words -- container of words to skip; compared against the
                  token after punctuation stripping and lower-casing
    """
    counts = {}
    for line in lines:
        for token in line.split():
            word = token.strip(punct).lower()
            if word not in stop_words:
                counts[word] = counts.get(word, 0) + 1
    return counts


def top_words(counts, number=None, threshold=0):
    """Return (word, count) tuples sorted by descending count.

    Only the first *number* entries of the sorted list are considered
    (all of them when *number* is None), and of those only entries with
    count > *threshold* are returned.
    """
    ranked = sorted(counts.items(), key=itemgetter(1), reverse=True)
    return [(word, count) for (word, count) in ranked[:number]
            if count > threshold]


def main(argv=None):
    """Parse options, count words on standard input and print the report.

    argv -- argument list to parse; None means use sys.argv[1:].
    """
    parser = OptionParser()
    parser.add_option('-n', '--number', type="int", default=None,
                      help='number of words to report')
    parser.add_option("-t", "--threshold", type="int", default=0,
                      help="report words where frequency > threshold")
    (options, args) = parser.parse_args(argv)

    stop_words = load_stop_words("stop_words.txt")
    counts = count_words(sys.stdin, stop_words)

    for (word, count) in top_words(counts, options.number, options.threshold):
        # A single pre-formatted argument prints identically under
        # Python 2 and Python 3.
        print("%d %s" % (count, word))


if __name__ == "__main__":
    main()