#!/usr/bin/python
"""Print the frequency of each word found on standard input.

Every line of standard input is split on whitespace.  Each token has
leading and trailing punctuation stripped and is lowercased.  Tokens that
end up empty, or that appear in the stop-word file, are ignored.  The
remaining words are printed in alphabetical order, one per line, as
``word count``.

Stop words are read from ``stop_words.txt`` in the current working
directory, one word per line.
"""

import string
import sys

# punct is the set of characters stripped from both ends of every token.
# string.punctuation holds exactly the ASCII punctuation characters.
punct = string.punctuation

# Stop-word file, resolved relative to the current working directory.
STOP_WORDS_PATH = "stop_words.txt"


def load_stop_words(path):
    """Return the set of stop words listed in the file at *path*.

    The file is expected to hold one word per line.  Surrounding
    whitespace is removed and each entry is lowercased, because input
    words are lowercased before they are compared against this set.
    Blank lines are skipped.

    Raises IOError/OSError if the file cannot be opened.
    """
    stop_words = set()
    # The with-statement closes the file even if reading fails part-way.
    with open(path) as handle:
        for line in handle:
            entry = line.strip().lower()
            if entry:
                stop_words.add(entry)
    return stop_words


def count_words(lines, stop_words):
    """Count the words in *lines*, ignoring anything in *stop_words*.

    *lines* is any iterable of strings (an open file, a list, ...).
    *stop_words* is any container supporting ``in`` with lowercase words.

    Returns a dict mapping each normalised word to its number of
    occurrences.
    """
    freq = {}
    for line in lines:
        for token in line.split():
            word = token.strip(punct).lower()
            # A token made only of punctuation (e.g. "--") strips down to
            # the empty string; that is not a word, so do not count it.
            if not word or word in stop_words:
                continue
            freq[word] = freq.get(word, 0) + 1
    return freq


def main():
    """Read standard input and print word frequencies alphabetically."""
    freq = count_words(sys.stdin, load_stop_words(STOP_WORDS_PATH))
    # sorted() works on a dict under both Python 2 and Python 3, unlike
    # calling .sort() on the result of keys().
    for word in sorted(freq):
        # A single parenthesised argument prints identically whether
        # print is a statement (Python 2) or a function (Python 3).
        print("%s %d" % (word, freq[word]))


if __name__ == "__main__":
    main()