# term.py
import io
import re

# A valid stoplist line is exactly one run of (Unicode) word characters.
_STOPWORD_RE = re.compile(r'^\w+$', re.UNICODE)


class StoplistFileError(Exception):
    """Used to indicate that a stoplist file breaks one or more formatting rules."""


class StopList(object):
    """A case-insensitive set of stopwords, optionally loaded from a file.

    Words are stored lower-cased. Membership tests (``word in stoplist``)
    lower-case the probed word before looking it up.

    Iteration caveat: the iteration cursor lives on the instance itself
    (``iter(stoplist)`` returns ``stoplist``), so two simultaneous loops over
    the same ``StopList`` share one cursor. Iterate over ``wordset`` when
    independent iterators are needed.
    """

    def __init__(self, file=None):
        """Create an empty stoplist and, if ``file`` is truthy, load it.

        ``file`` is a path to a UTF-8 stoplist file (see ``load_file``).
        """
        self.__stopwords = set()
        if file:
            self.load_file(file)

    @property
    def wordset(self):
        """Return a copy of the stored (lower-cased) stopwords as a set."""
        return self.__stopwords.copy()

    def __iter__(self):
        """Reset the cursor over a snapshot of the stopwords; return self."""
        self.__iter_index = 0
        # Snapshot so that words added during iteration do not disturb it.
        self.__stopwords_iter = list(self.__stopwords)
        return self

    def next(self):
        """Return the next stopword of the current iteration.

        Raises StopIteration once the snapshot taken by ``__iter__`` is
        exhausted.
        """
        if self.__iter_index >= len(self.__stopwords_iter):
            raise StopIteration
        next_item = self.__stopwords_iter[self.__iter_index]
        self.__iter_index += 1
        return next_item

    # Python 3 spells the iterator protocol method ``__next__``; ``next`` is
    # kept so existing Python 2 style callers continue to work.
    __next__ = next

    def __contains__(self, word):
        """Return True if ``word``, lower-cased, is a stored stopword."""
        return word.lower() in self.__stopwords

    def load_file(self, file):
        """Add the stopwords listed in the UTF-8 text file at path ``file``.

        Each line, after stripping trailing whitespace, must consist solely
        of word characters; the word is stored lower-cased.

        Raises StoplistFileError on the first line that breaks that rule
        (blank lines included). Words read before the offending line remain
        loaded. The file is closed whether or not loading succeeds.
        """
        # io.open decodes at the I/O boundary and behaves the same on
        # Python 2 and Python 3; ``with`` closes the file even on error.
        with io.open(file, "r", encoding="utf-8") as stopwords_file:
            for line_number, line in enumerate(stopwords_file, 1):
                word = line.rstrip()
                if not _STOPWORD_RE.match(word):
                    raise StoplistFileError(
                        "%s: line %d is not a single word: %r"
                        % (file, line_number, word)
                    )
                self.__stopwords.add(word.lower())

    def remove_stopwords(self, words):
        """Return a list of the items of ``words`` that are not stopwords.

        Order and original casing are preserved. Returns None when ``words``
        is None.
        """
        if words is None:
            return None
        # ``in self`` already lower-cases the word (see __contains__).
        return [word for word in words if word not in self]