import re
class StoplistFileError(Exception):
    """Signal that a stoplist file breaks one or more formatting rules.

    Raised by ``StopList.load_file`` when a line of the file, once its
    trailing whitespace is removed, is not made up solely of word characters.
    """
class StopList(object):
    """Case-insensitive collection of stop words, optionally read from a file.

    Words are stored lower-cased and membership tests lower-case the
    candidate, so ``"The" in stoplist`` matches a stored ``"the"``.
    """

    # A valid stoplist line is word characters only. Compiled once here
    # instead of being looked up again for every line of every file.
    _WORD_LINE = re.compile(r'^\w+$', re.UNICODE)

    def __init__(self, file=None):
        """Create an empty stop list, loading ``file`` if one is given.

        Args:
            file: Optional path of a stoplist file; see ``load_file``.
        """
        self.__stopwords = set()
        # Cursor used only by the legacy ``next()`` method; empty until the
        # first ``iter()`` call so ``next()`` simply reports exhaustion.
        self.__legacy_iter = iter(())
        if file:
            self.load_file(file)

    @property
    def wordset(self):
        """Return a copy of the stop words, so callers cannot mutate ours."""
        return self.__stopwords.copy()

    def __iter__(self):
        """Return an independent iterator over a snapshot of the stop words.

        Every call yields its own iterator, so nested or overlapping loops
        over the same stop list do not disturb each other, and words loaded
        while a loop is running do not invalidate it.
        """
        self.__legacy_iter = iter(list(self.__stopwords))
        return self.__legacy_iter

    def next(self):
        """Advance the iterator most recently created by ``iter(self)``.

        Kept for callers written against the Python 2 iterator protocol that
        call this method on the stop list itself.

        Raises:
            StopIteration: when that iterator is exhausted, or when no
                iteration has been started yet.
        """
        # Inside a method body ``next`` resolves to the builtin, not to this
        # method, because class-level names are not in a method's scope.
        return next(self.__legacy_iter)

    def __contains__(self, word):
        """Return True if ``word``, lower-cased, is a stop word."""
        return word.lower() in self.__stopwords

    def load_file(self, file):
        """Add the words listed in a UTF-8 stoplist file, one word per line.

        The load is all-or-nothing: if any line is rejected, none of the
        file's words are added.

        Args:
            file: Path of the file to read.

        Raises:
            StoplistFileError: if a line, after trailing whitespace is
                stripped, is not made up solely of word characters (this
                includes blank lines).
            UnicodeDecodeError: if a line is not valid UTF-8.
        """
        loaded = set()
        # Binary mode plus an explicit decode behaves the same on Python 2
        # and Python 3; ``with`` closes the file even when a line is rejected.
        with open(file, "rb") as stopwords_file:
            for line_number, raw_line in enumerate(stopwords_file, 1):
                word = raw_line.decode('utf-8').rstrip()
                if not self._WORD_LINE.match(word):
                    raise StoplistFileError(
                        "%s:%d: expected a single word, got %r"
                        % (file, line_number, word))
                loaded.add(word.lower())
        # Merge only after the whole file validated.
        self.__stopwords.update(loaded)

    def remove_stopwords(self, words):
        """Return the items of ``words`` that are not stop words.

        Args:
            words: Iterable of strings, or None.

        Returns:
            A new list with the surviving words in their original order and
            original casing, or None when ``words`` is None.
        """
        if words is None:
            return None
        # ``in`` goes through __contains__, which does the lower-casing.
        return [word for word in words if word not in self]
|