Code Block | ||
---|---|---|
| ||
import re |
...
class StoplistFileError(Exception):
    """Raised when a stoplist file breaks one or more formatting rules."""
...
class StopList(object):
    """A case-insensitive collection of stop words.

    Words are stored lower-cased, and membership tests lower-case the
    candidate before looking it up.  A stop list can be filled from a
    UTF-8 encoded text file holding one word per line.
    """

    def __init__(self, file=None):
        """Create an empty stop list, optionally loading words from *file*.

        Args:
            file: Path of a stoplist file to load.  Nothing is loaded when
                the value is ``None`` (or otherwise falsy).
        """
        self.__stopwords = set()
        if file:
            self.load_file(file)

    @property
    def wordset(self):
        """Return a copy of the stored (lower-cased) stop words as a set."""
        return self.__stopwords.copy()

    def __iter__(self):
        """Return an iterator over a snapshot of the current stop words.

        Each call yields an independent iterator, so nested or overlapping
        loops over the same stop list do not reset one another.
        """
        # Keep a handle on the newest iterator so that the legacy ``next()``
        # method below still has something to advance.
        self.__stopwords_iter = iter(list(self.__stopwords))
        return self.__stopwords_iter

    def next(self):
        """Advance the iterator most recently created by ``iter(self)``.

        Kept for callers that invoke ``next()`` on the stop list itself.

        Raises:
            StopIteration: When that iterator is exhausted.
        """
        return next(self.__stopwords_iter)

    # Python 3 spelling of the iterator-advance method.
    __next__ = next

    def __contains__(self, word):
        """Return True if the lower-cased *word* is a stored stop word."""
        return word.lower() in self.__stopwords

    def load_file(self, file):
        """Add every word listed in the stoplist file *file*.

        The file is read as UTF-8 and must contain exactly one word per
        line, consisting only of word characters (``\\w``).  Trailing
        whitespace on a line is ignored; blank lines and lines with leading
        or embedded whitespace are rejected.  Words are stored lower-cased.

        Words read before an offending line remain in the stop list.

        Args:
            file: Path of the stoplist file.

        Raises:
            StoplistFileError: If a line is not a single word.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        # Binary mode plus an explicit decode behaves the same on Python 2
        # and Python 3; ``with`` closes the file even when a line is invalid.
        with open(file, "rb") as stopwords_file:
            for raw_line in stopwords_file:
                entry = raw_line.decode("utf-8").rstrip()
                if not re.match(r"^\w+$", entry, re.UNICODE):
                    raise StoplistFileError(
                        "invalid stoplist entry %r in %s" % (entry, file)
                    )
                self.__stopwords.add(entry.lower())

    def remove_stopwords(self, words):
        """Return the items of *words* that are not stop words.

        Args:
            words: An iterable of strings, or ``None``.

        Returns:
            A new list with the non-stop-words in their original order and
            original casing, or ``None`` when *words* is ``None``.
        """
        if words is None:
            return None
        # ``in self`` goes through __contains__, which lower-cases the word.
        return [word for word in words if word not in self]