You are viewing an old version of this page. View the current version.

Compare with Current | View Page History

Version 1 | Next »

import re

class StoplistFileError(Exception):
    """Indicate that a stoplist file breaks one or more formatting rules."""

class StopList(object):
    """A case-insensitive collection of stop words.

    Words are stored lower-cased in a set. ``word in stoplist`` lower-cases
    the candidate before the lookup, and iterating over the instance yields
    the stored (lower-cased) words in arbitrary order.

    NOTE: the instance is its own iterator, so two simultaneous iterations
    over the same StopList (e.g. nested ``for`` loops) share one cursor.
    """

    # A line is valid when it consists solely of one or more word characters.
    # Compiled once because load_file() applies it to every line.
    _WORD_RE = re.compile(r'^\w+$', re.UNICODE)

    def __init__(self, file=None):
        """Create an empty stop list, optionally filled from *file*.

        Args:
            file: Path of a stoplist file handed to load_file(). Nothing is
                loaded when the value is falsy (``None`` or ``""``).

        Raises:
            StoplistFileError: If *file* contains a malformed line.
        """
        self.__stopwords = set()
        # Iteration state; initialised here so next() before iter() simply
        # reports exhaustion instead of failing with AttributeError.
        self.__iter_index = 0
        self.__stopwords_iter = []
        if file:
            self.load_file(file)

    @property
    def wordset(self):
        """Return a copy of the stored stop words as a set.

        A copy is returned so callers cannot mutate the internal set.
        """
        return self.__stopwords.copy()

    def __iter__(self):
        """Restart iteration over a snapshot of the stop words.

        The snapshot is a list copy of the set, so words added while the
        iteration is running are not seen by it.
        """
        self.__iter_index = 0
        self.__stopwords_iter = list(self.__stopwords)
        return self

    def next(self):
        """Return the next stop word of the current iteration.

        Raises:
            StopIteration: When every word of the snapshot has been returned.
        """
        if self.__iter_index >= len(self.__stopwords_iter):
            raise StopIteration
        next_item = self.__stopwords_iter[self.__iter_index]
        self.__iter_index += 1
        return next_item

    # Python 3 looks up __next__; ``next`` is kept for Python 2 callers.
    __next__ = next

    def __contains__(self, word):
        """Return True if *word*, compared case-insensitively, is a stop word."""
        return word.lower() in self.__stopwords

    def load_file(self, file):
        """Add the stop words listed in *file* to this stop list.

        The file is read as UTF-8 with one word per line. Trailing whitespace
        is stripped from each line and the word is stored lower-cased. The
        stop list is only updated once the whole file has been validated, so
        a malformed file leaves the existing words untouched.

        Args:
            file: Path of the stoplist file.

        Raises:
            StoplistFileError: If a line, after stripping trailing
                whitespace, is not made up solely of word characters (this
                includes blank lines).
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        loaded = set()
        # Read bytes and decode explicitly: this gives the same text on
        # Python 2 and Python 3 and does not depend on the locale encoding.
        # The ``with`` block closes the file even when a line is rejected.
        with open(file, "rb") as stopwords_file:
            for line_number, line in enumerate(stopwords_file, 1):
                utf8_line = line.decode('utf-8').rstrip()
                if not self._WORD_RE.match(utf8_line):
                    raise StoplistFileError(
                        "%s, line %d: expected a single word, got %r"
                        % (file, line_number, utf8_line))
                loaded.add(utf8_line.lower())
        self.__stopwords.update(loaded)

    def remove_stopwords(self, words):
        """Return the items of *words* that are not stop words.

        Args:
            words: Iterable of strings, or ``None``.

        Returns:
            A new list holding the non-stop-word items in their original
            order and original casing, or ``None`` when *words* is ``None``.
        """
        if words is None:
            return None
        # __contains__ already lower-cases each candidate.
        return [word for word in words if word not in self]

  • No labels