Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.
Comment: Migrated to Confluence 5.3
Code Block
titleterm.py


import re

class StoplistFileError(Exception):
    """Signal that a stoplist file does not follow the required format.

    Raised when the contents of a stoplist file break one or more of the
    formatting rules.
    """

class StopList(object):
    """A case-insensitive collection of stop words.

    Words are stored lower-cased.  Membership tests (``word in stoplist``)
    and :meth:`remove_stopwords` lower-case the candidate before comparing
    it with the stored words.  Iterating over a StopList yields every stored
    word once, in no particular order.
    """

    # A valid stoplist line is exactly one run of word characters.
    # Compiled once here instead of once per line read.
    _WORD_LINE = re.compile(r'^\w+$', re.UNICODE)

    def __init__(self, file=None):
        """Create an empty stoplist and load *file* if one is given.

        file -- optional path of a stoplist file; see :meth:`load_file`.
                (The name shadows a builtin but is kept because callers may
                pass it as a keyword argument.)

        Raises StoplistFileError if the file is given and is malformed.
        """
        self.__stopwords = set()
        # Iterator most recently handed out by __iter__; it only exists so
        # that the legacy next()/__next__ methods keep working.
        self.__cursor = iter(())
        if file:
            self.load_file(file)

    @property
    def wordset(self):
        """Return the stored words as a new set (a copy, safe to modify)."""
        return self.__stopwords.copy()

    def __iter__(self):
        """Return an iterator over a snapshot of the stored words.

        Each call produces its own independent iterator, so nested or
        overlapping loops over the same StopList do not disturb each other,
        and loading more words mid-loop does not affect a running loop.
        """
        self.__cursor = iter(list(self.__stopwords))
        return self.__cursor

    def __next__(self):
        """Advance the iterator most recently returned by ``iter(self)``.

        Kept so that code calling ``stoplist.next()`` directly continues to
        work.  Raises StopIteration once that iterator is exhausted.
        """
        return next(self.__cursor)

    next = __next__  # Python 2 spelling of the same method

    def __contains__(self, word):
        """Return True if *word*, lower-cased, is one of the stored words."""
        return word.lower() in self.__stopwords

    def load_file(self, file):
        """Add the words listed in the stoplist file at path *file*.

        The file is decoded as UTF-8 and must contain exactly one word per
        line.  Trailing whitespace on each line is ignored and every word is
        stored lower-cased.

        Raises StoplistFileError at the first line that is not a single run
        of word characters (a blank line counts as malformed).  In that case
        none of the words from the file are added.
        """
        # Local import keeps this class self-contained; io.open decodes
        # UTF-8 the same way on Python 2 and Python 3.
        import io

        loaded = set()
        # The with-block closes the file even when a bad line aborts the load.
        with io.open(file, "r", encoding="utf-8") as stopwords_file:
            for line_number, line in enumerate(stopwords_file, 1):
                word = line.rstrip()
                if not self._WORD_LINE.match(word):
                    raise StoplistFileError(
                        "%s, line %d: expected a single word, got %r"
                        % (file, line_number, word))
                loaded.add(word.lower())
        # Commit only after the whole file validated, so a failed load never
        # leaves the stoplist partially updated.
        self.__stopwords.update(loaded)

    def remove_stopwords(self, words):
        """Return a list of the items in *words* that are not stop words.

        Order and original casing of the kept items are preserved.  Returns
        None when *words* is None.
        """
        if words is None:
            return None
        return [word for word in words
                if word.lower() not in self.__stopwords]