# -*- coding: UTF-8 -*-
import unittest
import os
import sys
from term import StopList, StoplistFileError
class StopListTestCase(unittest.TestCase):
    """Unit tests for ``term.StopList``.

    ``setUp`` writes a handful of small stop-word fixture files into the
    current working directory and ``tearDown`` deletes them again, so every
    test starts from the same on-disk state.
    """

    def create_stopwordswords_file(self, filename, stopwordswords):
        """Write each word in *stopwordswords* to *filename*, one per line.

        The words are expected to be unicode strings; they are stored
        UTF-8 encoded and each is terminated by a single ``\\n``.

        Raises whatever ``open``/``write`` raise if the file cannot be
        created or written.
        """
        # Binary mode because the payload is already-encoded bytes (a
        # text-mode handle rejects bytes on Python 3). The ``with`` block
        # closes the handle even when a write fails part-way through.
        with open(filename, 'wb') as stopwords_file:
            for word in stopwordswords:
                stopwords_file.write(word.encode('utf-8') + b"\n")

    def setUp(self):
        """Create the fixture files and the two stop lists under test."""
        # Base English stop words, loaded into ``self.stoplist`` below.
        self.stopwordswords_filename = 'stopwordswords.txt'
        self.stopwordswords = [u'a', u'an', u'the', u'any', u'is', u'from']
        self.create_stopwordswords_file(self.stopwordswords_filename,
                                        self.stopwordswords)

        # Extra words used to check loading a second file into a list.
        self.addl_stopwordswords_filename = 'addl_stopwordswords.txt'
        self.addl_stopwordswords = [u'of', u'with', u'to', u'for']
        self.create_stopwordswords_file(self.addl_stopwordswords_filename,
                                        self.addl_stopwordswords)

        # Non-ASCII words used to check unicode handling.
        self.non_english_stopwordswords_filename = \
            'non_english_stopwordswords.txt'
        self.non_english_stopwordswords = [u'ça', u'était', u'à', u'über']
        self.create_stopwordswords_file(
            self.non_english_stopwordswords_filename,
            self.non_english_stopwordswords)

        # Malformed fixture: a comma inside a single line.
        self.comma_stoplistin_word_filename = 'stopwordswords_comma.txt'
        self.comma_in_word_lines = [u'a,an']
        self.create_stopwordswords_file(self.comma_stoplistin_word_filename,
                                        self.comma_in_word_lines)

        # Malformed fixture: a space inside a single line.
        self.space_stoplistin_word_filename = 'stopwordswords_space.txt'
        self.space_in_word_lines = [u"ça était"]
        self.create_stopwordswords_file(self.space_stoplistin_word_filename,
                                        self.space_in_word_lines)

        self.empty_stoplist = StopList()
        self.stoplist = StopList(self.stopwordswords_filename)

    def tearDown(self):
        """Delete every fixture file created by ``setUp``."""
        fixture_filenames = (
            self.stopwordswords_filename,
            self.addl_stopwordswords_filename,
            self.non_english_stopwordswords_filename,
            self.comma_stoplistin_word_filename,
            self.space_stoplistin_word_filename,
        )
        for filename in fixture_filenames:
            # Tolerate an already-missing file so that one absent fixture
            # does not prevent the remaining ones from being cleaned up.
            if os.path.exists(filename):
                os.unlink(filename)

    def test_wordset(self):
        """``wordset`` is empty by default and holds the loaded words."""
        self.assertEqual(self.empty_stoplist.wordset, set([]))
        self.assertEqual(len(self.stoplist.wordset),
                         len(self.stopwordswords))
        self.assertTrue(u'an' in self.stoplist.wordset)
        self.assertTrue(u'obama' not in self.stoplist.wordset)

    def test_wordset_not_setable(self):
        """Assigning to ``wordset`` must raise ``AttributeError``."""
        # ``setattr`` performs the same attribute assignment as
        # ``self.stoplist.wordset = ...`` but in a form assertRaises can call.
        self.assertRaises(AttributeError,
                          setattr, self.stoplist, 'wordset', set([]))

    def test_iteration(self):
        """Iterating the stop list yields each loaded word exactly once."""
        remaining = self.stopwordswords[:]
        for word in self.stoplist:
            self.assertTrue(word in remaining)
            remaining.remove(word)
        # Anything left over was loaded from the file but never yielded.
        self.assertEqual(remaining, [])

    def test_in_operator(self):
        """``in`` matches loaded words, including an upper-cased form."""
        self.assertTrue(u'an' in self.stoplist)
        self.assertTrue(u'AN' in self.stoplist)
        self.assertTrue(u'obama' not in self.stoplist)

    def test_load_file(self):
        """Loading a second file adds its words to the existing ones."""
        self.stoplist.load_file(self.addl_stopwordswords_filename)
        self.assertEqual(
            len(self.stoplist.wordset),
            len(self.stopwordswords) + len(self.addl_stopwordswords))
        for word in self.stopwordswords + self.addl_stopwordswords:
            self.assertTrue(word in self.stoplist)

    def test_load_nonexistent_file(self):
        """Loading a path that does not exist raises ``IOError``."""
        self.assertRaises(IOError,
                          self.stoplist.load_file,
                          'this_file_does_not_exist.txt')

    def test_unicode_support(self):
        """Non-ASCII words survive the round trip through a UTF-8 file."""
        self.stoplist.load_file(self.non_english_stopwordswords_filename)
        for word in self.non_english_stopwordswords:
            self.assertTrue(word in self.stoplist)

    def test_malformed_file(self):
        """Lines containing a comma or a space raise StoplistFileError."""
        self.assertRaises(StoplistFileError,
                          self.stoplist.load_file,
                          self.comma_stoplistin_word_filename)
        self.assertRaises(StoplistFileError,
                          self.empty_stoplist.load_file,
                          self.space_stoplistin_word_filename)

    def test_remove_stopwords(self):
        """``remove_stopwords`` drops stop words and passes None/[] through."""
        clark_quote = ('Any sufficiently advanced technology is '
                       'indistinguishable from magic')
        quote_words = clark_quote.split()
        quote_words_sans_stopwords = [u'sufficiently', u'advanced',
                                      u'technology', u'indistinguishable',
                                      u'magic']
        self.assertEqual(quote_words_sans_stopwords,
                         self.stoplist.remove_stopwords(quote_words))
        self.assertEqual(None, self.stoplist.remove_stopwords(None))
        self.assertEqual([], self.stoplist.remove_stopwords([]))
|