# term_test.py
# -*- coding: UTF-8 -*-
import unittest
import os
import sys

from term import StopList, StoplistFileError


class StopListTestCase(unittest.TestCase):
    """Unit tests for ``term.StopList``.

    ``setUp`` writes several small stop-word fixture files into the current
    working directory and ``tearDown`` removes them again, so every test
    starts from the same on-disk state.
    """

    def create_stopwords_file(self, filename, stopwords):
        """Write *stopwords* to *filename*, one entry per line, UTF-8 encoded.

        Args:
            filename: Path of the file to create (overwritten if present).
            stopwords: Iterable of unicode strings; each becomes one line.

        Raises:
            IOError: If the file cannot be opened for writing.
        """
        # Binary mode because the lines are encoded by hand; writing bytes
        # to a text-mode handle is a TypeError on Python 3. The ``with``
        # block closes the handle even when a write fails.
        with open(filename, 'wb') as stopwords_file:
            for word in stopwords:
                stopwords_file.write((word + u"\n").encode('utf-8'))

    def setUp(self):
        """Create the fixture files and the two StopList objects under test."""
        # Plain English stop words; this file backs ``self.stoplist``.
        self.stopwords_filename = 'stopwords.txt'
        self.stopwords = [u'a', u'an', u'the', u'any', u'is', u'from']
        self.create_stopwords_file(self.stopwords_filename, self.stopwords)

        # A second, disjoint word list loaded on top of the first one.
        self.addl_stopwords_filename = 'addl_stopwords.txt'
        self.addl_stopwords = [u'of', u'with', u'to', u'for']
        self.create_stopwords_file(self.addl_stopwords_filename,
                                   self.addl_stopwords)

        # Words containing non-ASCII characters.
        self.non_english_stopwords_filename = 'non_english_stopwords.txt'
        self.non_english_stopwords = [u'ça', u'était', u'à', u'über']
        self.create_stopwords_file(self.non_english_stopwords_filename,
                                   self.non_english_stopwords)

        # Malformed fixture: two words on one line separated by a comma.
        self.comma_stoplist_filename = 'stopwords_comma.txt'
        self.comma_word_lines = [u'a,an']
        self.create_stopwords_file(self.comma_stoplist_filename,
                                   self.comma_word_lines)

        # Malformed fixture: two words on one line separated by a space.
        self.space_stoplist_filename = 'stopwords_space.txt'
        self.space_word_lines = [u"ça était"]
        self.create_stopwords_file(self.space_stoplist_filename,
                                   self.space_word_lines)

        self.empty_stoplist = StopList()
        self.stoplist = StopList(self.stopwords_filename)

    def tearDown(self):
        """Delete every fixture file written by ``setUp``."""
        fixture_files = (
            self.stopwords_filename,
            self.addl_stopwords_filename,
            self.non_english_stopwords_filename,
            self.comma_stoplist_filename,
            self.space_stoplist_filename,
        )
        for filename in fixture_files:
            os.unlink(filename)

    def test_wordset(self):
        """``wordset`` is empty by default and holds the loaded words otherwise."""
        self.assertEqual(self.empty_stoplist.wordset, set())
        self.assertEqual(len(self.stoplist.wordset), len(self.stopwords))
        self.assertTrue(u'an' in self.stoplist.wordset)
        self.assertFalse(u'obama' in self.stoplist.wordset)

    def test_wordset_not_setable(self):
        """Assigning to ``wordset`` is expected to raise AttributeError."""
        try:
            self.stoplist.wordset = set([])
        except AttributeError:
            pass
        except Exception:
            # TestCase.fail() accepts a single message, so the exception
            # type has to be formatted into the string.
            self.fail("Expected AttributeError got: %s"
                      % (sys.exc_info()[0],))
        else:
            self.fail("Property wordset should not be setable")

    def test_iteration(self):
        """Iterating the stop list yields each loaded word exactly once."""
        remaining = self.stopwords[:]
        for word in self.stoplist:
            self.assertTrue(word in remaining)
            # Removing as we go makes a duplicate fail the check above.
            remaining.remove(word)
        self.assertEqual(remaining, [])

    def test_in_operator(self):
        """``in`` finds loaded words, including an upper-case spelling."""
        self.assertTrue(u'an' in self.stoplist)
        self.assertTrue(u'AN' in self.stoplist)
        self.assertFalse(u'obama' in self.stoplist)

    def test_load_file(self):
        """``load_file`` adds a second file's words to those already loaded."""
        self.stoplist.load_file(self.addl_stopwords_filename)
        self.assertEqual(len(self.stoplist.wordset),
                         len(self.stopwords) + len(self.addl_stopwords))
        for word in self.stopwords + self.addl_stopwords:
            self.assertTrue(word in self.stoplist)

    def test_load_nonexistent_file(self):
        """Loading a missing file is expected to raise IOError."""
        self.assertRaises(IOError, self.stoplist.load_file,
                          'this_file_does_not_exist.txt')

    def test_unicode_support(self):
        """Words with non-ASCII characters survive a load and lookup."""
        self.stoplist.load_file(self.non_english_stopwords_filename)
        for word in self.non_english_stopwords:
            self.assertTrue(word in self.stoplist)

    def test_malformed_file(self):
        """Comma- or space-separated lines are expected to raise StoplistFileError."""
        self.assertRaises(StoplistFileError, self.stoplist.load_file,
                          self.comma_stoplist_filename)
        self.assertRaises(StoplistFileError, self.empty_stoplist.load_file,
                          self.space_stoplist_filename)

    def test_remove_stopwords(self):
        """``remove_stopwords`` drops stop words and passes None/[] through."""
        clark_quote = 'Any sufficiently advanced technology is indistinguishable from magic'
        quote_words = clark_quote.split()
        quote_words_sans_stopwords = [u'sufficiently', u'advanced',
                                      u'technology', u'indistinguishable',
                                      u'magic']
        self.assertEqual(quote_words_sans_stopwords,
                         self.stoplist.remove_stopwords(quote_words))
        self.assertTrue(self.stoplist.remove_stopwords(None) is None)
        self.assertEqual([], self.stoplist.remove_stopwords([]))