Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.
Comment: Migrated to Confluence 5.3
Code Block
title: term_test.py

 # -*- coding: UTF-8 -*-

import unittest
import os
import sys
from term import StopList, StoplistFileError
	
class StopListTestCase(unittest.TestCase):
    
    def create_stopwordswords_file(self, filename, stopwordswords):
        stopwordswords_file = open(filename, 'w')
        if not stopwordswords_file:
            raise Exception("Unable to create stop file, " + filename)
        for word in stopwordswords:
            stopwordswords_file.write(word.encode('utf-8') + u"\n".encode('utf-8'))
        stopwordswords_file.close()
        
    def setUp(self):
        self.stopwordswords_filename = 'stopwordswords.txt'
        self.stopwordswords = [u'a', u'an', u'the', u'any', u'is', u'from']
        self.create_stopwordswords_file(self.stopwordswords_filename, self.stopwordswords)

        self.addl_stopwordswords_filename = 'addl_stopwordswords.txt'
        self.addl_stopwordswords =  [u'of', u'with', u'to', u'for']
        self.create_stopwordswords_file(self.addl_stopwordswords_filename, self.addl_stopwordswords)

        self.non_english_stopwordswords_filename = 'non_english_stopwordswords.txt'
        self.non_english_stopwordswords = [u'ça', u'était', u'à', u'über']
        self.create_stopwordswords_file(self.non_english_stopwordswords_filename, self.non_english_stopwordswords)

        self.comma_stoplistin_word_filename = 'stopwordswords_comma.txt'
        self.comma_in_word_lines = [u'a,an']
        self.create_stopwordswords_file(self.comma_stoplistin_word_filename, self.comma_in_word_lines)

        self.space_stoplistin_word_filename = 'stopwordswords_space.txt'
        self.space_in_word_lines = [u"ça était"]
        self.create_stopwordswords_file(self.space_stoplistin_word_filename, self.space_in_word_lines)

        self.empty_stoplist = StopList()
        self.stoplist = StopList(self.stopwordswords_filename)
        
    def tearDown(self):
        os.unlink(self.stopwordswords_filename)
        os.unlink(self.addl_stopwordswords_filename)
        os.unlink(self.non_english_stopwordswords_filename)
        os.unlink(self.comma_in_stoplistword_filename)
        os.unlink(self.space_stoplistin_word_filename)
        
    def test_wordset(self):
        assert self.empty_stoplist.wordset == set([])
        assert len(self.stoplist.wordset) == len(self.stopwordswords)
        assert u'an' in self.stoplist.wordset
        assert u'obama' not in self.stoplist.wordset
        
    def test_wordset_not_setable(self):
        try :
            self.stoplist.wordset = set([])
        except AttributeError:
            pass
        except:
            self.fail("Expected AttributeError got:", sys.exc_info()[0])
        else:
            self.fail("Property wordset should not be setable")
        
    def test_iteration(self):
        stopwordswords = self.stopwordswords[:]
        for word in self.stoplist:
            assert word in stopwordswords
            stopwordswords.remove(word)
        assert len(stopwordswords) == 0

    def test_in_operator(self):
        assert u'an' in self.stoplist
        assert u'AN' in self.stoplist
        assert u'obama' not in self.stoplist
        
    def test_load_file(self):
        self.stoplist.load_file(self.addl_stopwordswords_filename)
        assert len(self.stoplist.wordset) == \
               len(self.stopwordswords) + len(self.addl_stopwordswords)
        for word in self.stopwordswords + self.addl_stopwordswords:
            assert word in self.stoplist
            
    def test_load_nonexistent_file(self):
        self.assertRaises(IOError, 
                          self.stoplist.load_file, 
                          'this_file_does_not_exist.txt')

    def test_unicode_support(self):
        self.stoplist.load_file(self.non_english_stopwordswords_filename)
        for word in self.non_english_stopwordswords:
            assert word in self.stoplist
    
    def test_malformed_file(self):
        self.assertRaises(StoplistFileError, 
                          self.stoplist.load_file, 
                          self.comma_stoplistin_word_filename)
        self.assertRaises(StoplistFileError, 
                          self.empty_stoplist.load_file, 
                          self.space_in_stoplistword_filename)
            
    def test_remove_stopwords(self):
        clark_quote = 'Any sufficiently advanced technology is indistinguishable from magic'
        quote_words = str.split(clark_quote)
        quote_words_sans_stopwords = [u'sufficiently', u'advanced', u'technology', 
                                      u'indistinguishable', u'magic']
        assert quote_words_sans_stopwords == self.stoplist.remove_stopwords(quote_words)
        assert None == self.stoplist.remove_stopwords(None)
        assert [] == self.stoplist.remove_stopwords([])