svm.py 11.5 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280
# Natural Language Toolkit: SVM-based classifier
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Leon Derczynski <leon@dcs.shef.ac.uk>
#
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
A classifier based on a support vector machine. This code uses Thorsten Joachims'
SVM^light implementation (http://svmlight.joachims.org/), wrapped using
PySVMLight (https://bitbucket.org/wcauchois/pysvmlight). The default settings are to
train a linear classification kernel, though through minor modification, full SVMlight
capabilities should be accessible if needed. Only binary classification is possible at present.
"""

from nltk.probability import DictionaryProbDist

from nltk.classify.api import ClassifierI

#
# Interface to Support Vector Machine
#

try:
    import svmlight
except:
    raise LookupError("\n\n===========================================================================\n  NLTK was unable to import SVMlight!\n\n  For more information, see <https://bitbucket.org/wcauchois/pysvmlight>\n===========================================================================")

# create a boolean feature name for the SVM from a feature/value pair,
# that'll take on a 1.0 value if the original feature:value is asserted.
def featurename(feature, value):
    """
    :param feature: a string denoting a feature name
    :param value: the value of the feature
    """
    return '|'.join([feature, type(value).__name__, str(value)])

# convert a set of NLTK classifier features to SVMlight format
def map_features_to_svm(features, svmfeatureindex):
    """
    :param features: a dict of features in the format {'feature':value}
    :param svmfeatureindex: a mapping from feature:value pairs to integer SVMlight feature labels
    """
    instancefeatures = []
    # svmlight supports sparse feature sets and so we simply omit features that we don't include
    for k,v in features.iteritems():
        # each feature is represented as an (int, float) tuple where the int is the SVMlight feature label and the float is the value; as we either have or have not a feature, this is 1.0
        # this does not support scalar features - rather, each value that a feature may take on is a discrete independent label
        # use 1.0 as the feature value to specify the presence of a feature:value couple
        svmfeaturename = featurename(k, v)
        if svmfeaturename not in svmfeatureindex:
            # skip over feature:value pairs that were not in the training data and so not included in our mappings
            continue
        instancefeatures.append( (svmfeatureindex[svmfeaturename], 1.0) )
    return instancefeatures


# convert a whole instance (including label) from NLTK to SVMlight format

def map_instance_to_svm(instance, labelmapping, svmfeatureindex):
    """
    :param instance: an NLTK format instance, which is in the tuple format (dict(), label), where the dict contains feature:value pairs, and the label signifies the target attribute's value for this instance (e.g. its class)
    :param labelmapping: a previously-defined dict mapping from text labels in the NLTK instance format to SVMlight labels of either +1 or -1
    @svmfeatureindex: a mapping from feature:value pairs to integer SVMlight feature labels
    """
    (features, label) = instance
    instancefeatures = map_features_to_svm(features, svmfeatureindex)
    return (labelmapping[label], instancefeatures)


class SvmClassifier(ClassifierI):
    """
    A Support Vector Machine classifier. To explain briefly, support
    vector machines (SVM) treat each feature as a dimension, and
    position features in n-dimensional feature space.  An optimal
    hyperplane is then determined that best divides feature space into
    classes, and future instances classified based on which side of
    the hyperplane they lie on, and their proximity to it.

    This implementation is for a binary SVM - that is, only two
    classes are supported. You may achieve perform classification with
    more classes by training an SVM per class and then picking a best
    option for new instances given results from each binary class-SVM.
    """

    def __init__(self, labels, labelmapping, svmfeatures, model=None):
        """
        :param labels: a list of text labels for classes
        :param labelmapping: a mapping from labels to SVM classes (-1,+1)
        :param svmfeatures: a list of svm features, where the index is the integer feature number and the value an feature/value pair
        :param model: the model generated by svmlight.learn()
        """
        self._labels = labels
        self._model = model
        self._labelmapping = labelmapping
        self._svmfeatures = svmfeatures
        # _svmfeatureindex is the inverse of svmfeatures, allowing us
        # to find an SVM feature name (int) given a feature/value
        self._svmfeatureindex = dict(zip(svmfeatures, range(len(svmfeatures))))
        self._verbose = False

    def labels(self):
        """
        Return the list of class labels.
        """
        return self._labels

    def svm_label_name(self, label):
        """
        searches values of _labelmapping to resolve +1 or -1 to a string

        :param label: the string label to look up
        """
        labelname = [k for k, v in self._labelmapping.iteritems() if v == label][0]
        return labelname

    def resolve_prediction(self, prediction):
        """
        resolve a float (in this case, probably from
        svmlight.learn().classify()) to either -1 or +1, and then look
        up the label for that class in _labelmapping, and return the
        text label

        :param prediction: a signed float describing classifier confidence
        """
        classification = cmp(prediction, 0)
        return self.svm_label_name(classification)

    def _get_svm_classification(self, featureset):
        """
        given a set of features, classify them with our trained model
        and return a signed float

        :param featureset: a dict of feature/value pairs in NLTK format, representing a single instance
        """
        instance_to_classify = (0, map_features_to_svm(featureset, self._svmfeatureindex))
        if self._verbose:
            print 'instance', instance_to_classify
        # svmlight.classify expects a list; this should be taken advantage of when writing SvmClassifier.batch_classify / .batch_prob_classify.
        # it returns a list of floats, too.
        [prediction] = svmlight.classify(self._model, [instance_to_classify])
        return prediction

    def prob_classify(self, featureset):
        """
        Return a probability distribution of classifications

        :param featureset: a dict of feature/value pairs in NLTK format, representing a single instance
        """
        if self._model is None:
            raise Exception('This classifier is not yet trained')
            return None

        # do the classification
        prediction = self._get_svm_classification(featureset)
        if self._verbose:
            print 'prediction', prediction

        # lump it into a boolean class, -1 or +1
        predicted_label = cmp(prediction, 0)

        # sometimes the result is not within -1 ... +1; clip it so
        # that it is, and we get a sane-looking probability
        # distribution.  this will upset some results with non-linear
        # partitioning where instance-hyperplane distance can be many
        # orders of magnitude larger; I don't have a fix for that
        if prediction < -1.0:
            prediction = -1.0
        if prediction > 1.0:
            prediction = 1.0

        # if the prediction is negative, then we will maximise the
        # value of the -1 class; otherwise, that of the 1 class will
        # be greater.
        if predicted_label == 1:
            distribution = {str(self.resolve_prediction(1)): prediction,
                            str(self.resolve_prediction(-1)): 1 - prediction}
        else:
            distribution = {str(self.resolve_prediction(1)): prediction + 1,
                            str(self.resolve_prediction(-1)): -prediction}

        return DictionaryProbDist(distribution)


    def classify(self, featureset):
        """
        Use a trained SVM to predict a label given for an unlabelled instance

        :param featureset: a dict of feature/value pairs in NLTK format, representing a single instance
        """
        prediction = self._get_svm_classification(featureset)
        if self._verbose:
            print 'prediction', prediction
        return self.resolve_prediction(prediction)

    @staticmethod
    def train(featuresets):
        """
        given a set of training instances in nltk format:
        [ ( {feature:value, ..}, str(label) ) ]
        train a support vector machine

        :param featuresets: training instances
        """

        # build a unique list of labels
        labels = set()
        for (features, label) in featuresets:
            labels.add(label)

        # this is a binary classifier only
        if len(labels) > 2:
            raise ValueError('Can only do boolean classification (labels: '+ str(labels) + ')')
            return False

        # we need ordering, so a set's no good
        labels = list(labels)

        # next, assign -1 and 1
        labelmapping = {labels[0]:-1, labels[1]:1}

        # now for feature conversion
        # iter through instances, building a set of feature:type:str(value) triples
        svmfeatures = set()
        for (features, label) in featuresets:
            for k,v in features.iteritems():
                svmfeatures.add(featurename(k, v))
        # svmfeatures is indexable by integer svm feature number
        # svmfeatureindex is the inverse (svm feature name -> number)
        svmfeatures = list(svmfeatures)
        svmfeatureindex = dict(zip(svmfeatures, range(len(svmfeatures))))

        # build svm feature set case by case
        svmfeatureset = []
        for instance in featuresets:
            svmfeatureset.append(map_instance_to_svm(instance, labelmapping, svmfeatureindex))

        # train the svm
        # TODO: implement passing of SVMlight parameters from train() to learn()
        return SvmClassifier(labels, labelmapping, svmfeatures, svmlight.learn(svmfeatureset, type='classification'))


def demo():

    def gender_features(word):
        return {'last_letter': word[-1], 'penultimate_letter': word[-2]}

    from nltk.classify import accuracy
    from nltk.corpus import names


    import random
    names = ([(name, 'male') for name in names.words('male.txt')] +
             [(name, 'female') for name in names.words('female.txt')])
    import random
    random.seed(60221023)
    random.shuffle(names)

    featuresets = [(gender_features(n), g) for (n,g) in names]
    train_set, test_set = featuresets[500:], featuresets[:500]

    print '--- nltk.classify.svm demo ---'
    print 'Number of training examples:', len(train_set)
    classifier = SvmClassifier.train(train_set)
    print 'Total SVM dimensions:', len(classifier._svmfeatureindex)
    print 'Label mapping:', classifier._labelmapping
    print '--- Processing an example instance ---'
    print 'Reference instance:', names[0]
    print 'NLTK-format features:\n    ' + str(test_set[0])
    print 'SVMlight-format features:\n    ' + str(map_instance_to_svm(test_set[0], classifier._labelmapping, classifier._svmfeatureindex))
    distr = classifier.prob_classify(test_set[0][0])
    print 'Instance classification and confidence:', distr.max(), distr.prob(distr.max())
    print '--- Measuring classifier performance ---'
    print 'Overall accuracy:', accuracy(classifier, test_set)


if __name__ == '__main__':
    demo()