# Problem definition

In this post, we use the Stanford Large Movie Review dataset to perform simple binary sentiment classification, labelling each review as positive or negative. We keep things simple, as the aim here is only to investigate the performance of various Scikit-learn classifiers on three different training-testing splits.

# Preprocessing and feature vector generation

For preprocessing, we only perform punctuation removal, stopword removal, conversion to lowercase, and tokenization to create a bag-of-words model. We only use the first 5,000 positive reviews and the first 5,000 negative reviews. The 5,000 most frequent words are used as the features: each review is represented as a dictionary of key-value pairs, where the keys are these words and each value is a boolean that is True if the word occurs in the review.

# Training-testing splits and classifiers used

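We test on three different training-testing set splits of 50:50, 75:25, and 90:10. The classifiers used are:

- NLTK Naive Bayes
- Multinomial Naive Bayes
- Bernoulli Naive Bayes
- Logistic Regression
- Stochastic Gradient Descent (SGD)
- Support Vector Classifier (SVC)
- Linear SVC
- Nu-SVC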

# The full code:

```python
import nltk
import time
import random
import re
import glob
from nltk.corpus import stopwords
# import classifiers
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.tokenize import word_tokenize

def _read_reviews(pattern, limit=5000):
    """Return the text of at most `limit` files matching the glob `pattern`.

    The file *contents* are returned (not the paths), because every later
    step -- cleaning, tokenizing, feature extraction -- works on review text.
    """
    reviews = []
    for path in glob.glob(pattern)[:limit]:
        # `with` guarantees the handle is closed even if reading fails
        with open(path, encoding="utf8") as f:
            reviews.append(f.read())
    return reviews


def _preprocess(text, stop_words):
    """Convert raw review text into a list of lowercase, stopword-free tokens."""
    # punctuation removal: keep only letters and whitespace
    cleaned_doc = re.sub(r'[^a-zA-Z\s]', '', text)
    # tokenization + conversion to lowercase; lowercasing happens BEFORE the
    # stopword check so that capitalised stopwords ("The", "And") are removed too
    tokens = (token.lower() for token in word_tokenize(cleaned_doc))
    # stopwords removal
    return [token for token in tokens if token not in stop_words]


def main():
    """Benchmark NLTK / scikit-learn classifiers on a 50:50 train-test split.

    Reads up to 5000 positive and 5000 negative reviews from ``test/pos`` and
    ``test/neg``, builds boolean bag-of-words features over the 5000 most
    frequent words, shuffles the samples, and prints the accuracy (in
    percent) of each classifier on the held-out half.
    """
    import os  # local import: only used to build OS-independent glob patterns

    pos_dataset = _read_reviews(os.path.join('test', 'pos', '*.txt'))  # dir of pos docs
    neg_dataset = _read_reviews(os.path.join('test', 'neg', '*.txt'))  # dir of neg docs

    # define stopwords (a set gives O(1) membership tests)
    stop_words = set(stopwords.words('english'))

    all_words = []
    docs = []  # (set of tokens in the review, label) pairs

    ### preprocessing of pos and neg docs
    for label, dataset in (("pos", pos_dataset), ("neg", neg_dataset)):
        for doc in dataset:
            tokens = _preprocess(doc, stop_words)
            # Keep the preprocessed tokens so the features are computed on the
            # same normalised text that the vocabulary was built from.
            docs.append((set(tokens), label))
            all_words.extend(tokens)

    # taking words and generating a frequency distribution
    all_words = nltk.FreqDist(all_words)

    # using the most frequent 5000 words as the features vector for each review
    # (most_common sorts by count; plain .keys() would be insertion order)
    word_features = [word for word, _count in all_words.most_common(5000)]

    # creating a features vector for each review: a dictionary whose keys are
    # the feature words and whose values are True if the word exists in the doc
    featuresets = [
        ({word: (word in tokens) for word in word_features}, category)
        for (tokens, category) in docs
    ]

    # always good to shuffle the docs
    random.shuffle(featuresets)

    #defining the train-test sets
    training_set = featuresets[:5000] #setting the training set to 50% of the samples
    testing_set = featuresets[5000:] #setting the testing set to the remaining 50%

    # performing classification using various classifiers
    print("Benchmarking classifiers using {} training samples and {} testing samples...\n".format(len(training_set), len(testing_set)))

    NLTKNB_classifier = nltk.NaiveBayesClassifier.train(training_set)
    print("NLTK NB Classifier accuracy percent:", (nltk.classify.accuracy(NLTKNB_classifier, testing_set))*100)

    # scikit-learn estimators, each wrapped so NLTK can train and score them;
    # the labels are the exact prefixes printed in the results
    sklearn_estimators = [
        ("MNB_classifier", MultinomialNB()),                       # multinomial naive bayes
        ("BernoulliNB_classifier", BernoulliNB()),                 # Bernoulli naive bayes
        ("LogisticRegression_classifier", LogisticRegression()),   # logistic regression
        ("SGDClassifier_classifier", SGDClassifier()),             # SGD
        ("SVC_classifier", SVC()),                                 # support vector classifier
        ("LinearSVC_classifier", LinearSVC()),                     # linear version of SVC
        ("NuSVC_classifier", NuSVC()),                             # Nu support vector classifier
    ]
    for label, estimator in sklearn_estimators:
        classifier = SklearnClassifier(estimator)
        classifier.train(training_set)
        print(label + " accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)

if __name__ == '__main__':

    # remember when we started so the total run time can be reported
    startExecutionTime = time.time()

    main()

    # get execution time (elapsed seconds converted to minutes)
    print("~~~~~-Execution time: %s Min~~~~~" % ((time.time() - startExecutionTime)/60))
```

# Output

The output is shown below. The training-testing split was 50:50 in this case. The accuracy is fairly similar across all classifiers: the highest is 82.52% for the NuSVC classifier, followed by Logistic Regression (82.04%) and then Multinomial NB (80.82%). The entire process took about 20 minutes. The accuracy varies slightly between runs because the documents are shuffled.

```text
Benchmarking classifiers using 5000 training samples and 5000 testing samples...
NLTK NB Classifier accuracy percent: 80.04
MNB_classifier accuracy percent: 80.82000000000001
BernoulliNB_classifier accuracy percent: 80.02
LogisticRegression_classifier accuracy percent: 82.04
SGDClassifier_classifier accuracy percent: 80.36
SVC_classifier accuracy percent: 76.58
LinearSVC_classifier accuracy percent: 78.5
NuSVC_classifier accuracy percent: 82.52000000000001
~~~~~-Execution time: 20.05739815235138 Min~~~~~
```

The same code above was run two more times, with a training-testing split of 75:25 and 90:10, as shown below.

```python
    #defining the train-test sets
    training_set = featuresets[:7500] #setting the training set to 75% of the samples
    testing_set = featuresets[7500:] #setting the testing set to 25% of the samples
```
```python
    #defining the train-test sets
    training_set = featuresets[:9000] #setting the training set to 90% of the samples
    testing_set = featuresets[9000:] #setting the testing set to 10% of the samples
```

The output is shown below. Surprisingly, the classifiers perform roughly the same even after we increase the number of training samples. This may be because we only use the 5,000 most frequent words to generate the feature vector for each review.

```text
Benchmarking classifiers using 7500 training samples and 2500 testing samples...
NLTK NB Classifier accuracy percent: 80.24
MNB_classifier accuracy percent: 80.84
BernoulliNB_classifier accuracy percent: 80.0
LogisticRegression_classifier accuracy percent: 81.52000000000001
SGDClassifier_classifier accuracy percent: 79.60000000000001
SVC_classifier accuracy percent: 77.92
LinearSVC_classifier accuracy percent: 79.67999999999999
NuSVC_classifier accuracy percent: 81.88
~~~~~-Execution time: 16.24495245218277 Min~~~~~
```
```text
Benchmarking classifiers using 9000 training samples and 1000 testing samples...
NLTK NB Classifier accuracy percent: 79.9
MNB_classifier accuracy percent: 81.6
BernoulliNB_classifier accuracy percent: 80.2
LogisticRegression_classifier accuracy percent: 82.69999999999999
SGDClassifier_classifier accuracy percent: 81.10000000000001
SVC_classifier accuracy percent: 78.8
LinearSVC_classifier accuracy percent: 78.60000000000001
NuSVC_classifier accuracy percent: 84.39999999999999
~~~~~-Execution time: 14.92531372308731 Min~~~~~
```

# Spaghetti plot

The spaghetti plot was created using matplotlib. The values were entered manually, just to show how the classifiers perform as the number of training samples increases, and which classifiers perform better than others in this case. The code and plots are shown below. According to the plots, NuSVC performs best, followed by Logistic Regression, and finally Multinomial Naive Bayes.

```python
import matplotlib.pyplot as plt
# Training-set sizes shared by every curve (the 50:50, 75:25 and 90:10 splits).
training_samples = [5000, 7500, 9000]

# Accuracy (%) of each classifier at each training-set size, entered by hand
# from the printed benchmark output and rounded to two decimals.
accuracies = {
    "NLTK NB": [80.04, 80.24, 79.9],
    "Multinomial NB": [80.82, 80.84, 81.6],
    "Bernoulli NB": [80.02, 80, 80.2],
    "Log Reg": [82.04, 81.52, 82.7],
    "SGD": [80.36, 79.6, 81.1],
    "SVC": [76.58, 77.92, 78.8],
    "LinearSVC": [78.5, 79.68, 78.6],
    "NuSVC": [82.52, 81.88, 84.4],
}

# One line per classifier; dicts preserve insertion order, so the legend
# lists the classifiers in the order given above.
for label, accuracy in accuracies.items():
    plt.plot(training_samples, accuracy, label=label)

plt.xlabel('Training samples')
plt.ylabel('Accuracy')
plt.title('Classification accuracy vs training samples ')
plt.legend()
plt.show()
```

That’s it! Now you can use your own training data to investigate which classifier works best with it, and how much training data you need to achieve the desired accuracy. 🙂

3