Using Machine Learning To Cluster Trump, Bernie and Biden Speeches


My goal for this project is to demonstrate how K-means clustering can be used to group a corpus of text documents. In this example, I use speech transcripts from each politician, sourced from https://www.rev.com/.

In [124]:
# Imports
import nltk
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import docx2txt
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
plt.style.use('seaborn')
%matplotlib inline
nltk.download('punkt')
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/Dhutchings/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Out[124]:
True
In [125]:
# Loading in text documents

trump1 = open('/Users/Dhutchings/Downloads/data-society-major-speeches-by-donald-trump/speech_41.txt','r').read()
trump2 = open('/Users/Dhutchings/Downloads/data-society-major-speeches-by-donald-trump/speech_55.txt','r').read()
trump3 = open('/Users/Dhutchings/Downloads/data-society-major-speeches-by-donald-trump/speech_54.txt','r').read()

biden_1 = docx2txt.process("/Users/Dhutchings/Documents/biden_speeches/biden_speech1.docx")
biden_2 = docx2txt.process("/Users/Dhutchings/Documents/biden_speeches/biden_speech2.docx")

bernie_1 = docx2txt.process('/Users/Dhutchings/Documents/bernie_speeches/bernie_speech1.docx')
bernie_2 = docx2txt.process('/Users/Dhutchings/Documents/bernie_speeches/berni_speech2.docx')
bernie_3 = docx2txt.process('/Users/Dhutchings/Documents/bernie_speeches/bernie_speech3.docx')
In [126]:
# Defining the file list that will be iterated through later on

file_list = [trump1,trump2,trump3,biden_1,biden_2,bernie_1,bernie_2,bernie_3]
In [127]:
def preprocess(s, lowercase=True, strip_punctuation=True):
    """
    Input:
        string s
        boolean lowercase
        boolean strip_punctuation
    Return:
        list of strings
    """
    stopwords = set(STOPWORDS) 
    punctuation = '.,?<>:;"\'!%]\[--`/'
    if isinstance(s, str):
        s = word_tokenize(s)
    if lowercase:
        s = [t.lower() for t in s]
    if strip_punctuation:
        s = [t.strip(punctuation) for t in s]
    s = [t for t in s if t not in stopwords and t not in ['',"'",'1']]
    return s
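As a quick illustrative check (not part of the original pipeline), the helper can be called on a short made-up sentence; the exact output depends on the wordcloud stopword list:

# Illustrative example of preprocess on a made-up sentence
preprocess("We're going to rebuild this country, and we're going to do it together!")
# -> roughly ['re', 'going', 'rebuild', 'country', 're', 'going', 'together']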
In [128]:
def token_frequency(tokens=None, tf=None, relative=False):
    """
    Input:
        tokens = list of strings or None
        tf = dict or None (an existing frequency dict to update)
        relative = boolean
    Return:
        dictionary of token frequencies
    """
    # Avoid a shared mutable default argument by creating the dict here
    if tf is None:
        tf = {}
    for t in tokens:
        if t in tf:
            tf[t] += 1
        else:
            tf[t] = 1
    if relative:
        total = sum(tf.values())
        tf = {t: tf[t] / total for t in tf}
    return tf
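For a single document, passing relative=True yields proportions that sum to 1; a small illustrative call with hypothetical tokens (not drawn from the corpus):

# Illustrative example of token_frequency with made-up tokens
token_frequency(['country', 'people', 'country'], relative=True)
# -> {'country': 0.666..., 'people': 0.333...}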
In [129]:
# Creating term frequency dictionary of every word in corpus

tf = {}
for f in file_list:
    s = preprocess(f)
    tf = token_frequency(s,tf=tf,relative=True)
In [130]:
# Selecting the 50 most frequent words in the corpus
#   (the [1:51] slice skips the single most frequent token)

top_50 = sorted(tf.items(), key=lambda x: x[1], reverse=True)[1:51]
In [131]:
# Creating list of the top 50 most frequent words in corpus

features = [k for k,v in top_50]
In [132]:
# Creating a list of term-frequency dictionaries, one per speech,
#   reflecting each word's relative frequency within that speech

vectors = [token_frequency(preprocess(f), tf={}, relative=True) for f in file_list]
In [133]:
# Filtering for words that are in the list of top 50 words in corpus

vectors = [{key:v[key] for key in v if key in features} for v in vectors]
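By construction, every key that survives this filter is one of the 50 shared features; a quick illustrative sanity check:

# Illustrative check: each remaining key should be one of the top-50 features
all(key in features for v in vectors for key in v)  # -> True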
In [134]:
# Defining labels for dataframe in next cell

labels = ['trump','trump','trump','biden','biden','bernie','bernie','bernie']
In [135]:
# Building a labeled feature matrix: one row per speech, one column per
#   top-50 word; fillna(0) gives words absent from a speech a frequency of 0
vectors_df = pd.DataFrame(vectors, index=labels, columns=features).fillna(0)
vectors_df
Out[135]:
re going country people t s now us believe tired ... talk half million healthcare power hearing right know friends united
trump 0.004284 0.025707 0.011140 0.004284 0.000000 0.013710 0.002571 0.001714 0.002571 0.000000 ... 0.000000 0.000857 0.000857 0.000857 0.000000 0.000000 0.001714 0.000000 0.000000 0.007712
trump 0.009639 0.008434 0.010843 0.020482 0.000000 0.025301 0.006024 0.008434 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.002410 0.010843 0.000000 0.003614
trump 0.000000 0.010161 0.002540 0.006774 0.000000 0.015241 0.003387 0.002540 0.000000 0.000847 ... 0.000000 0.000847 0.000847 0.000847 0.002540 0.000000 0.001693 0.000000 0.000000 0.001693
biden 0.015419 0.009912 0.008811 0.011013 0.014317 0.041850 0.002203 0.006608 0.005507 0.000000 ... 0.000000 0.000000 0.001101 0.000000 0.000000 0.000000 0.000000 0.001101 0.000000 0.003304
biden 0.009914 0.001322 0.004627 0.005948 0.014541 0.031064 0.003305 0.005948 0.006609 0.000000 ... 0.000000 0.000000 0.000661 0.000000 0.003966 0.000000 0.002644 0.001322 0.000000 0.003305
bernie 0.019190 0.023454 0.008529 0.012793 0.008529 0.008529 0.006397 0.014925 0.000000 0.002132 ... 0.000000 0.004264 0.002132 0.002132 0.000000 0.000000 0.004264 0.004264 0.000000 0.004264
bernie 0.021127 0.031690 0.016432 0.016432 0.004695 0.002347 0.005869 0.009390 0.004695 0.000000 ... 0.001174 0.000000 0.000000 0.000000 0.000000 0.000000 0.007042 0.007042 0.004695 0.003521
bernie 0.026758 0.017584 0.017584 0.016820 0.012232 0.010703 0.009939 0.009174 0.009174 0.009174 ... 0.003823 0.003823 0.003823 0.003823 0.003823 0.003823 0.003058 0.003058 0.003058 0.003058

8 rows × 50 columns

In [136]:
# Fitting a K-means model on the dataframe, searching for 3 clusters

n_clusters=3
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(vectors_df)
kmeans
Out[136]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=0, tol=0.0001, verbose=0)
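Although it is not done in this notebook, the fitted model could in principle assign a new speech to one of the three clusters, as long as the speech is first converted into the same 50-feature representation. A minimal sketch, assuming a hypothetical new_speech_text string:

# Illustrative sketch only: new_speech_text is a hypothetical variable
new_tf = token_frequency(preprocess(new_speech_text), tf={}, relative=True)
new_row = pd.DataFrame([{k: new_tf.get(k, 0) for k in features}], columns=features)
kmeans.predict(new_row)  # -> array with the assigned cluster index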
In [137]:
# Printing out the cluster labels generated by the K-means model
# The model accurately grouped each speaker's speeches into their own cluster

kmeans.labels_
Out[137]:
array([2, 2, 2, 0, 0, 1, 1, 1], dtype=int32)
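The cluster indices themselves are arbitrary, so one way to confirm that each cluster lines up with one speaker is to cross-tabulate the assignments against the known labels (an illustrative check, not in the original notebook):

# Cross-tabulating K-means assignments against the known speakers (illustrative)
pd.crosstab(pd.Series(labels, name='speaker'),
            pd.Series(kmeans.labels_, name='cluster'))
# Expected: all of each speaker's speeches fall in a single, distinct cluster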
In [138]:
# Using Principal Component Analysis to reduce the 50-dimensional
#   feature matrix to 2 components for the purpose of visualizing the results

pca = PCA(n_components=2)
transformed = pca.fit_transform(vectors_df)
In [139]:
pca.explained_variance_ratio_
Out[139]:
array([0.40636527, 0.24081337])
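Together the two components capture roughly 65% of the variance in the 50-dimensional word-frequency matrix, so the 2-D plot below is a reasonable, though lossy, summary. An illustrative way to see the cumulative figure:

# Cumulative variance explained by the two components (illustrative)
pca.explained_variance_ratio_.sum()  # ~0.65 given the values above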
In [140]:
x = transformed[:,0]
y = transformed[:,1]
In [141]:
col_dict = {0:'red', 1:'blue',2:'green'}
cols = [col_dict[l] for l in kmeans.labels_]
plt.figure(figsize=(18,13))
plt.scatter(x,y, c=cols)
for i, l in enumerate(labels):
    plt.text(x[i],y[i], l, fontsize=20)
plt.xlabel('Component 1',fontsize=20)
plt.ylabel('Component 2', fontsize=20)
plt.title("2-D Visualization of Trump, Biden and Bernie Speeches", fontsize=35)
Out[141]:
Text(0.5, 1.0, '2-D Visualization of Trump, Biden and Bernie Speeches')
In [146]:
# Joining the preprocessed tokens back into a single string for the word cloud
bernie_wordcloud = ' '.join(preprocess(bernie_1))
In [147]:
# Using the imported STOPWORDS set (the lowercase `stopwords` variable
#   only exists inside preprocess, so referencing it here would fail)
berniewordcloud = WordCloud(width = 800, height = 800, 
                background_color ='white', 
                stopwords = STOPWORDS, 
                min_font_size = 10).generate(bernie_wordcloud) 
In [148]:
# plot the WordCloud image                        
plt.figure(figsize = (8, 8), facecolor = None) 
plt.title("Bernie Wordcloud", fontsize=35)
plt.imshow(berniewordcloud) 
plt.axis("off") 
plt.tight_layout(pad = 0) 
plt.show() 
In [149]:
biden_wordcloud = ' '.join(preprocess(biden_1))
In [150]:
bidenwordcloud = WordCloud(width = 800, height = 800, 
                background_color ='white', 
                stopwords = STOPWORDS, 
                min_font_size = 10).generate(biden_wordcloud)
In [151]:
plt.figure(figsize = (8, 8), facecolor = None) 
plt.title("Biden Wordcloud", fontsize=35)
plt.imshow(bidenwordcloud) 
plt.axis("off") 
plt.tight_layout(pad = 0) 
plt.show() 
In [152]:
trump_wordcloud = ' '.join(preprocess(trump1))
In [153]:
trumpwordcloud = WordCloud(width = 800, height = 800, 
                background_color ='white', 
                stopwords = STOPWORDS, 
                min_font_size = 10).generate(trump_wordcloud)
In [154]:
plt.figure(figsize = (8, 8), facecolor = None) 
plt.title("Trump Wordcloud", fontsize=35)
plt.imshow(trumpwordcloud) 
plt.axis("off") 
plt.tight_layout(pad = 0) 
plt.show()