# Imports
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import docx2txt
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# Plot / notebook / tokenizer setup.
# `nltk` itself must be imported for nltk.download below; the import block
# above only brings in nltk.tokenize.word_tokenize.
import nltk

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and
# deprecated the old names, so prefer the new name whenever it is available.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# `%matplotlib inline` is IPython-only syntax and a SyntaxError in plain Python.
# This call form is equivalent inside a notebook and is skipped when the file
# is run as an ordinary script (where get_ipython is not defined).
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Download the 'punkt' tokenizer data for nltk.
nltk.download('punkt')
# Loading in text documents
def _read_text(path):
    """Return the full contents of the text file at `path`, closing the handle afterwards."""
    with open(path, 'r') as handle:
        return handle.read()


# Trump speeches are plain-text files
trump1 = _read_text('/Users/Dhutchings/Downloads/data-society-major-speeches-by-donald-trump/speech_41.txt')
trump2 = _read_text('/Users/Dhutchings/Downloads/data-society-major-speeches-by-donald-trump/speech_55.txt')
trump3 = _read_text('/Users/Dhutchings/Downloads/data-society-major-speeches-by-donald-trump/speech_54.txt')
# Biden and Bernie speeches are Word documents, converted to text by docx2txt
biden_1 = docx2txt.process("/Users/Dhutchings/Documents/biden_speeches/biden_speech1.docx")
biden_2 = docx2txt.process("/Users/Dhutchings/Documents/biden_speeches/biden_speech2.docx")
bernie_1 = docx2txt.process('/Users/Dhutchings/Documents/bernie_speeches/bernie_speech1.docx')
# NOTE(review): 'berni_speech2.docx' breaks the naming pattern of its siblings;
# confirm it matches the file name on disk.
bernie_2 = docx2txt.process('/Users/Dhutchings/Documents/bernie_speeches/berni_speech2.docx')
bernie_3 = docx2txt.process('/Users/Dhutchings/Documents/bernie_speeches/bernie_speech3.docx')
# Defining the file list that will be iterated through later on.
# The order here must stay in step with any per-speech label list.
file_list = [trump1, trump2, trump3, biden_1, biden_2, bernie_1, bernie_2, bernie_3]
def preprocess(s, lowercase=True, strip_punctuation=True):
    """
    Tokenize a speech and clean the tokens for frequency analysis.

    Input:
        string s (anything that is not a str is assumed to be an
            already-tokenized list and skips tokenization)
        boolean lowercase: lowercase every token
        boolean strip_punctuation: trim punctuation characters from both
            ends of every token
    Return:
        list of strings, with stopwords and a few leftover junk tokens
        ('', "'", '1') removed
    """
    stopwords = set(STOPWORDS)
    # Characters trimmed from token edges. The backslash is escaped explicitly:
    # the original '\[' was an invalid escape sequence that only produced a
    # backslash by accident and triggers a warning on recent Python versions.
    punctuation = '.,?<>:;"\'!%]\\[--`/'
    # Tokens that carry no meaning once punctuation has been stripped
    junk = {'', "'", '1'}
    if isinstance(s, str):
        s = word_tokenize(s)
    if lowercase:
        s = [t.lower() for t in s]
    if strip_punctuation:
        s = [t.strip(punctuation) for t in s]
    # Stopword matching is exact, so it is only fully effective on lowercased tokens
    return [t for t in s if t not in stopwords and t not in junk]
def token_frequency(tokens=None, tf=None, relative=False):
    """
    Count how often each token occurs.

    Input:
        tokens = list of strings or None (None counts nothing)
        tf = dict of existing counts to update in place, or None to start
            from an empty dict
        relative = boolean; when True, return every count divided by the
            sum of all counts
    Return:
        dictionary of token frequencies. With relative=False this is the
        (updated) `tf` dict itself; with relative=True it is a new dict and
        the dict passed as `tf` is left holding the raw counts.
    """
    # A fresh dict per call: a `{}` default would be shared between calls
    # and silently accumulate counts across unrelated invocations.
    if tf is None:
        tf = {}
    if tokens is None:
        tokens = ()
    for t in tokens:
        tf[t] = tf.get(t, 0) + 1
    if relative:
        total = sum(tf.values())
        # An empty `tf` yields an empty dict, so `total` is never divided by when zero
        tf = {t: c / total for t, c in tf.items()}
    return tf
# Creating term frequency dictionary of every word in corpus.
# Each speech is tokenized once and the token lists are reused further down.
tokenized = [preprocess(f) for f in file_list]
# Accumulate raw counts over all speeches and normalise a single time at the
# end. Normalising after every speech would add the next speech's raw counts
# on top of fractions that sum to 1, so the last speech would dominate.
counts = {}
for s in tokenized:
    counts = token_frequency(s, tf=counts, relative=False)
total = sum(counts.values())
tf = {t: c / total for t, c in counts.items()}
# Sorting dictionary by most frequent words in corpus.
# NOTE(review): the slice starts at index 1, so the single most frequent token
# is skipped and ranks 2-51 are kept; confirm that this is intentional.
top_50 = sorted(tf.items(), key=lambda x: x[1], reverse=True)[1:51]
# Creating list of the selected most frequent words in corpus
features = [k for k, v in top_50]
# Creating a list of term frequency dictionaries that reflect
# the relative frequency of each word within one specific speech
vectors = [token_frequency(s, tf={}, relative=True) for s in tokenized]
# Filtering for words that are in the feature list (set gives O(1) membership tests)
feature_set = set(features)
vectors = [{key: v[key] for key in v if key in feature_set} for v in vectors]
# Defining row labels for the dataframe; order matches file_list
labels = ['trump', 'trump', 'trump', 'biden', 'biden', 'bernie', 'bernie', 'bernie']
# One row per speech, one column per feature word; words absent from a speech become 0
vectors_df = pd.DataFrame(vectors, index=labels, columns=features).fillna(0)
vectors_df
# Fit a K-Means clustering model (not KNN) on the dataframe, asking for 3 clusters.
# random_state is fixed so the cluster assignment is reproducible.
n_clusters = 3
kmeans = KMeans(random_state=0, n_clusters=n_clusters)
kmeans = kmeans.fit(vectors_df)
kmeans
# Cluster index assigned to each row of vectors_df, in row order.
# The original notes report that every speech was clustered accurately;
# re-check this whenever the features or the input speeches change.
kmeans.labels_
# Project the feature table onto its first 2 principal components
# so the speeches can be drawn on a flat scatter plot
pca = PCA(n_components=2)
transformed = pca.fit_transform(vectors_df)
# Share of the variance captured by each of the two components
pca.explained_variance_ratio_
x, y = transformed[:, 0], transformed[:, 1]
# One colour per K-Means cluster index
col_dict = dict(enumerate(('red', 'blue', 'green')))
cols = [col_dict[cluster] for cluster in kmeans.labels_]
plt.figure(figsize=(18, 13))
plt.scatter(x, y, c=cols)
# Write the speaker's name next to each point
for px, py, speaker in zip(x, y, labels):
    plt.text(px, py, speaker, fontsize=20)
plt.xlabel('Component 1', fontsize=20)
plt.ylabel('Component 2', fontsize=20)
plt.title("2-D Visualization of Trump, Biden and Bernie Speeches", fontsize=35)
def _render_wordcloud(text, title):
    """
    Build a word cloud from `text`, show it in its own figure and return it.

    Input:
        text = string of space-separated words
        title = string drawn above the image
    Return:
        the generated WordCloud object
    """
    # STOPWORDS is the set imported from wordcloud. The module-level name
    # `stopwords` used previously is never defined at this scope (it only
    # exists as a local inside preprocess), so it raised NameError here.
    cloud = WordCloud(width=800, height=800,
                      background_color='white',
                      stopwords=STOPWORDS,
                      min_font_size=10).generate(text)
    # plot the WordCloud image
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.title(title, fontsize=35)
    plt.imshow(cloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
    return cloud


# One word cloud per speaker, each built from that speaker's first speech
bernie_wordcloud = ' '.join(preprocess(bernie_1))
berniewordcloud = _render_wordcloud(bernie_wordcloud, "Bernie Wordcloud")
biden_wordcloud = ' '.join(preprocess(biden_1))
bidenwordcloud = _render_wordcloud(biden_wordcloud, "Biden Wordcloud")
trump_wordcloud = ' '.join(preprocess(trump1))
trumpwordcloud = _render_wordcloud(trump_wordcloud, "Trump Wordcloud")