Analysing YouTube Comments — Stuff Made Here

Introduction

Visualizations

Word-count table from Stuff Made Here comments
Wordcloud from Stuff Made Here comments

Interesting Words

Making the visualizations

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import json_lines
import os
from progress.bar import Bar
import json

from collections import Counter

# Count how often each lower-cased, purely alphabetic, non-stopword token
# occurs across every JSON-lines comment file in rawdata/, then write the
# counts to wordcount.json ordered from most to least frequent.
files = os.listdir("rawdata")

# A set gives O(1) membership tests in the per-token check below, and a
# distinct name avoids rebinding (and losing) the imported nltk `stopwords`
# corpus object.
english_stopwords = set(stopwords.words('english'))

bar = Bar('Progress: ', max=len(files))
data = Counter()
for file in files:
    # `with` guarantees each input file is closed even if a record fails to
    # parse; JSON text is UTF-8, so do not rely on the platform default.
    with open(os.path.join('rawdata', file), 'r', encoding='utf-8') as fp:
        for comment in json_lines.reader(fp):
            for token in word_tokenize(comment['text']):
                word = token.lower()
                if word.isalpha() and word not in english_stopwords:
                    data[word] += 1
    # The bar was sized with len(files), so advance it once per file.
    bar.next()
bar.finish()

# most_common() sorts by count, descending, keeping first-seen order for ties;
# a plain dict preserves that order when serialized.
data = dict(data.most_common())
with open("wordcount.json", "w") as out:
    json.dump(data, out)
import os
from wordcloud import WordCloud
import numpy as np
from PIL import Image
import json

# Render the word frequencies stored in wordcount.json as a word cloud shaped
# by mask.png and save the result to cloud.png.

# Copy the pixels into an array inside the `with` so the image file is closed
# as soon as the mask has been read.
with Image.open("mask.png") as mask_image:
    mask = np.array(mask_image)

with open("wordcount.json", "r") as fp:
    data = json.load(fp)

# NOTE(review): per the wordcloud documentation, width/height are ignored when
# a mask is supplied (the mask's shape is used instead); they are kept here
# unchanged so the call matches the original.
wc = WordCloud(
    width=3888,
    height=5180,
    background_color="white",
    max_words=6000,
    mask=mask,
    max_font_size=1000,
    random_state=32,  # fixed seed so the layout is reproducible
)

wc.generate_from_frequencies(data)
wc.to_file("cloud.png")

Student | Developer | Photographer