Tutorial Topic Modeling / Word Cloud Data Twitter
Oleh : Gilang Ananda Akbar 1461800191
https://informatika.untag-sby.ac.id/
Sebelum membuat WordCloud, kita harus melakukan scraping data Twitter terlebih dahulu. Berikut tutorialnya:
# Install Library
!pip install tweepy
# Import Library
import tweepy
import pandas as pd
import numpy as np
import datetime
# Fill the API Key
API_key = ‘K2iPMwsaSP76AjT55A05vpyz0’
API_secret = ‘GwdI6mqeTChByqs8dP1Knrep0GdXnrguo7JAlHphPB7GzBtarn’
access_token = ‘1214127349316218880 4uz7swBjNlNpv48zRSWMA8R7AqRGZ8’
access_token_secret = ‘XzpxD71qEM81Q9ezgRu346u707x4mW4Hw5zIe1LNItWKh’
# Authenticate with the Twitter API (OAuth 1.0a user context).
auth = tweepy.OAuthHandler(API_key, API_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# Search tweets by keyword ('jokowi', Indonesian-language tweets).
# BUG FIX: api.search() returns at most ~100 tweets per call, so
# count=5000 was silently ignored. Paginate with tweepy.Cursor to
# actually collect up to 5000 results.
tweets = [status for status in
          tweepy.Cursor(api.search, q='jokowi', lang='id').items(5000)]

# Show the collected tweets
tweets
# Collect the interesting attributes of each tweet into a DataFrame
# (one row per tweet).
data = pd.DataFrame()
data['Created'] = [tweet.created_at for tweet in tweets]          # timestamp
data['Screen Name'] = [tweet.user.screen_name for tweet in tweets]
data['Tweet'] = [tweet.text for tweet in tweets]                  # raw text
data['Location'] = [tweet.user.location for tweet in tweets]
data['Number of Retweet'] = [tweet.retweet_count for tweet in tweets]
data['Number of Like'] = [tweet.favorite_count for tweet in tweets]
display(data.head(10))

# Save as .CSV — index=False avoids writing a spurious "Unnamed: 0"
# column that read_csv would otherwise pick up later.
data.to_csv('scrapping-jokowi.csv', index=False)
Setelah kita melakukan scraping data dan mengekspornya ke file CSV, selanjutnya kita buat WordCloud.
#Import Library
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
%matplotlib inline
#Membaca file yang berada di Google Drive
from google.colab import drive
drive.mount(‘/content/drive’)
#Membaca file csv di Drive
dataset = pd.read_csv(“/content/drive/MyDrive/scrapping-jokowi.csv”)
dataset.head()
label = dataset[‘Tweet’]
label[:10]
# Text cleaning: strip special characters, stray single letters and extra
# whitespace, drop a leading "b" (bytes-literal artefact), then lowercase.
processed_features = []
for sentence in range(0, len(label)):
    text = str(label[sentence])
    # Remove all the special characters (anything that is not a word char)
    text = re.sub(r'\W', ' ', text)
    # Remove all single characters surrounded by whitespace
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # Remove a single character at the start of the string
    text = re.sub(r'^[a-zA-Z]\s+', '', text)
    # Collapse runs of whitespace into a single space
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    # Remove a prefixed 'b' left over from str(bytes) conversions
    text = re.sub(r'^b\s+', '', text)
    # Convert to lowercase
    text = text.lower()
    processed_features.append(text)

# BUG FIX: the original built processed_features but never used it, so every
# later step silently worked on the RAW tweets. Write the cleaned text back
# into the Series so the stopword/punctuation steps operate on clean data.
label = pd.Series(processed_features, index=label.index)
label[:10]
# NLTK resources
import nltk
nltk.download('punkt')       # sentence tokenizer models
nltk.download('stopwords')   # stopword lists (incl. Indonesian)

# Remove Indonesian stopwords from every tweet.
from nltk.corpus import stopwords
# Use a set: O(1) membership test per word instead of O(n) list scan.
stop = set(nltk.corpus.stopwords.words('indonesian'))
label = label.apply(lambda x: " ".join(w for w in x.split() if w not in stop))
label.head()

# Remove punctuation characters.
# Raw string + explicit regex=True: pandas >= 1.4 no longer treats the
# pattern as a regex by default and warns on the non-raw '\w' escape.
label = label.str.replace(r'[^\w\s]', '', regex=True)
label.head()
# Word Cloud
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# WordCloud.generate() expects one big string, so join all cleaned tweets.
word_string = " ".join(label.str.lower())

wordcloud = WordCloud(stopwords=STOPWORDS,
                      background_color='white',
                      ).generate(word_string)

plt.figure(figsize=(15, 10))
# bilinear interpolation makes the rendered cloud look smoother
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()