Tutorial Topic Modeling / Word Cloud Data Twitter
Oleh : Gilang Ananda Akbar 1461800191
https://informatika.untag-sby.ac.id/
Sebelum membuat WordCloud, kita harus melakukan scraping data Twitter terlebih dahulu. Berikut tutorialnya:
# Install Library
!pip install tweepy
# Import Library
import tweepy
import pandas as pd
import numpy as np
import datetime
# Fill the API Key
API_key = ‘K2iPMwsaSP76AjT55A05vpyz0’
API_secret = ‘GwdI6mqeTChByqs8dP1Knrep0GdXnrguo7JAlHphPB7GzBtarn’
access_token = ‘1214127349316218880 4uz7swBjNlNpv48zRSWMA8R7AqRGZ8’
access_token_secret = ‘XzpxD71qEM81Q9ezgRu346u707x4mW4Hw5zIe1LNItWKh’
# Authenticate with the Twitter API (OAuth 1.0a user context).
auth = tweepy.OAuthHandler(API_key, API_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# Search tweets by keyword ('jokowi', Indonesian-language tweets).
# BUG FIX: api.search() returns at most ~100 tweets per call, so
# count=5000 was silently ignored. Paginate with tweepy.Cursor to
# actually collect up to 5000 results.
tweets = [status for status in
          tweepy.Cursor(api.search, q='jokowi', lang='id').items(5000)]

# Show the collected tweets
tweets
# Collect the interesting attributes of each tweet into a DataFrame
# (one row per tweet).
data = pd.DataFrame()
data['Created'] = [tweet.created_at for tweet in tweets]          # timestamp
data['Screen Name'] = [tweet.user.screen_name for tweet in tweets]
data['Tweet'] = [tweet.text for tweet in tweets]                  # raw text
data['Location'] = [tweet.user.location for tweet in tweets]
data['Number of Retweet'] = [tweet.retweet_count for tweet in tweets]
data['Number of Like'] = [tweet.favorite_count for tweet in tweets]
display(data.head(10))

# Save as .CSV — index=False avoids writing a spurious "Unnamed: 0"
# column that read_csv would otherwise pick up later.
data.to_csv('scrapping-jokowi.csv', index=False)
Setelah kita melakukan scraping data dan mengekspornya ke file CSV, selanjutnya kita buat WordCloud.
#Import Library
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
%matplotlib inline
#Membaca file yang berada di Google Drive
from google.colab import drive
drive.mount(‘/content/drive’)
#Membaca file csv di Drive
dataset = pd.read_csv(“/content/drive/MyDrive/scrapping-jokowi.csv”)
dataset.head()
label = dataset[‘Tweet’]
label[:10]
# Text cleaning: strip special characters, stray single letters and extra
# whitespace, drop a leading "b" (bytes-literal artefact), then lowercase.
processed_features = []
for sentence in range(0, len(label)):
    text = str(label[sentence])
    # Remove all the special characters (anything that is not a word char)
    text = re.sub(r'\W', ' ', text)
    # Remove all single characters surrounded by whitespace
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # Remove a single character at the start of the string
    text = re.sub(r'^[a-zA-Z]\s+', '', text)
    # Collapse runs of whitespace into a single space
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    # Remove a prefixed 'b' left over from str(bytes) conversions
    text = re.sub(r'^b\s+', '', text)
    # Convert to lowercase
    text = text.lower()
    processed_features.append(text)

# BUG FIX: the original built processed_features but never used it, so every
# later step silently worked on the RAW tweets. Write the cleaned text back
# into the Series so the stopword/punctuation steps operate on clean data.
label = pd.Series(processed_features, index=label.index)
label[:10]
# NLTK resources
import nltk
nltk.download('punkt')       # sentence tokenizer models
nltk.download('stopwords')   # stopword lists (incl. Indonesian)

# Remove Indonesian stopwords from every tweet.
from nltk.corpus import stopwords
# Use a set: O(1) membership test per word instead of O(n) list scan.
stop = set(nltk.corpus.stopwords.words('indonesian'))
label = label.apply(lambda x: " ".join(w for w in x.split() if w not in stop))
label.head()

# Remove punctuation characters.
# Raw string + explicit regex=True: pandas >= 1.4 no longer treats the
# pattern as a regex by default and warns on the non-raw '\w' escape.
label = label.str.replace(r'[^\w\s]', '', regex=True)
label.head()
# Word Cloud
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# WordCloud.generate() expects one big string, so join all cleaned tweets.
word_string = " ".join(label.str.lower())

wordcloud = WordCloud(stopwords=STOPWORDS,
                      background_color='white',
                      ).generate(word_string)

plt.figure(figsize=(15, 10))
# bilinear interpolation makes the rendered cloud look smoother
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()