Twitter Virality Prediction Algorithm

# Based on a dataset of tweets, I played around with Python to predict
# whether or not a tweet will go viral, using the tweet's length and word
# count as well as the number of followers, friends, hashtags, and links.

# You can find the original data in the random tweets zip file. If you
# have the necessary packages installed, you can reproduce the results
# below; I reached an accuracy of 92%. Here is how I did it.

import pandas as pd
# Having a look at the dataset structure
all_tweets = pd.read_json("random_tweets.json", lines=True)
print(len(all_tweets))
print(all_tweets.columns)
print(all_tweets.loc[0]['text'])
# Printing the location of the first tweet's user
print(all_tweets.loc[0]["user"]["location"])
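# The `user` field is itself a nested dictionary. Assuming the JSON loaded
# as above, you can list its keys for one tweet to see what else is in there:
print(all_tweets.loc[0]['user'].keys())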
# So how do we define a viral tweet? A good place to start is the number of retweets, found in the `"retweet_count"` column. Let's create a column called `is_viral` that is `1` if the tweet had more than 5000 retweets and `0` otherwise.

import numpy as np
all_tweets['is_viral'] = np.where(all_tweets['retweet_count'] > 5000, 1, 0)
print(all_tweets['is_viral'].value_counts())
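# As a variation (not used below), the threshold doesn't have to be a
# hardcoded 5000: you could derive it from the data, e.g. the median
# retweet count, so the two classes come out roughly balanced. The
# `is_viral_median` column here is just for illustration.
median_retweets = all_tweets['retweet_count'].median()
all_tweets['is_viral_median'] = np.where(all_tweets['retweet_count'] > median_retweets, 1, 0)
print(all_tweets['is_viral_median'].value_counts())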
# Making features: now that we've created a label for every tweet in our dataset, we can begin thinking about which features might determine whether a tweet goes viral. We can create new columns in our dataset to represent these features. For example, the length of a tweet might be a valuable feature.
all_tweets['tweet_length'] = all_tweets.apply(lambda tweet: len(tweet['text']), axis=1)
all_tweets['followers_count'] = all_tweets.apply(lambda tweet: tweet['user']['followers_count'], axis=1)
all_tweets['friends_count'] = all_tweets.apply(lambda tweet: tweet['user']['friends_count'], axis=1)
all_tweets['hashtag_count'] = all_tweets.apply(lambda tweet: tweet['text'].count("#"), axis=1)
all_tweets['link_count'] = all_tweets.apply(lambda tweet: tweet['text'].count("http"), axis=1)
all_tweets['word_count'] = all_tweets.apply(lambda tweet: len(tweet['text'].split(" ")), axis=1)
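# The `apply` calls above work, but pandas' vectorized string methods do
# the same job and are usually faster. A sketch of the three text-based
# features rebuilt that way:
all_tweets['tweet_length'] = all_tweets['text'].str.len()
all_tweets['hashtag_count'] = all_tweets['text'].str.count("#")
all_tweets['link_count'] = all_tweets['text'].str.count("http")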
# Normalizing the data so that features with large ranges, like follower counts, don't dominate the distance calculations
from sklearn.preprocessing import scale
labels = all_tweets["is_viral"]
data = all_tweets[["tweet_length","followers_count","friends_count", "hashtag_count", "link_count", "word_count"]]
scaled_data = scale(data,axis=0)
print(scaled_data[0])
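# A quick sanity check: after scaling, every column should have a mean of
# roughly 0 and a standard deviation of roughly 1.
print(scaled_data.mean(axis=0))
print(scaled_data.std(axis=0))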
# Creating the training and test sets
from sklearn.model_selection import train_test_split
train_data, test_data, train_labels, test_labels = train_test_split(scaled_data, labels, test_size = 0.2, random_state = 1)
# Determining the ideal number of neighbors k
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

scores = []
for k in range(1, 200):
    classifier = KNeighborsClassifier(n_neighbors=k)
    classifier.fit(train_data, train_labels)
    scores.append(classifier.score(test_data, test_labels))

plt.plot(range(1, 200), scores)
plt.show()
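# Instead of reading the best k off the plot by eye, you can also pull it
# straight out of the scores list. A small sketch:
best_k = scores.index(max(scores)) + 1
print(best_k, max(scores))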
# Choosing the optimal number of neighbors from the plot above
classifier = KNeighborsClassifier(n_neighbors=4)
# Fitting the classifier and calculating its accuracy on the test set
classifier.fit(train_data, train_labels)
print(classifier.score(test_data, test_labels))
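# One caveat worth checking: if most tweets in the test set are not viral,
# a model that always predicts "not viral" already scores well. Comparing
# against that majority-class baseline puts the accuracy above in context.
print(test_labels.value_counts(normalize=True))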
