# Import packages
import os
import sys
import fbprophet
import datetime
import holidays
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.dates as mdates
import matplotlib.pyplot as pyplot
from datetime import date
from statistics import mode
from fbprophet import Prophet
from fbprophet.diagnostics import cross_validation
from fbprophet.diagnostics import performance_metrics
from matplotlib import pyplot as plt
from statsmodels.tsa import seasonal
from statsmodels.tsa import stattools
from search_sampler import SearchSampler
from pandas.plotting import autocorrelation_plot
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoLarsCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')
Brand: Netflix
Freebase ID: /m/017rf_
Source: https://www.wikidata.org
In section 1, I use `statsmodels.tsa` to decompose the time series.
# Read in Netflix.googletrends.csv as a DataFrame
Netflix = pd.read_csv('Netflix.googletrends.csv',parse_dates=['period'])
# Convert column of 'period' into DateTime format.
Netflix['period'] = Netflix['period'].dt.date
# Set the column of 'period' as the index of DataFrame.
Netflix.set_index('period',inplace=True)
# Have a glance
Netflix.head(5)
'''
Set up date locators and a date formatter from the matplotlib.dates library
(YearLocator(), MonthLocator(), DateFormatter()) to control the time axis of the plots.
'''
years = mdates.YearLocator() # Locate years by matplotlib.dates
months = mdates.MonthLocator() # Locate months by matplotlib.dates
yearmonth = mdates.DateFormatter('%m,%y') # Format dates as mm,yy
# Plot of the general trend in search interest
fig, ax = pyplot.subplots(figsize=(40,8))
ax.xaxis.set_major_locator(months)
ax.xaxis.set_major_formatter(yearmonth)
ax.title.set_text('General Trends')
ax.plot(Netflix.index,Netflix['value'])
plt.show()
'''
Decomposing the time series by using 'seasonal.seasonal_decompose' from 'statsmodels.tsa'
library.
'''
decompose_model = seasonal.seasonal_decompose(Netflix['value'].tolist(), freq=365, model='additive')
dir(decompose_model)
'''
A standard time series equation can be expressed as $x_t = f_t + s_t + c_t + e_t$, which
is a sum of the trend, seasonal, cyclical, and irregular components in that order.
Plot of 'API Value', 'Trend', 'Seasonality', 'Residual' versus time.
'''
fig, (ax1, ax2, ax3, ax4) = pyplot.subplots(4, figsize=(15,15))
ax1.plot(Netflix.index, Netflix['value'])
ax2.plot(Netflix.index, decompose_model.trend)
ax3.plot(Netflix.index, decompose_model.seasonal)
ax4.plot(Netflix.index, decompose_model.resid)
ax1.title.set_text('API Value')
ax2.title.set_text('Trend')
ax3.title.set_text('Seasonality')
ax4.title.set_text('Residual Plot')
plt.show()
What is the overall trend? Is the search interest for your brand steadily growing?
According to the API Value and Trend plots above, the overall trend is downward from the beginning of 2014 through the end of 2017. Netflix search interest peaks in mid-2014 and mid-2015 and reaches its trough at the beginning of 2017. In general, we observe a downward tendency in the trend.
For a closer look at seasonality, we can narrow the window to a single year so that the long-run trend does not dominate the picture.
What does the seasonality look like for this brand?
The seasonality plot shows that Netflix reaches its maximum search interest at the beginning and end of each year. There is also a smaller peak around the summer. The seasonality is driven by consumers: search interest in Netflix depends on how much spare time consumers have.
# Add a new column of 'seasonal' component from decompose_model into the dataframe.
Netflix['seasonal'] = decompose_model.seasonal
# Have a glance
Netflix.head(5)
# Start analyzing the seasonality trend in relatively short run, one-year range.
startdate = pd.to_datetime("2015-01-01").date()
enddate = pd.to_datetime("2015-12-31").date()
oneyear = Netflix.loc[startdate:enddate]
# Plot of Seasonality in 2015
fig, ax = pyplot.subplots(figsize=(15,8))
pyplot.plot(oneyear.index, oneyear.seasonal)
ax.title.set_text('Seasonality in One-year Range')
plt.show()
The plot above shows the seasonality over a one-year range. With this clearer view, we can see that Google search interest peaks at the beginning and end of the year. Search interest declines from January to May, warms up a little during the summer, is unstable from September to November, and starts climbing again in December.
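As a quick numeric cross-check on that reading, the sketch below ranks the months of the 2015 window by their average seasonal component; the monthly grouping is my own addition, not part of the original analysis.
# Sketch: rank months in 2015 by their average seasonal component (illustrative cross-check)
monthly_seasonal = oneyear['seasonal'].groupby(pd.to_datetime(oneyear.index).month).mean()
print(monthly_seasonal.sort_values(ascending=False))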
In section 2, I build forecasting models with `fbprophet`: first a baseline model, then a model that includes holiday effects.
Netflix = pd.read_csv('Netflix.googletrends.csv',parse_dates=['period'])
prophetdataframe = pd.DataFrame() # Create a new DataFrame in the format Prophet expects
prophetdataframe['ds'] = Netflix['period'] # Datestamp
prophetdataframe['y'] = Netflix['value'] # Value
# Have a glance
prophetdataframe.head(5)
# Fit the model by fbprophet.
m = Prophet()
m.fit(prophetdataframe)
# Create a new dataframe for next one year with dates
future = m.make_future_dataframe(periods = 365)
# Have a glance
future.tail()
'''
New DataFrame generated by the fbprophet prediction, including columns for trend, yhat,
additive terms, weekly, yearly, multiplicative terms (empty), and their uncertainty ranges.
'''
forecast = m.predict(future)
forecast.tail()
# Model Diagnostics
erroranalytics = m.predict(prophetdataframe)
erroranalytics['value'] = prophetdataframe['y'] # attach the actual values
erroranalytics['residuals'] = erroranalytics['value'] - erroranalytics['yhat'] # actual minus predicted
erroranalytics['absoluteresiduals'] = abs(erroranalytics['residuals']) # absolute error
# Have a glance
erroranalytics.head()
# Visualization
fig1 = m.plot(forecast)
# component visualizations
fig2 = m.plot_components(forecast)
# sum of absolute residuals
totalerror = erroranalytics['absoluteresiduals'].sum()
print(totalerror)
# MAE
print(totalerror/len(erroranalytics))
# Meaning: on average, the model is off by about 1,300 units of Google search interest per day.
erroranalytics['value'].describe()
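To put that error in context, a rough sketch: express the MAE as a share of a typical search-interest value. Using the median from `describe()` above as the reference point is my own choice, not part of the original analysis.
# Sketch: MAE as a percentage of the median observed value (illustrative only)
mae = totalerror / len(erroranalytics)
print('MAE as a share of the median value: %.1f%%' % (100 * mae / erroranalytics['value'].median()))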
goodholidays = [] # new list for holiday names
for date, name in sorted(holidays.US(years=2014).items()): # take 2014 as an example
    goodholidays.append(name)
    print(date, name)
# new DataFrame
sales = pd.DataFrame(columns=['holiday','ds','lower_window','upper_window'])
for year in range(2014,2021): # major holidays from 2014 to 2020
    for date, name in sorted(holidays.US(years=year).items()):
        lower_window = 0 # reset the 'lower_window'
        upper_window = 0 # reset the 'upper_window'
        if name in goodholidays:
            dayofweek = date.weekday()
            # add additional dates to current holidays
            if name == "New Year's Day":
                lower_window = -1
            if name == "Thanksgiving":
                upper_window = 3
            if name == "Christmas Day":
                lower_window = -5
                upper_window = 6
            if name == "Labor Day":
                lower_window = -3
            if name == "Independence Day":
                upper_window = 3
            sales.loc[len(sales)] = [name, date, lower_window, upper_window]
'''
Include the Super Bowl as an additional custom holiday, since it is a major TV event that
can shift viewing and search behavior.
'''
superbowls = pd.DataFrame({
'holiday': 'Superbowl',
'ds': pd.to_datetime(['2014-02-02', '2015-02-01', '2016-02-07',
'2017-02-05', '2018-02-04', '2019-02-03',
'2020-02-02']),
'lower_window': 0,
'upper_window': 1,
})
superbowls['ds'] = superbowls['ds'].dt.date
sales = pd.concat([sales, superbowls],ignore_index=True)
sales = sales.sort_values(['ds'], ascending=[True]).reset_index(drop=True)
# Have a glance
sales.head()
adv_prophetdataframe = pd.DataFrame()
adv_prophetdataframe['ds'] = Netflix['period']
adv_prophetdataframe['y'] = Netflix['value']
adv_prophetdataframe.head()
adv_m = Prophet(holidays=sales)
adv_m.add_country_holidays(country_name='US')
adv_m.fit(adv_prophetdataframe)
# list the holiday names the model was trained with
print(adv_m.train_holiday_names)
'''
Create new DataFrame for prediction on next one year
'''
adv_future = adv_m.make_future_dataframe(periods = 365)
adv_forecast = adv_m.predict(adv_future)
adv_erroranalytics = adv_m.predict(adv_prophetdataframe)
adv_erroranalytics['value'] = adv_prophetdataframe['y']
adv_erroranalytics['residuals'] = adv_erroranalytics['value'] - adv_erroranalytics['yhat']
adv_erroranalytics['absoluteresiduals'] = abs(adv_erroranalytics['residuals'])
adv_erroranalytics.head()
adv_totalerror = adv_erroranalytics['absoluteresiduals'].sum()
print(adv_totalerror)
print(adv_totalerror/len(adv_erroranalytics))
How accurate does the model appear to be?
The accuracy of the model is represented here by the average absolute prediction error (MAE). A lower average error indicates higher accuracy. In this case, the forecast that includes holiday effects has a lower average prediction error (1127.7 versus 1297.8), so it is the more accurate of the two.
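The comparison above is an in-sample error. As a sketch, the `cross_validation` and `performance_metrics` helpers imported at the top (but not otherwise used here) could give an out-of-sample comparison; the `initial`, `period`, and `horizon` values below are illustrative assumptions.
# Sketch: out-of-sample comparison with Prophet's built-in diagnostics (window choices are assumptions)
cv_basic = cross_validation(m, initial='730 days', period='90 days', horizon='180 days')
cv_holiday = cross_validation(adv_m, initial='730 days', period='90 days', horizon='180 days')
print('baseline MAE:', performance_metrics(cv_basic)['mae'].mean())
print('holiday model MAE:', performance_metrics(cv_holiday)['mae'].mean())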
fig, ax = pyplot.subplots(figsize=(15,8))
ax.plot(adv_erroranalytics['ds'].dt.to_pydatetime(), adv_erroranalytics['absoluteresiduals'], 'k.', color='#0072B2')
ax.title.set_text('Residual Plot')
plt.show()
Are the residuals consistent across time? If not, why do you think it varies?
Based on the residual plot above, the residuals are not consistent across time. They tend to be larger at the beginning and end of each year and smaller elsewhere. I think the variation comes from holiday effects and outliers: even though holiday effects are included in the forecast, the swings in search interest around those periods are still too large to be fully captured.
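A small sketch to support that reading: list the dates with the largest absolute residuals and check whether they cluster around the turn of the year. The choice of the top ten is arbitrary.
# Sketch: the ten dates where the holiday-aware model misses by the most
worst = adv_erroranalytics.nlargest(10, 'absoluteresiduals')[['ds', 'value', 'yhat', 'residuals']]
print(worst)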
In section 3, I use the Prophet component plots to identify the projected high- and low-interest times.
# create forecast plot
fig1 = m.plot(forecast)
# create component plot
fig2 = adv_m.plot_components(adv_forecast)
What are the projected high interest times?
Based on the weekly component plot, Saturday and Sunday are the times when Google search interest in Netflix is high. The yearly plot indicates that January is also a high-interest time. These projected high-interest times suggest consumers have more free time to watch Netflix during weekends and around the New Year. Since some people have a summer break, summer also appears as a relatively active search period.
What are the projected low interest times?
From the same plots, search interest drops sharply on Monday and remains low from Tuesday to Thursday, then rises again on Friday and stays high over the weekend. On the annual scale, May tends to have the lowest search interest: interest decreases from January to February and keeps fluctuating from September to November. These patterns suggest consumers are busy during the weekdays and have less time to watch Netflix, and that people are also busy in the run-up to major holidays.
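The component plots can also be read numerically. As a sketch (my own addition), average the weekly and yearly components of the forecast DataFrame by day of week and by month:
# Sketch: numeric view of the weekly and yearly components from the Prophet forecast
adv_forecast['dayofweek'] = adv_forecast['ds'].dt.day_name()
adv_forecast['month'] = adv_forecast['ds'].dt.month
print(adv_forecast.groupby('dayofweek')['weekly'].mean().sort_values(ascending=False))
print(adv_forecast.groupby('month')['yearly'].mean().sort_values(ascending=False))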
In section 4, I combine Netflix ad-spend data with the Google Trends data and test for correlation and Granger causality.
# load adspend data
adspend = pd.read_csv('netflix.adspend.csv', encoding="ISO-8859-1")
# perform data cleaning
adspend['TIME PERIOD'] = adspend['TIME PERIOD'].str.replace("WEEK OF ","",regex=True)
adspend['TIME PERIOD'] = adspend['TIME PERIOD'].str.replace(" \(B\)", "",regex=True)
adspend.columns = adspend.columns.str.replace(" DOLS \(000\)", "",regex=True)
# Ignore the last 5 rows to avoid NA.
adspend = adspend[:-5]
adspend.tail()
# display all the columns names
list(adspend)
# convert time period to datetime format.
adspend['FIXED TIME'] = pd.to_datetime(adspend['TIME PERIOD'])
# aggregate total ad spend by time
adspendnoproducts = adspend.pivot_table(index = 'FIXED TIME', aggfunc = np.sum)
adspendnoproducts.head()
list(adspendnoproducts)
adspendnoproducts.index
# plot total ads spend
fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, figsize =(15,15), sharex = 'all')
ax1.plot(adspendnoproducts.index, adspendnoproducts['CABLE TV'])
ax2.plot(adspendnoproducts.index, adspendnoproducts['NETWORK TV'])
ax3.plot(adspendnoproducts.index, adspendnoproducts['SPOT TV'])
ax4.plot(adspendnoproducts.index, adspendnoproducts['TOTAL'])
ax1.title.set_text('CABLE TV')
ax2.title.set_text('NETWORK TV')
ax3.title.set_text('SPOT TV')
ax4.title.set_text('TOTAL')
plt.show()
# set period as index
Netflix.set_index('period', inplace=True)
# resample the data and calculate the average
Netflixweekly = Netflix.resample('W-MON', closed='left', label='left').mean()
Netflixweekly.head()
# join weekly Netflix ad spends with aggregated Google trends data.
Netflixweeklydata = adspendnoproducts.join(Netflixweekly)
Netflixweeklydata.head()
# make a visual comparison of the weekly average Netflix ad spend data.
dimensions = (10, 10)
fig, (ax1, ax2) = plt.subplots(2, figsize = dimensions, sharex = 'all')
ax1.plot(Netflixweeklydata.index, Netflixweeklydata['TOTAL'])
ax2.plot(Netflixweeklydata.index, Netflixweeklydata['NETWORK TV'])
ax1.title.set_text('Total')
ax2.title.set_text('NETWORK TV')
plt.show()
Run a correlation test between total Netflix ad spend and the weekly average Google Trends values. The result shows only a very weak correlation. To get a more meaningful correlation, we first make the data stationary.
# perform correlation test
Netflixweeklydata['TOTAL'].corr(Netflixweeklydata['value'])
# According to the result, these two factors are correlated at a very small level.
# We need to convert the data to stationary format for time series analysis.
Calculate first (one-week) differences of `TOTAL` and store the values in a column called `TOTALdiff`. Also calculate first differences of `value` and store them in a column called `valuediff`.
# calculate first (weekly) differences for Google search interest and ad spend
Netflixweeklydata['TOTALdiff'] = Netflixweeklydata['TOTAL'].diff(1)
Netflixweeklydata['valuediff'] = Netflixweeklydata['value'].diff(1)
# calculate the correlation
Netflixweeklydata['TOTALdiff'].corr(Netflixweeklydata['valuediff'])
Netflixweeklydata.to_csv('adspend.googletrends.weekly.csv')
# read in data
adspend_trends = pd.read_csv('adspend.googletrends.weekly.csv', parse_dates = ['FIXED TIME'])
# set fixed time as index
adspend_trends.set_index('FIXED TIME', inplace = True)
list(adspend_trends)
# remove the helper columns created earlier and inspect the column names again
adspend_trends = adspend_trends.drop(columns = ['onediffvalue', 'Unnamed: 0', 'TOTALdiff', 'valuediff'], errors = 'ignore')
list(adspend_trends)
# use ADF to check the significance of stationarity
for acolumn in list(adspend_trends):
    adf_result = stattools.adfuller(adspend_trends[acolumn])
    print(adf_result[1])
adspend_trends_diff = pd.DataFrame()
# calculate first (weekly) differences to see if we can get a stationary dataset
for acolumn in list(adspend_trends):
    columnnames = "%s_diff" % (acolumn)
    adspend_trends_diff[columnnames] = adspend_trends[acolumn].diff(1)
adspend_trends_diff.head()
for acolumn in list(adspend_trends_diff):
    adf_result = stattools.adfuller(adspend_trends_diff[acolumn].iloc[1:])
    print(adf_result[1])
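For readability, a small sketch that labels each differenced series as stationary or not at the 5% level; the 0.05 threshold is the usual convention, used here as an assumption.
# Sketch: flag stationarity of each differenced series at the 5% level
for acolumn in list(adspend_trends_diff):
    pval = stattools.adfuller(adspend_trends_diff[acolumn].iloc[1:])[1]
    print('%-25s p=%.4f -> %s' % (acolumn, pval, 'stationary' if pval < 0.05 else 'not stationary'))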
list(adspend_trends_diff)
# calculate correlation between ads spends variables and Google search interests
correlationlist = []
for acolumn in list(adspend_trends_diff):
    if "value" not in acolumn:
        corr = adspend_trends_diff[acolumn].corr(adspend_trends_diff['value_diff'])
        print(acolumn, corr)
        correlationlist.append(corr)
# calculate the mean of these correlations.
np.mean(correlationlist)
# calculate the correlation again by using the original data.
correlationlist = []
for acolumn in list(adspend_trends):
    if "value" not in acolumn:
        corr = adspend_trends[acolumn].corr(adspend_trends['value'])
        print(acolumn, corr)
        correlationlist.append(corr)
np.mean(correlationlist)
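As a sketch, the raw and differenced correlations can also be collected into one table for a side-by-side view; the table construction below is my own addition.
# Sketch: side-by-side table of raw vs. differenced correlations per ad-spend channel
channels = [c for c in list(adspend_trends) if 'value' not in c]
corr_table = pd.DataFrame({
    'raw': [adspend_trends[c].corr(adspend_trends['value']) for c in channels],
    'differenced': [adspend_trends_diff[c + '_diff'].corr(adspend_trends_diff['value_diff']) for c in channels],
}, index=channels)
print(corr_table.sort_values('differenced', ascending=False))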
# generate heatmap for correlation
f, ax = plt.subplots(figsize = (10, 8))
corr = adspend_trends_diff.corr()
sns.heatmap(corr, mask = np.zeros_like(corr, dtype = bool),
            cmap = sns.diverging_palette(220, 10, as_cmap = True),
            square = True, ax = ax)
plt.show()
# perform granger causality test
# set the measurement period
numofweeks = 12
significantlags = []
for acolumn in list(adspend_trends_diff):
    if 'value' not in acolumn:
        testframe = adspend_trends_diff[['value_diff', acolumn]]
        testframe = testframe.iloc[1:]
        results = stattools.grangercausalitytests(testframe, numofweeks, verbose = False)
        for week in range(1, numofweeks + 1):
            # set level of significance
            if results[week][0]['params_ftest'][1] < 0.05:
                print('%s is significant at %s weeks' % (acolumn, week))
                significantlags.append(week)
Do any advertising expenditures drive Google search interest?
Yes, I think `CABLE TV` and `NEWSPAPER` spend can drive Google search interest. Moreover, `NAT SPOT RADIO`, `NETWORK RADIO`, and `SYNDICATION` also show longer-term effects on search interest. According to the correlation test between differenced ad spend and differenced Google search interest, `CABLE TV` and `NEWSPAPER` have the highest positive correlations with search interest, which suggests that increasing spend in these two categories might also increase search interest; they appear to be effective channels for advertising Netflix. As cable TV's most significant competitor, Netflix can show its advantages through a direct comparison on TV.
The Granger causality test does not measure correlation directly; instead, it indicates which lags are significant, meaning advertising expenditures can still drive search interest N weeks after the ads are seen.
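A quick sketch to summarize the test output: count how often each lag shows up as significant across channels (the use of `collections.Counter` is my own addition).
# Sketch: distribution of significant lags across all ad-spend channels
from collections import Counter
print(Counter(significantlags).most_common())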
fig, ax = pyplot.subplots(figsize=(15,8))
ax.plot(adspend_trends_diff.index, adspend_trends_diff['TOTAL_diff'])
ax.plot(adspend_trends_diff.index, adspend_trends_diff['value_diff'])
ax.title.set_text('Time Series Visualizations of Ad Spend and Google Search Interest')
ax.legend(('Ad Spend','Google Search Interest'))
plt.show()
Create a time series visualization that overlays both ad spend and Google search interest, to see if they visually align.
According to the plot above, the differenced Google search interest does broadly align with the differenced ad spend.
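Beyond eyeballing the overlay, a rolling correlation gives a rough sense of how the alignment changes over time; the 12-week window below is an arbitrary choice for illustration.
# Sketch: 12-week rolling correlation between differenced ad spend and differenced search interest
rolling_corr = adspend_trends_diff['TOTAL_diff'].rolling(12).corr(adspend_trends_diff['value_diff'])
fig, ax = pyplot.subplots(figsize=(15, 4))
ax.plot(rolling_corr.index, rolling_corr)
ax.title.set_text('12-week Rolling Correlation: Ad Spend vs Search Interest')
plt.show()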
# The most common lag in the list is 5 weeks.
mode(significantlags)
modelingdataset = adspend_trends_diff
modelingdataset.tail()
# created a 5-week lagged version of data
# attach 5 more weeks to the end of dataframe
date = pd.to_datetime('2018-10-08')
modelingdataset.loc[date] = np.nan
date = pd.to_datetime('2018-10-15')
modelingdataset.loc[date] = np.nan
date = pd.to_datetime('2018-10-22')
modelingdataset.loc[date] = np.nan
date = pd.to_datetime('2018-10-29')
modelingdataset.loc[date] = np.nan
date = pd.to_datetime('2018-11-05')
modelingdataset.loc[date] = np.nan
modelingdataset.tail()
# shift each ad spend column down by 1 to 5 rows to create lagged versions
for acolumn in list(modelingdataset):
    if 'value' not in acolumn:
        # get lags up to 5 weeks
        for alag in range(1,6):
            columnname = '%s_lag%s' % (acolumn, alag)
            modelingdataset[columnname] = modelingdataset[acolumn].shift(alag)
# sort the dataset columns by name
modelingdataset.sort_index(axis=1, inplace = True)
modelingdataset.head()
modelingdataset.to_csv('google_adspend_differenced_lagged_fordatarobot.csv')
In section 5, I fit a simple feed-forward neural network and a LassoLarsCV regression on the lagged ad-spend features to see how well they predict the differenced Google search interest.
# load final modeling data
alldata = pd.read_csv('google_adspend_differenced_lagged_fordatarobot.csv')
# drop rows contain empty values
alldata = alldata.iloc[6:]
alldata = alldata.iloc[:-5]
alldata['FIXED TIME'] = pd.to_datetime((alldata['FIXED TIME']))
Perform neural network modeling based on the GitHub instructions.
# Gradient descent algorithms perform better if the variables are within the range [-1, 1].
# The `value_diff` variable is min-max scaled to bound the transformed variable within [-1, 1].
scaler = MinMaxScaler(feature_range=(-1, 1))
alldata['scaled_value_diff'] = scaler.fit_transform(np.array(alldata['value_diff']).reshape(-1, 1))
# split data in two parts - train set and validation set
# The neural network is trained on the train set.
# This means computation of the loss function, back propagation and weights updated by a gradient descent algorithm is done on the train set.
split_date = datetime.datetime(year=2017, month=11, day=10, hour=0)
df_train = alldata.loc[alldata['FIXED TIME']<split_date]
df_val = alldata.loc[alldata['FIXED TIME']>=split_date]
print('Shape of train:', df_train.shape)
print('Shape of test:', df_val.shape)
# reset index
df_train.reset_index(drop=True, inplace=True)
df_val.reset_index(drop=True, inplace=True)
# plot train times series of the standardized value_diff
plt.figure(figsize=(15, 6))
g = sns.lineplot(data=df_train['scaled_value_diff'], color='b')
g.set_title('Time series of Google Trends data differences in train set')
g.set_xlabel('Index')
g.set_ylabel('Google Trends data differences')
# plot validation times series of the standardized value_diff
plt.figure(figsize=(15, 6))
g = sns.lineplot(data=df_val['scaled_value_diff'], color='r')
g.set_title('Time series of Google Trends data differences in validation set')
g.set_xlabel('Index')
g.set_ylabel('Google Trends data differences')
plt.show()
def makeXy(ts, nb_timesteps):
    X = []
    y = []
    for i in range(nb_timesteps, ts.shape[0]):
        X.append(list(ts.loc[i-nb_timesteps:i-1]))
        y.append(ts.loc[i])
    X, y = np.array(X), np.array(y)
    return X, y
# generate regressors (X) and target variable (y) for train and validation sets
X_train, y_train = makeXy(df_train['scaled_value_diff'], 7)
print('Shape of train arrays:', X_train.shape, y_train.shape)
X_val, y_val = makeXy(df_val['scaled_value_diff'], 7)
print('Shape of validation arrays:', X_val.shape, y_val.shape)
# initialize model (Sequential, Dense, and GaussianNoise come from Keras)
from keras.models import Sequential
from keras.layers import Dense, GaussianNoise
model = Sequential()
model.add(Dense(7, activation='relu', kernel_initializer='uniform', input_shape=(7,)))
# add Gaussian noise during training for regularization
model.add(GaussianNoise(0.01))
model.add(Dense(16, kernel_initializer='uniform', activation='relu'))
model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
# compile the model
# note: the target is continuous in [-1, 1], so a linear or tanh output with an MSE loss would
# usually be more appropriate than sigmoid + binary cross-entropy; kept as in the original run
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(x=X_train, y=y_train, batch_size=201, epochs=100, verbose=1, validation_data=(X_val, y_val), shuffle=True)
# predict the results
preds = model.predict(X_val)
# calculate R^2
r2 = r2_score(df_val['scaled_value_diff'].loc[7:], preds)
print('R-squared for the validation set:', round(r2,4))
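The sigmoid output with binary cross-entropy above treats a continuous target like a classification problem, which likely contributes to the poor fit. As a sketch (an assumption about a more suitable setup, not the model used in the results reported at the end), a regression-style configuration with a tanh output and MSE loss would look like this:
# Sketch: a regression-oriented alternative (tanh output, MSE loss) for the same inputs
reg_model = Sequential()
reg_model.add(Dense(16, activation='relu', kernel_initializer='uniform', input_shape=(7,)))
reg_model.add(Dense(8, activation='relu', kernel_initializer='uniform'))
reg_model.add(Dense(1, activation='tanh', kernel_initializer='uniform'))
reg_model.compile(loss='mean_squared_error', optimizer='adam')
reg_model.fit(X_train, y_train, batch_size=16, epochs=100, verbose=0, validation_data=(X_val, y_val))
print('regression-style R^2:', round(r2_score(y_val, reg_model.predict(X_val)), 4))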
alldata.set_index(alldata['FIXED TIME'], inplace=True)
alldata = alldata.drop(columns=['FIXED TIME'])
listofallpredictors = []
for i in list(alldata):
    if "value" not in i:
        listofallpredictors.append(i)
predictors = alldata[listofallpredictors]
target = alldata['value_diff']
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, target, test_size=.3, random_state=123)
model = LassoLarsCV(cv=10, precompute=False)
model = model.fit(pred_train.values, tar_train.values)
predictors_model = pd.DataFrame(listofallpredictors)
predictors_model.columns = ['label']
predictors_model['coeff'] = model.coef_
for index, row in predictors_model.iterrows():
    if row['coeff'] > 0:
        print(row.values)
test_error = mean_squared_error(tar_test, model.predict(pred_test))
print('test data MSE')
print(test_error)
rsquared_test = model.score(pred_test, tar_test)
print('test data r-square')
print(rsquared_test)
Assess the degree to which your advanced approach outperforms the methods taught in class.
Neither approach fits my data well. The LassoLarsCV model has an R^2 of -0.067 and the neural network has an R^2 of -9.784. A negative R^2 means the fitted model performs worse than simply predicting the mean of the target.
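As a final sketch to make the negative R^2 concrete (my own addition): a constant prediction equal to the training-set mean scores close to zero on the same test split, so both models do worse than that trivial baseline.
# Sketch: R^2 of a constant, training-mean prediction on the Lasso test split
baseline = np.full(len(tar_test), tar_train.mean())
print('constant-mean baseline R^2:', round(r2_score(tar_test, baseline), 4))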