import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

# Folder that holds the BoardGameGeek CSV files read below.
input_path = 'input/bggdata/'

# Matplotlib defaults shared by every figure: Arial text, solid light-grey
# grid lines, dark-grey labels/ticks/edges, bold titles, white plot area.
plt.rcParams.update({
    "font.family": "Arial",
    'grid.linestyle': 'solid',
    'axes.labelcolor': '0.2',
    'grid.color': '0.9',
    'xtick.color': '0.3',
    'ytick.color': '0.3',
    'axes.edgecolor': '0.3',
    'axes.titleweight': 'bold',
    'axes.facecolor': 'white',
})

# Read the five source tables from input_path, in the same order as before.
games, mechanics, subcategories, designers, themes = (
    pd.read_csv(input_path + csv_name)
    for csv_name in (
        'games.csv',
        'mechanics.csv',
        'subcategories.csv',
        'designers_reduced.csv',
        'themes.csv',
    )
)

# 80/20 split: sample the training rows with a fixed seed, then keep every
# remaining row (by index label) as the test set.
train = games.sample(frac=0.8, random_state=12345)
test = games.drop(index=train.index)

# Report the shape of the full table and of each split, one per line.
print(
    *(f"{label}: {frame.shape}"
      for label, frame in (("Total", games), ("Train", train), ("Test", test))),
    sep="\n",
)

Total: (21925, 48)
Train: (17540, 48)
Test: (4385, 48)

# Density plot comparing the AvgRating and BayesAvgRating columns of the
# training set. The two columns are stacked into long format so seaborn can
# colour the curves by rating type.
plot_data = train.melt(id_vars=['BGGId'],
                       value_vars=['AvgRating', 'BayesAvgRating'],
                       var_name='Rating Type',
                       value_name='Rating')

# Draw on a dedicated figure/axes pair instead of pyplot's implicit "current"
# axes, so this plot never shares a canvas with another one when the file is
# executed top to bottom as a script.
fig, ax = plt.subplots()
sns.kdeplot(data=plot_data, x="Rating", hue="Rating Type", fill=True, ax=ax)

# Grid lines sit behind the filled curves.
ax.grid(True)
ax.set_axisbelow(True)

# title
fig.text(0.5, 1, "BGG Boardgame Ratings", ha='center', weight='bold', size=16)
# subtitle
fig.text(0.5, 0.94, "Distribution of Ratings", ha='center', weight='normal', size=16)
# caption
caption_str = f"Source: BoardGameGeek.com; Jerome Williams\nNote: Plot based on training dataset of {len(train)} games."
fig.text(0.05, -0.1, caption_str, ha='left', size=10)

ax.set_xlabel('Rating')
ax.set_ylabel('Density')

# bbox_inches='tight' keeps the title/caption, which are placed outside the
# axes area, inside the saved page.
fig.savefig("output/03_avg_vs_bgg_rating_distn.pdf", format="pdf", bbox_inches='tight')

# Same density comparison, restricted to training games with at least 1,000
# user ratings. The filter is inclusive (>=) so that it matches the
# "at least 1000 ratings" wording used in the subtitle and caption.
plot_data = (train[train['NumUserRatings'] >= 1000]
             .melt(id_vars=['BGGId'],
                   value_vars=['AvgRating', 'BayesAvgRating'],
                   var_name='RatingType',
                   value_name='Rating')
             )

# Use a fresh figure/axes pair: relying on pyplot's implicit current axes
# would overlay these curves on the previous plot when run as a script.
fig, ax = plt.subplots()
sns.kdeplot(data=plot_data, x="Rating", hue="RatingType", fill=True, ax=ax)

# Grid lines sit behind the filled curves.
ax.grid(True)
ax.set_axisbelow(True)

# title
fig.text(0.5, 1, "BGG Boardgame Ratings", ha='center', weight='bold', size=16)
# subtitle
fig.text(0.5, 0.94, "Distribution of Ratings, Games with at least 1000 ratings", ha='center', weight='normal', size=16)
# caption (len(train) is the size of the whole training set, before filtering)
caption_str = (
    f"Source: BoardGameGeek.com; Jerome Williams\nNote: Plot based on training dataset of {len(train)} games, "
    "filtered to games with at least 1,000 ratings."
)
fig.text(0.05, -0.1, caption_str, ha='left', size=10)

ax.set_xlabel('Rating')
ax.set_ylabel('Density')

# bbox_inches='tight' keeps the title/caption, which are placed outside the
# axes area, inside the saved page.
fig.savefig("output/04_avg_vs_bgg_rating_distn_1000.pdf", format="pdf", bbox_inches='tight')

# Join the mechanics table onto both splits by BGGId (inner join, which is
# pandas' default), then report the resulting shapes.
mechanics = pd.read_csv('input/bggdata/mechanics.csv')
train, test = (
    split.merge(mechanics, on='BGGId', how='inner')
    for split in (train, test)
)
print(f"After joining mechanics, train: {train.shape}")
print(f"After joining mechanics, test: {test.shape}")

After joining mechanics, train: (17540, 205)
After joining mechanics, test: (4385, 205)

# Every mechanics column except the first one is treated as a mechanic.
mechanics_list = list(mechanics.columns[1:])

# Column totals over the training rows, largest first.
mechanics_sum = train[mechanics_list].sum().sort_values(ascending=False)

print('Top 10 mechanics:')
print(mechanics_sum.head(10))

print(f'Number of unique mechanics: {len(mechanics_list)}')

Top 10 mechanics:
Dice Rolling              5207
Hand Management           3596
Set Collection            2352
Variable Player Powers    2172
Hexagon Grid              1975
Simulation                1725
Drafting                  1612
Tile Placement            1458
Modular Board             1355
Grid Movement             1310
dtype: int64
Number of unique mechanics: 157

# Read the subcategory table, name its first column 'BGGId' and prefix every
# other column with 'subcat_'.
subcategories = pd.read_csv('input/bggdata/subcategories.csv')
subcategories_new_columns = [f'subcat_{name}' for name in subcategories.columns[1:]]
subcategories.columns = ['BGGId', *subcategories_new_columns]
print(len(subcategories_new_columns))
print(subcategories.columns)

# Inner-join (the pandas default) the prefixed columns onto both splits.
train = train.merge(subcategories, on='BGGId', how='inner')
test = test.merge(subcategories, on='BGGId', how='inner')

print(train.shape, test.shape)

10
Index(['BGGId', 'subcat_Exploration', 'subcat_Miniatures',
       'subcat_Territory Building', 'subcat_Card Game', 'subcat_Educational',
       'subcat_Puzzle', 'subcat_Collectible Components', 'subcat_Word Game',
       'subcat_Print & Play', 'subcat_Electronic'],
      dtype='object')
(17540, 215) (4385, 215)

# Read the designer table and prefix each designer column with 'designer_'.
# 'BGGId' (the merge key) and 'Low-Exp Designer' keep their names.
designers = pd.read_csv('input/bggdata/designers_reduced.csv')
designers_columns = designers.columns.to_list()

# Pick the columns to rename by name rather than by a hard-coded position
# (previously the first 1592 columns), so the step keeps working if the file
# gains, loses or reorders columns.
non_designer_columns = ('BGGId', 'Low-Exp Designer')
designer_renames = {
    col: 'designer_' + col
    for col in designers_columns
    if col not in non_designer_columns
}
designers_new_columns = list(designer_renames.values())
designers = designers.rename(columns=designer_renames)

# Inner-join (the pandas default) the designer columns onto both splits.
train = train.merge(designers, on='BGGId')
test = test.merge(designers, on='BGGId')

print(train.shape, test.shape)

(17540, 1808) (4385, 1808)

# Column totals over the training rows for every designer column plus the
# 'Low-Exp Designer' column, largest first.
designer_count_columns = [*designers_new_columns, 'Low-Exp Designer']
designers_sum = train[designer_count_columns].sum().sort_values(ascending=False)
print('Top 10 designers:')
print(designers_sum.head(10))
print(f'Number of unique designers: {len(designers_new_columns)}')

Top 10 designers:
Low-Exp Designer            6652
designer_(Uncredited)       1147
designer_Reiner Knizia       260
designer_Joseph Miranda      113
designer_Wolfgang Kramer     103
designer_Richard H. Berg      89
designer_Jim Dunnigan         81
designer_James Ernest         80
designer_Martin Wallace       75
designer_Frank Chadwick       74
dtype: int64
Number of unique designers: 1592

# Read the theme table, name its first column 'BGGId' and prefix every other
# column with 'theme_'.
themes = pd.read_csv('input/bggdata/themes.csv')
themes_new_columns = [f'theme_{name}' for name in themes.columns[1:]]
themes.columns = ['BGGId', *themes_new_columns]

# Inner-join (the pandas default) the theme columns onto both splits.
train, test = (
    split.merge(themes, on='BGGId', how='inner')
    for split in (train, test)
)
print(train.shape, test.shape)

(17540, 2025) (4385, 2025)

# Column totals over the training rows for every theme column, largest first.
# Stored under its own name so the designer totals in `designers_sum` are not
# overwritten by theme data.
themes_sum = train[themes_new_columns].sum(axis=0).sort_values(ascending=False)
print('Top 10 themes:')
print(themes_sum[:10])
print(f'Number of unique themes: {len(themes_new_columns)}')

Top 10 themes:
theme_Fantasy                      2110
theme_Science Fiction              1332
theme_Fighting                     1330
theme_Economic                     1221
theme_Animals                      1061
theme_World War II                  996
theme_Humor                         963
theme_Adventure                     924
theme_Movies / TV / Radio theme     859
theme_Medieval                      824
dtype: int64
Number of unique themes: 217

# Model inputs, in order: player-count and playtime columns, the 'Cat:' flags
# from games.csv, then the mechanics, subcategory, designer and theme columns
# merged in above.
base_vars = ['MinPlayers', 'MaxPlayers', 'MfgPlaytime', 'ComMinPlaytime', 'ComMaxPlaytime']
category_vars = [
    'Cat:Thematic',
    'Cat:Strategy',
    'Cat:War',
    'Cat:Family',
    'Cat:CGS',
    'Cat:Abstract',
    'Cat:Party',
    'Cat:Childrens',
]
feature_vars = [
    *base_vars,
    *category_vars,
    *mechanics_list,
    *subcategories_new_columns,
    *designers_new_columns,
    'Low-Exp Designer',
    *themes_new_columns,
]

# The prediction target is the AvgRating column.
target_var = 'AvgRating'

# Copies, so that adding engineered columns later leaves train/test untouched.
X_train = train[feature_vars].copy()
y_train = train[target_var]
X_test = test[feature_vars].copy()
y_test = test[target_var]
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# print(X_train.head(5))

(17540, 1990) (4385, 1990) (17540,) (4385,)

def _add_player_count_features(frame):
    """Add player-count indicator columns to *frame* in place.

    Reads the 'MinPlayers' and 'MaxPlayers' columns and appends, in this
    order: 'plays1' ... 'plays6' (1 when that player count is supported,
    else 0), 'plays7plus' (1 when MaxPlayers >= 7) and 'num_player_counts'
    (MaxPlayers - MinPlayers + 1).
    """
    min_players = frame['MinPlayers']
    max_players = frame['MaxPlayers']

    # NOTE(review): 'plays1' requires MinPlayers == 1 exactly, whereas the
    # other flags use <=. The original behaviour is kept here; confirm whether
    # rows with MinPlayers == 0 should also count as playable solo.
    frame['plays1'] = ((min_players == 1) & (max_players >= 1)).astype(int)
    for count in range(2, 7):
        frame[f'plays{count}'] = ((min_players <= count) & (max_players >= count)).astype(int)
    frame['plays7plus'] = (max_players >= 7).astype(int)
    frame['num_player_counts'] = max_players - min_players + 1


# One helper for both splits guarantees train and test receive exactly the
# same engineered columns.
for _feature_frame in (X_train, X_test):
    _add_player_count_features(_feature_frame)

print(X_train.shape, X_test.shape)

(17540, 1998) (4385, 1998)

# scaling
from sklearn.preprocessing import RobustScaler

# Column order of the feature matrix; reused later to label importances.
features = X_train.columns

# Both scalers are fitted on the training split only and then applied to the
# test split.
X_scaler = RobustScaler()
Xtr2 = X_scaler.fit_transform(X_train[features])
Xte2 = X_scaler.transform(X_test[features])

# The scaler expects 2-D input, so the target is reshaped to a single column.
y_scaler = RobustScaler()
ytr2 = y_scaler.fit_transform(y_train.to_numpy().reshape(-1, 1))
yte2 = y_scaler.transform(y_test.to_numpy().reshape(-1, 1))

import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# Five shuffled folds with a fixed seed, shared by every tuning trial.
n = 5
cv = KFold(n_splits=n, shuffle=True, random_state=12345)

def objective(trial):
    """Return the mean cross-validated MSE for one Optuna trial.

    Samples ``learning_rate`` and ``n_estimators`` from *trial*, fits an
    XGBoost regressor on each training fold of the module-level ``cv``
    splitter over ``Xtr2``/``ytr2``, and averages the mean squared error
    measured on the held-out folds. Lower is better.
    """
    # The lower bound is strictly positive: a learning rate of exactly 0
    # shrinks every tree's contribution to nothing, so it is excluded from
    # the search space.
    learning_rate = trial.suggest_float("learning_rate", 1e-3, 0.1)
    n_estimators = trial.suggest_int('n_estimators', 500, 2000)

    scores = []

    for train_index, val_index in cv.split(Xtr2, ytr2):
        Xtr2_fold, X_val = Xtr2[train_index], Xtr2[val_index]
        ytr2_fold, y_val = ytr2[train_index], ytr2[val_index]

        xgb_model = xgb.XGBRegressor(objective="reg:squarederror",
                                     verbosity=0,
                                     learning_rate=learning_rate,
                                     n_estimators=n_estimators
                                     )

        xgb_model.fit(Xtr2_fold, ytr2_fold)

        xgb_preds = xgb_model.predict(X_val)
        scores.append(mean_squared_error(y_val, xgb_preds))

    return np.mean(scores)

# Seed the sampler with the same seed used for the data split and the CV
# folds, so the hyper-parameter search is reproducible from run to run.
study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=12345))
study.optimize(func=objective, n_trials=30)
print(study.best_params)

/Users/jeromew/.pyenv/versions/3.9.1/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
[I 2024-03-19 22:44:06,244] A new study created in memory with name: no-name-aaabedc9-46b1-4d96-8380-55cbd091e661
[I 2024-03-19 22:45:04,341] Trial 0 finished with value: 0.30738036954574566 and parameters: {'learning_rate': 0.060833069383555406, 'n_estimators': 1880}. Best is trial 0 with value: 0.30738036954574566.
[I 2024-03-19 22:45:30,533] Trial 1 finished with value: 0.31015399413744166 and parameters: {'learning_rate': 0.09460936319049677, 'n_estimators': 774}. Best is trial 0 with value: 0.30738036954574566.
[I 2024-03-19 22:46:15,825] Trial 2 finished with value: 0.307577481719661 and parameters: {'learning_rate': 0.07204047491631509, 'n_estimators': 1449}. Best is trial 0 with value: 0.30738036954574566.
[I 2024-03-19 22:46:56,794] Trial 3 finished with value: 0.3092449938464673 and parameters: {'learning_rate': 0.06923893229177526, 'n_estimators': 1261}. Best is trial 0 with value: 0.30738036954574566.
[I 2024-03-19 22:47:25,644] Trial 4 finished with value: 0.3092600644066167 and parameters: {'learning_rate': 0.09754983694887447, 'n_estimators': 857}. Best is trial 0 with value: 0.30738036954574566.
[I 2024-03-19 22:47:50,051] Trial 5 finished with value: 0.3191691204250052 and parameters: {'learning_rate': 0.0530247042495883, 'n_estimators': 665}. Best is trial 0 with value: 0.30738036954574566.
[I 2024-03-19 22:48:33,966] Trial 6 finished with value: 0.30931469706954123 and parameters: {'learning_rate': 0.06444077495302468, 'n_estimators': 1328}. Best is trial 0 with value: 0.30738036954574566.
[I 2024-03-19 22:49:26,250] Trial 7 finished with value: 0.3097184429918773 and parameters: {'learning_rate': 0.051063539675557296, 'n_estimators': 1553}. Best is trial 0 with value: 0.30738036954574566.
[I 2024-03-19 22:50:01,194] Trial 8 finished with value: 0.3249103086468331 and parameters: {'learning_rate': 0.026099593905584306, 'n_estimators': 900}. Best is trial 0 with value: 0.30738036954574566.
[I 2024-03-19 22:50:57,406] Trial 9 finished with value: 0.3151928740535336 and parameters: {'learning_rate': 0.0294326492052586, 'n_estimators': 1603}. Best is trial 0 with value: 0.30738036954574566.
[I 2024-03-19 22:52:09,694] Trial 10 finished with value: 0.32227640400023777 and parameters: {'learning_rate': 0.014564746218513225, 'n_estimators': 1973}. Best is trial 0 with value: 0.30738036954574566.
[I 2024-03-19 22:53:12,429] Trial 11 finished with value: 0.30630432455435286 and parameters: {'learning_rate': 0.0805902980585565, 'n_estimators': 1938}. Best is trial 11 with value: 0.30630432455435286.
[I 2024-03-19 22:54:17,010] Trial 12 finished with value: 0.3058130642860485 and parameters: {'learning_rate': 0.08048588227575734, 'n_estimators': 1998}. Best is trial 12 with value: 0.3058130642860485.
[I 2024-03-19 22:55:14,999] Trial 13 finished with value: 0.3062823091602316 and parameters: {'learning_rate': 0.08374472433204481, 'n_estimators': 1750}. Best is trial 12 with value: 0.3058130642860485.
[I 2024-03-19 22:56:14,012] Trial 14 finished with value: 0.306195857233181 and parameters: {'learning_rate': 0.08597443652607173, 'n_estimators': 1730}. Best is trial 12 with value: 0.3058130642860485.
[I 2024-03-19 22:57:12,271] Trial 15 finished with value: 0.30630024779280285 and parameters: {'learning_rate': 0.08288170579717015, 'n_estimators': 1729}. Best is trial 12 with value: 0.3058130642860485.
[I 2024-03-19 22:57:51,274] Trial 16 finished with value: 0.31709825247952195 and parameters: {'learning_rate': 0.03924650352231968, 'n_estimators': 1025}. Best is trial 12 with value: 0.3058130642860485.
[I 2024-03-19 22:59:42,742] Trial 17 finished with value: 0.3696725984291914 and parameters: {'learning_rate': 0.001513941838738729, 'n_estimators': 1755}. Best is trial 12 with value: 0.3058130642860485.
[I 2024-03-19 23:00:30,425] Trial 18 finished with value: 0.30697863645312873 and parameters: {'learning_rate': 0.08877811645345303, 'n_estimators': 1437}. Best is trial 12 with value: 0.3058130642860485.
[I 2024-03-19 23:01:09,064] Trial 19 finished with value: 0.309753023607684 and parameters: {'learning_rate': 0.07389433722377375, 'n_estimators': 1082}. Best is trial 12 with value: 0.3058130642860485.
[I 2024-03-19 23:02:11,566] Trial 20 finished with value: 0.3051665343626448 and parameters: {'learning_rate': 0.09816410797966438, 'n_estimators': 1831}. Best is trial 20 with value: 0.3051665343626448.
[I 2024-03-19 23:03:13,171] Trial 21 finished with value: 0.30573173424293654 and parameters: {'learning_rate': 0.09605043241004642, 'n_estimators': 1797}. Best is trial 20 with value: 0.3051665343626448.
[I 2024-03-19 23:04:16,130] Trial 22 finished with value: 0.30560561322170504 and parameters: {'learning_rate': 0.09908057813759082, 'n_estimators': 1859}. Best is trial 20 with value: 0.3051665343626448.
[I 2024-03-19 23:05:18,786] Trial 23 finished with value: 0.3063965449381953 and parameters: {'learning_rate': 0.0996817965426739, 'n_estimators': 1830}. Best is trial 20 with value: 0.3051665343626448.
[I 2024-03-19 23:06:12,928] Trial 24 finished with value: 0.30577930717232704 and parameters: {'learning_rate': 0.0925811010598444, 'n_estimators': 1595}. Best is trial 20 with value: 0.3051665343626448.
[I 2024-03-19 23:07:08,321] Trial 25 finished with value: 0.3060915718375415 and parameters: {'learning_rate': 0.09977579176829841, 'n_estimators': 1632}. Best is trial 20 with value: 0.3051665343626448.
[I 2024-03-19 23:08:12,869] Trial 26 finished with value: 0.3051846627192057 and parameters: {'learning_rate': 0.09038473485505345, 'n_estimators': 1871}. Best is trial 20 with value: 0.3051665343626448.
[I 2024-03-19 23:09:02,370] Trial 27 finished with value: 0.30722468026563676 and parameters: {'learning_rate': 0.0766276264869035, 'n_estimators': 1421}. Best is trial 20 with value: 0.3051665343626448.
[I 2024-03-19 23:10:09,002] Trial 28 finished with value: 0.3064995308658916 and parameters: {'learning_rate': 0.09012860503999219, 'n_estimators': 1893}. Best is trial 20 with value: 0.3051665343626448.
[I 2024-03-19 23:11:12,457] Trial 29 finished with value: 0.3062500412000902 and parameters: {'learning_rate': 0.06389813461055553, 'n_estimators': 1859}. Best is trial 20 with value: 0.3051665343626448.

{'learning_rate': 0.09816410797966438, 'n_estimators': 1831}

study.best_params

# Refit on the whole scaled training set using the tuned hyper-parameters.
# importance_type='gain' determines what feature_importances_ reports.
fixed_params = {
    "objective": "reg:squarederror",
    "verbosity": 0,
    "importance_type": 'gain',
}
xgb_model_best = xgb.XGBRegressor(**fixed_params, **study.best_params)
xgb_model_best.fit(Xtr2, ytr2)

# Both figures below are in the units of the scaled target (yte2).
test_preds = xgb_model_best.predict(Xte2)
test_score = mean_squared_error(yte2, test_preds)
print(f"Test score: {test_score:.4f}")
print(f"RMSE:       {np.sqrt(test_score):.4f}")

Test score: 0.3003
RMSE:       0.5480

# Undo the target scaling so the error is expressed in rating points.
# The predictions are 1-D, so they get a column axis before inverse_transform.
test_preds_inverted = y_scaler.inverse_transform(test_preds[:, np.newaxis]).flatten()
y_test_inverted = y_scaler.inverse_transform(yte2).flatten()

test_df = pd.DataFrame({'y_test': y_test_inverted, 'test_preds': test_preds_inverted})
mse_inverted = mean_squared_error(test_df['y_test'], test_df['test_preds'])
RMSE_inverted = np.sqrt(mse_inverted)
print(f"RMSE: {RMSE_inverted:.4f}")

RMSE: 0.6633

# Scatter of predicted vs. actual test-set ratings (both in original rating
# units). A dedicated figure/axes pair is used so the points are not drawn
# on top of an earlier plot when the file is run as a script.
fig, ax = plt.subplots()
sns.scatterplot(data=test_df, x='y_test', y='test_preds',
                alpha=0.6, marker='.', edgecolor=None, ax=ax)

ax.set_xlabel('Actual rating')
ax.set_ylabel('Predicted rating')
ax.set_facecolor('white')
# Grid lines sit behind the markers.
ax.grid(True)
ax.set_axisbelow(True)


# title
fig.text(0.5, 1, "Predicting BGG Boardgame Ratings", ha='center', weight='bold', size=16)

# subtitle
fig.text(0.5, 0.94, "Predicted Ratings vs. Actual Ratings", ha='center', weight='normal', size=16)

# caption
caption_text = """
Source: BoardGameGeek.com; Jerome Williams
Note: Predicted ratings from XGBoost model."""
fig.text(0.1, -0.10, caption_text, ha='left', size=9)

# bbox_inches='tight' keeps the title/caption, which are placed outside the
# axes area, inside the saved page.
fig.savefig("output/01_predicted_vs_actual_XGB01.pdf", format="pdf", bbox_inches='tight')

# Pair each feature name with the fitted model's importance score and order
# the table from most to least important.
importance_df = (
    pd.DataFrame({'feature': features,
                  'importance': xgb_model_best.feature_importances_})
    .sort_values('importance', ascending=False)
)

# Horizontal bar chart of the 20 highest-scoring features, drawn on its own
# figure so it does not land on a previously used axes when run as a script.
fig, ax = plt.subplots()
sns.barplot(data=importance_df.head(20), x='importance', y='feature', ax=ax)

# Grid lines sit behind the bars.
ax.set_axisbelow(True)
ax.grid(True)

# title
fig.text(0.5, 1, "Predicting BGG Boardgame Ratings", ha='center', weight='bold', size=16)

# subtitle
fig.text(0.5, 0.94, "Top 20 features by importance, XGBoost model", ha='center', weight='normal', size=16)

# caption
caption_text = """Source: BoardGameGeek.com; Jerome Williams
Note: Feature importance is based on the gain metric, which measures the average improvement in accuracy 
across all splits the feature is used in."""

fig.text(-0.2, -0.15, caption_text, ha='left', size=9)

ax.set_xlabel('Feature Importance')
ax.set_ylabel('Feature')
plt.show()
# NOTE: this figure is only displayed; the export below is intentionally disabled.
# fig.savefig("output/02_variable_importances_XGB01.pdf", format="pdf", bbox_inches='tight')

Filename	Description
artists_reduced.csv	Artists associated with each game, stored in one-hot format.
designers_reduced.csv	Designers associated with each game, stored in one-hot format.
games.csv	Tabular data for each boardgame, including year published, minimum and maximum number of players, and average rating.
mechanics.csv	Game mechanics associated with each boardgame, stored in one-hot format.
publishers_reduced.csv	Publishers associated with each game, stored in one-hot format.
ratings_distribution.csv	The distribution of ratings for each game, stored as, for each numerical rating (e.g., 7.5), the number of times the boardgame has received that rating.
subcategories.csv	Subcategories associated with each game. Example subcategories are "Exploration," "Miniatures," "Territory Building," and "Card Game." Categories, as opposed to subcategories, are stored in games.csv.
themes.csv	Themes associated with each game. Example themes are "World War I", "Humor", and "Traffic".
user_ratings.csv	This seems to be a complete set of ratings data. Each row is a boardgame, username, and numerical rating.

Predicting BGG ratings from boardgame characteristics

Preamble

Data

Initial exploration

Rating data

Boardgames with at least 1,000 reviews

Feature engineering

Mechanics

Subcategories

Designers

Themes

Playing time and player count

Variable scaling

Results

Predictions

Feature importance