广告投放中的CTR预估模型 (CTR prediction model for ad serving)
# Imports for the CTR-prediction pipeline (deduplicated: the export had the
# import cell pasted twice and fused with the next cell).
import gc
import os

import numpy as np
import pandas as pd
import sklearn
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# ---- Data loading ----
data_path = './datasets/'
train = pd.read_csv(os.path.join(data_path, 'train.csv'))
test = pd.read_csv(os.path.join(data_path, 'test.csv'))

# Field dictionary (from the competition description):
#   id                  - behaviour id, unique, no duplicates
#   date                - behaviour time, second precision
#   user_id             - user id
#   product             - product
#   campaign_id         - campaign id
#   webpage_id          - webpage id
#   product_category_id - product category id
#   user_group_id       - user group id
#   gender              - gender
#   age_level           - age level
#   user_depth          - user value depth
#   var_1               - anonymous feature
#   isClick             - click label: 1 = clicked, 0 = not clicked
# Combine train and test so feature engineering is applied to both consistently.
data = pd.concat([train, test], ignore_index=True)

# Extract day-of-month and minute-of-day from the 'date' string.
# NOTE(review): the slice positions assume a fixed-width 'dd/mm hh:mm'-style
# date string - confirm against the raw 'date' format.
data['day_id'] = data['date'].apply(lambda x: int(x[3:5]))
data['minute_id'] = data['date'].apply(lambda x: int(x[-5:-3]) * 60 + int(x[-2:]))

# Feature engineering: build the gap between a user's consecutive views within
# a day (and its derived mean), since browsing cadence tends to correlate with
# whether the user clicks.
data['minute_id'].shift(-1)0 0.0
1 0.0
2 0.0
3 1.0
4 1.0
...
463286 1439.0
463287 1439.0
463288 1439.0
463289 1439.0
463290 NaN
Name: minute_id, Length: 463291, dtype: float64
data.groupby(['user_id','day_id'])['minute_id'].apply(lambda x :x.shift(-1) -x)0 NaN
1 0.0
2 2.0
3 NaN
4 NaN
...
463286 NaN
463287 NaN
463288 0.0
463289 NaN
463290 NaN
Name: minute_id, Length: 463291, dtype: float64
data.groupby(['user_id','day_id'])['minute_id'].agg(lambda x :x.shift(-1) -x)user_id day_id
0 2 NaN
6 NaN
1 2 [0.0, 2.0, nan]
2 2 NaN
3 [0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 6.0, nan]
...
150342 7 [0.0, nan]
150343 7 NaN
150344 7 NaN
150345 7 NaN
150346 7 [0.0, nan]
Name: minute_id, Length: 249625, dtype: object
data.groupby(['user_id','day_id'])['minute_id'].transform(lambda x :x.shift(-1) -x)0 NaN
1 0.0
2 2.0
3 NaN
4 NaN
...
463286 NaN
463287 NaN
463288 0.0
463289 NaN
463290 NaN
Name: minute_id, Length: 463291, dtype: float64
# Gap (in minutes) to the same user's next view on the same day.
# transform() broadcasts the per-group result back to row level;
# the last view of each (user_id, day_id) group gets NaN.
data['minute_id_diff'] = (
    data.groupby(['user_id', 'day_id'])['minute_id']
        .transform(lambda x: x.shift(-1) - x)
)
data.groupby(['user_id','day_id'])['minute_id_diff'].agg('mean')user_id day_id
0 2 NaN
6 NaN
1 2 1.000000
2 2 NaN
3 1.142857
...
150342 7 0.000000
150343 7 NaN
150344 7 NaN
150345 7 NaN
150346 7 0.000000
Name: minute_id_diff, Length: 249625, dtype: float64
data.groupby(['user_id','day_id'])['minute_id_diff'].mean()user_id day_id
0 2 NaN
6 NaN
1 2 1.000000
2 2 NaN
3 1.142857
...
150342 7 0.000000
150343 7 NaN
150344 7 NaN
150345 7 NaN
150346 7 0.000000
Name: minute_id_diff, Length: 249625, dtype: float64
# Per-(user, day) mean of the view-gap feature, broadcast back to each row.
data['minute_id_diff_mean'] = (
    data.groupby(['user_id', 'day_id'])['minute_id_diff'].transform('mean')
)

# Frequency encoding: number of occurrences of each id value over the full
# dataset ('minute_id' is only used as a non-null column to count on).
for col in ['user_id', 'product', 'campaign_id', 'webpage_id',
            'product_category_id', 'user_group_id']:
    data['{}_count'.format(col)] = data.groupby(col)['minute_id'].transform('count')
ycol = 'isClick'
# Excluded from the feature set: the label and pure identifiers.
drop_list = [
    ycol,
    'id',
    'date',
]
features = [x for x in data.columns if x not in drop_list]
print("使用{} 个特征:{}".format(len(features), features))

# Cast id-like columns so LightGBM treats them as categorical features.
categorical_feature = [
    'user_id',
    'product',
    'campaign_id',
    'webpage_id',
    'product_category_id',
    'user_group_id',
    'gender',
    'age_level',
    'user_depth',
]
for col in categorical_feature:
    data[col] = data[col].astype('category')

# Labeled rows are the training set; unlabeled rows are the test set.
# .copy() makes each an independent frame, fixing the SettingWithCopyWarning
# raised later when predictions are written into `test`.
train = data[~data[ycol].isnull()].copy()
test = data[data[ycol].isnull()].copy()

# Free the combined frame before training.
del data
gc.collect()
五折交叉验证训练模型
NFLOD = 5
random_state = 2021
KF = StratifiedKFold(n_splits=NFLOD, shuffle=True, random_state=random_state)

# LightGBM parameters.
# Fixed two misspelled keys (the training log printed
# "[Warning] Unknown parameter" for both, so they were silently ignored):
#   'force_row_size' -> 'force_row_wise'
#   'subsamples'     -> 'subsample' (row bagging fraction, used with subsample_freq)
params_lgb = {
    'boosting': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'force_row_wise': True,
    'random_state': random_state,
    'learning_rate': 0.03,
    'max_depth': 8,
    'num_leaves': 40,
    'subsample': 0.8,
    'subsample_freq': 3,
    'colsample_bytree': 0.8,
    'n_jobs': -1,
    'verbose': -1,
}

oof_lgb = np.zeros(len(train))         # out-of-fold predictions on train
predictions_lgb = np.zeros(len(test))  # test predictions, averaged over folds
df_importance_list = []

# 5-fold stratified CV: train on 4 folds, early-stop on the held-out fold.
for fold_, (trn_idx, val_idx) in enumerate(KF.split(train[features], train[ycol])):
    print('------------fold{}-----------'.format(fold_ + 1))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features],
                           label=train.iloc[trn_idx][ycol])
    val_data = lgb.Dataset(train.iloc[val_idx][features],
                           label=train.iloc[val_idx][ycol],
                           reference=trn_data)
    clf_lgb = lgb.train(
        params=params_lgb,
        train_set=trn_data,
        valid_sets=[trn_data, val_data],
        valid_names=('train', 'val'),
        num_boost_round=50000,
        early_stopping_rounds=200,
        verbose_eval=100,
    )
    # Out-of-fold prediction for CV scoring; test prediction averaged over folds.
    oof_lgb[val_idx] = clf_lgb.predict(train.iloc[val_idx][features],
                                       num_iteration=clf_lgb.best_iteration)
    predictions_lgb += clf_lgb.predict(test[features],
                                       num_iteration=clf_lgb.best_iteration) / NFLOD
    df_importance = pd.DataFrame({
        'column': features,
        'importance_split': clf_lgb.feature_importance(importance_type='split'),
        'importance_gain': clf_lgb.feature_importance(importance_type='gain'),
    })
    df_importance_list.append(df_importance)
[LightGBM] [Warning] Unknown parameter: subsamples
[LightGBM] [Warning] Unknown parameter: force_row_size
/Users/gaozhiyong/Documents/pyenv/pyenv3.6/lib/python3.6/site-packages/lightgbm/basic.py:1433: UserWarning: Overriding the parameters from Reference Dataset.
_log_warning('Overriding the parameters from Reference Dataset.')
/Users/gaozhiyong/Documents/pyenv/pyenv3.6/lib/python3.6/site-packages/lightgbm/basic.py:1245: UserWarning: categorical_column in param dict is overridden.
_log_warning('{} in param dict is overridden.'.format(cat_alias))
Training until validation scores don't improve for 200 rounds
[100] train's auc: 0.677174 val's auc: 0.628599
[200] train's auc: 0.690672 val's auc: 0.63003
[300] train's auc: 0.702212 val's auc: 0.631905
[400] train's auc: 0.711116 val's auc: 0.632443
[500] train's auc: 0.7182 val's auc: 0.632929
[600] train's auc: 0.725516 val's auc: 0.632789
Early stopping, best iteration is:
[499] train's auc: 0.718131 val's auc: 0.632934
------------fold2-----------
[LightGBM] [Warning] Unknown parameter: subsamples
[LightGBM] [Warning] Unknown parameter: force_row_size
Training until validation scores don't improve for 200 rounds
[100] train's auc: 0.679197 val's auc: 0.620851
[200] train's auc: 0.693258 val's auc: 0.622213
[300] train's auc: 0.703885 val's auc: 0.623473
[400] train's auc: 0.714255 val's auc: 0.62404
[500] train's auc: 0.720288 val's auc: 0.624115
[600] train's auc: 0.726659 val's auc: 0.624428
[700] train's auc: 0.732689 val's auc: 0.624295
[800] train's auc: 0.73929 val's auc: 0.624619
[900] train's auc: 0.744832 val's auc: 0.624663
[1000] train's auc: 0.754675 val's auc: 0.62499
[1100] train's auc: 0.759126 val's auc: 0.624757
[1200] train's auc: 0.763615 val's auc: 0.62502
[1300] train's auc: 0.769984 val's auc: 0.624564
Early stopping, best iteration is:
[1164] train's auc: 0.761882 val's auc: 0.625101
------------fold3-----------
[LightGBM] [Warning] Unknown parameter: subsamples
[LightGBM] [Warning] Unknown parameter: force_row_size
Training until validation scores don't improve for 200 rounds
[100] train's auc: 0.677937 val's auc: 0.623831
[200] train's auc: 0.691909 val's auc: 0.6257
[300] train's auc: 0.702329 val's auc: 0.626483
[400] train's auc: 0.712792 val's auc: 0.626889
[500] train's auc: 0.719476 val's auc: 0.627282
[600] train's auc: 0.726372 val's auc: 0.627444
[700] train's auc: 0.732891 val's auc: 0.627488
[800] train's auc: 0.739184 val's auc: 0.627696
[900] train's auc: 0.747272 val's auc: 0.62776
Early stopping, best iteration is:
[756] train's auc: 0.736675 val's auc: 0.627805
------------fold4-----------
[LightGBM] [Warning] Unknown parameter: subsamples
[LightGBM] [Warning] Unknown parameter: force_row_size
Training until validation scores don't improve for 200 rounds
[100] train's auc: 0.678244 val's auc: 0.632375
[200] train's auc: 0.693808 val's auc: 0.633363
[300] train's auc: 0.703073 val's auc: 0.633338
[400] train's auc: 0.712667 val's auc: 0.633605
Early stopping, best iteration is:
[212] train's auc: 0.695547 val's auc: 0.633654
------------fold5-----------
[LightGBM] [Warning] Unknown parameter: subsamples
[LightGBM] [Warning] Unknown parameter: force_row_size
Training until validation scores don't improve for 200 rounds
[100] train's auc: 0.677896 val's auc: 0.627158
[200] train's auc: 0.692087 val's auc: 0.628503
[300] train's auc: 0.701904 val's auc: 0.62927
[400] train's auc: 0.711159 val's auc: 0.629929
[500] train's auc: 0.718229 val's auc: 0.629463
[600] train's auc: 0.725839 val's auc: 0.629527
Early stopping, best iteration is:
[401] train's auc: 0.711302 val's auc: 0.630001
# Overall cross-validation AUC computed from the out-of-fold predictions.
valid_auc_score = roc_auc_score(train[ycol], oof_lgb)
print(valid_auc_score)
特征重要性
# Average feature importances over the five folds.
df_features_importances = pd.concat(df_importance_list)
df_features_importance = (
    df_features_importances.groupby('column').mean().reset_index()
)
# Rank features by average gain (descending) for inspection.
df_features_importance.sort_values('importance_gain', ascending=False)
预测
# Write the fold-averaged predictions into the label column of the test rows
# (safe now that `test` is a .copy() of the split, not a view).
test.loc[:, ycol] = predictions_lgb

# Save the prediction result.
test[['user_id', 'product', 'campaign_id', ycol]].to_csv('res.csv', index=False)
👋 读书城南,🤔 在未来面前,我们都是孩子~
- 📙 一个热衷于探索学习新方向、新事物的智能产品经理,闲暇时间喜欢coding💻、画图🎨、音乐🎵、学习ing~
👋 Social Media
🛠️ Blog: http://oceaneyes.top
⚡ PM导航: https://pmhub.oceangzy.top
☘️ CNBLOG: https://www.cnblogs.com/oceaneyes-gzy/
🌱 AI PRJ自己部署的一些算法demo: http://ai.oceangzy.top/
📫 Email: 1450136519@qq.com
💬 WeChat: OCEANGZY
💬 公众号: UncleJoker-GZY
👋 加入小组~

👋 感谢打赏~

本博客所有文章除特别声明外,均采用 CC BY-NC-SA 4.0 许可协议。转载请注明来自 OCEAN.GZY读书城南!