代码拉取完成,页面将自动刷新
DF赛题 企业非法集资风险预测 https://www.datafountain.cn/competitions/469
特征工程
# ===================== amount_feas: binning features =====================
for amount_col in tqdm(amount_feas, desc="分箱特征"):
    amount_values = data[amount_col]

    # Equal-width bins: floor division by 1000, so the bin index is
    # value // 1000 and every bin spans 1000 units of the feature.
    equal_width_name = '{}_bin1'.format(amount_col)
    data[equal_width_name] = np.floor_divide(amount_values, 1000)

    # Exponential-width bins: floor(log10(value)), i.e. the order of magnitude.
    # NOTE(review): np.log10 gives -inf for 0 and NaN for negative inputs;
    # confirm the amount columns are strictly positive or that the
    # downstream models tolerate those values.
    log_width_name = '{}_bin2'.format(amount_col)
    data[log_width_name] = np.floor(np.log10(amount_values))
# ===================== amount_feas: group-wise aggregate features =====================
# (column-name template, pandas aggregation name) pairs, listed in the order
# in which the columns are created.
_group_stat_specs = (
    ('{}_{}_medi', 'median'),
    ('{}_{}_mean', 'mean'),
    ('{}_{}_max', 'max'),
    ('{}_{}_min', 'min'),
    ('{}_{}_std', 'std'),
)
for amount_col in tqdm(amount_feas, desc="amount_feas 基本聚合特征"):
    for group_col in category_fea:
        if amount_col == group_col:
            # A column is never aggregated against itself.
            continue
        for name_template, stat_name in _group_stat_specs:
            # transform() broadcasts the per-group statistic back onto every
            # row, so the new column is aligned with `data`.
            data[name_template.format(group_col, amount_col)] = (
                data.groupby(group_col)[amount_col].transform(stat_name)
            )
# =================== amount_feas: pairwise interaction features ===================
# Every ordered pair of distinct amount columns is visited, so both
# (a, b) and (b, a) produce their own set of columns.
for left_col in tqdm(amount_feas, desc="amount_feas 基本交叉特征"):
    for right_col in amount_feas:
        if left_col == right_col:
            continue

        # Operate on the raw arrays (positional arithmetic, no index alignment).
        left = data[left_col].values
        right = data[right_col].values

        # NOTE(review): the ratio is inf/NaN wherever `right` is 0; confirm
        # this is acceptable for the downstream models.
        data['{}_{}_ratio'.format(left_col, right_col)] = left / right
        data['{}_{}_multi'.format(left_col, right_col)] = left * right
        data['{}_{}_add'.format(left_col, right_col)] = left + right
        data['{}_{}_diff'.format(left_col, right_col)] = left - right
# Row-wise mean, standard deviation and sum across all amount_feas columns.
amount_frame = data[amount_feas]
data['nmean'] = amount_frame.mean(axis=1)
data['ntd'] = amount_frame.std(axis=1)
data['nsum'] = amount_frame.sum(axis=1)
# For every ordered pair of distinct categorical columns, count how many
# distinct values of the second column occur within each value of the first,
# then attach that count to every row via its value of the first column.
for key_col in tqdm(category_fea, desc="类别特征nunique特征"):
    for counted_col in category_fea:
        if key_col == counted_col:
            continue
        distinct_per_key = data.groupby(key_col)[counted_col].nunique()
        data['nuni_{0}_{1}'.format(key_col, counted_col)] = data[key_col].map(distinct_per_key)
# ===================== 5-fold target (mean-label) encoding =====================
# Fold id is the index value modulo 5.
# NOTE(review): this assumes `data` has an integer index -- confirm.
data['ID'] = data.index
data['fold'] = data['ID'] % 5
# Rows with a missing label are moved to an extra fold 5
# (presumably the unlabeled evaluation rows -- verify against the caller).
data.loc[data['label'].isnull(), 'fold'] = 5

target_feat = []
for cat_col in tqdm(cat_list, desc="5折转化率特征"):
    encoded_col = cat_col + '_mean_last_1'
    target_feat.append(encoded_col)
    data[encoded_col] = None

    for fold_id in range(6):
        in_fold = data['fold'] == fold_id
        # Statistics come only from labelled rows outside the current fold,
        # so a row's own label never contributes to its encoding. For
        # fold 5 this means all labelled rows are used.
        source_rows = data[(data['fold'] != fold_id) & (data['fold'] != 5)]
        label_mean_by_value = source_rows.groupby(cat_col)['label'].mean()
        data.loc[in_fold, encoded_col] = data[in_fold][cat_col].map(label_mean_by_value)

    # The column was created holding None (object dtype); make it numeric.
    data[encoded_col] = data[encoded_col].astype(float)
模型选定
为了保证模型结果的稳定性,模型融合没有采用复杂的操作,而是将lgb、xgb、cat三个模型输出的预测概率文件相加后取平均作为融合结果,B榜结果很稳定
具体可查看image下的README
首先从百度网盘下载镜像,地址如下:
链接:https://pan.baidu.com/s/1u--ICcRNktnyZGXqjXknUA 提取码:6a55 复制这段内容后打开百度网盘手机App,操作更方便哦
下载得到的镜像tar包名称为'great.tar'(以实际文件名为准),然后执行 docker load -i great.tar 加载镜像
查看已加载镜像的ID,然后运行run.sh
docker run -v /bdci2020/469/illegal-fundraising-prediction/:/data a4683ac2023f sh /data/image/run.sh
docker run -v /home/quincyqiang/qiye/:/data a4683ac2023f sh /data/image/run.sh
解释:
/home/quincyqiang/qiye/ 为本地宿主机目录
/data 为容器的目录
a4683ac2023f 为镜像ID
成绩复现项目目录结构如下:
qiye/
├── data
│ ├── code 程序目录
│ │ ├── catboot.py
│ │ ├── ensemble.py
│ │ ├── gen_feas.py
│ │ ├── gen_feas_v2.py
│ │ ├── __init__.py
│ │ ├── lgb.py
│ │ ├── nohup.out
│ │ └── xgb.py
│ ├── prediction_result 运行结果文件
│ │ ├── cat.csv:catboost模型结果文件
│ │ ├── lgb.csv:lightgbm 模型结果文件
│ │ ├── result.csv 三个单模融合文件
│ │ └── xgb.csv xgboost模型结果文件
│ ├── raw_data 比赛数据目录
│ │ ├── entprise_evaluate.csv
│ │ ├── entprise_submit.csv
│ │ └── train
│ │ ├── annual_report_info.csv
│ │ ├── base_info.csv
│ │ ├── change_info.csv
│ │ ├── entprise_info.csv
│ │ ├── news_info.csv
│ │ ├── other_info.csv
│ │ └── tax_info.csv
│ ├── README.md
│ └── user_data
└── image
├── great.tar 打包完整的镜像
├── READEME.md
└── run.sh 程序运行入口sh文件
Package Version
--------------- -------
catboost 0.18.1
cycler 0.10.0
dill 0.3.3
graphviz 0.15
joblib 0.17.0
kiwisolver 1.3.1
lightgbm 2.3.1
matplotlib 3.3.3
numpy 1.17.4
pandarallel 1.5.1
pandas 1.1.4
Pillow 8.0.1
pip 20.3.1
plotly 4.14.1
pyparsing 2.4.7
python-dateutil 2.8.1
pytz 2020.4
retrying 1.3.3
scikit-learn 0.23.2
scipy 1.5.4
setuptools 41.0.1
six 1.15.0
threadpoolctl 2.1.0
tqdm 4.54.1
wheel 0.33.1
xgboost 0.90
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违反国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。