GBDT+LR 的 sklearn API 简单实现
- 2020 年 4 月 25 日
- AI
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator,ClassifierMixin
class gbdt_lr(BaseEstimator, ClassifierMixin):
    """GBDT + logistic-regression classifier with a scikit-learn style API.

    The tree model is fitted first. For every sample, the leaf reached in each
    tree is written as a token ``t<tree>_<leaf>``; the tokens of one sample
    form a "document" that the vectorizer turns into a sparse count matrix,
    and the linear model is trained on that matrix.

    Parameters
    ----------
    gbdt : callable or estimator instance
        Tree model whose ``predict(X, pred_leaf=True)`` returns the leaf
        indices (e.g. ``lightgbm.LGBMClassifier``). A callable (class or
        factory) is called with ``gbdt_params``; anything else is used as an
        already-built estimator.
    lr : callable or estimator instance
        Linear model exposing ``fit``, ``predict`` and ``predict_proba``.
        Handled like ``gbdt``, with ``lr_params``.
    gbdt_params, lr_params : dict or None
        Keyword arguments for the ``gbdt`` / ``lr`` callables. ``None`` means
        "no arguments". Ignored when an estimator instance is passed.
    cv : vectorizer or None
        Object with ``fit_transform`` / ``transform`` that accepts a list of
        strings. ``None`` builds a fresh ``CountVectorizer`` per instance.

    Attributes
    ----------
    gbdt, lr, cv : the estimator / vectorizer instances actually used.
    result : the training matrix produced by ``cv`` during the last ``fit``.
    """

    def __init__(self, gbdt=None, lr=None, gbdt_params=None, lr_params=None,
                 cv=None):
        # Store every constructor argument under its own name; otherwise
        # BaseEstimator.get_params() (and therefore repr()) cannot find them.
        self.gbdt_params = gbdt_params
        self.lr_params = lr_params
        self.gbdt = self._instantiate(gbdt, gbdt_params, "gbdt")
        self.lr = self._instantiate(lr, lr_params, "lr")
        if cv is None:
            # Built per instance: a vectorizer created in the signature would
            # be one object shared (vocabulary included) by all instances.
            # token_pattern keeps every whitespace-separated token, and
            # min_df=1 selects the same vocabulary as the former min_df=0
            # while staying inside the range scikit-learn validates.
            cv = CountVectorizer(
                analyzer='word',
                token_pattern=r"\S+",
                ngram_range=(1, 1),
                min_df=1,
            )
        self.cv = cv

    @staticmethod
    def _instantiate(spec, params, role):
        """Build the estimator for *role* from a callable, or pass an instance through."""
        if spec is None:
            raise TypeError(
                "gbdt_lr needs an estimator class or instance for %r, got None"
                % role
            )
        if callable(spec):
            return spec(**(params or {}))
        return spec

    def _leaf_documents(self, X):
        """Return one space-separated token string per sample of *X*.

        Each token carries the tree position as well as the leaf index, so
        leaf 3 of tree 0 and leaf 3 of tree 1 stay distinct features and
        single-digit leaf indices are not discarded by the tokenizer.
        """
        leaves = self.gbdt.predict(X, pred_leaf=True)
        return [
            ' '.join(
                "t%d_%s" % (tree, leaf) for tree, leaf in enumerate(row)
            )
            for row in leaves.tolist()
        ]

    def fit(self, X, y):
        """Fit the tree model, the vectorizer and the linear model; return self."""
        self.gbdt.fit(X, y)
        # Kept as an attribute because the original implementation exposed it.
        self.result = self.cv.fit_transform(self._leaf_documents(X))
        self.lr.fit(self.result, y)
        return self

    def predict_proba(self, X):
        """Return the linear model's class probabilities for *X*."""
        features = self.cv.transform(self._leaf_documents(X))
        return self.lr.predict_proba(features)

    def predict(self, X):
        """Return the linear model's predicted labels for *X*.

        Needed by ``ClassifierMixin.score``, which calls ``predict``.
        """
        features = self.cv.transform(self._leaf_documents(X))
        return self.lr.predict(features)
from sklearn.metrics import roc_auc_score
import gc
import warnings
# Demo: 5-fold stratified cross-validation of the GBDT+LR model.
# NOTE(review): X, y, pd, lgb, StratifiedKFold and LogisticRegression are not
# defined in this snippet; they must already be in scope when it runs.

# Silence library warnings for the demo run.
warnings.filterwarnings("ignore")

skt = StratifiedKFold(5, shuffle=True)
splits = skt.split(X, y)
score_train = []  # training AUC of each fold
score_valid = []  # validation AUC of each fold
clfs = []         # fitted model of each fold

# Wrap in DataFrames so that positional .iloc indexing works for any
# array-like input.
X = pd.DataFrame(X)
y = pd.DataFrame(y)

for train_index, valid_index in splits:
    X_train, X_valid = X.iloc[train_index].values, X.iloc[valid_index].values
    # Flatten the single target column to 1-D, the shape estimators expect.
    y_train = y.iloc[train_index].values.ravel()
    y_valid = y.iloc[valid_index].values.ravel()

    clf = gbdt_lr(
        gbdt=lgb.LGBMClassifier,
        gbdt_params={'n_estimators': 500},
        lr=LogisticRegression,
        lr_params={'C': 1.0},
    )
    clf.fit(X_train, y_train)
    clfs.append(clf)

    # Column 1 holds the probability of the positive class.
    y_pred_train = clf.predict_proba(X_train)[:, 1]
    y_pred_valid = clf.predict_proba(X_valid)[:, 1]
    score_train.append(roc_auc_score(y_train, y_pred_train))
    score_valid.append(roc_auc_score(y_valid, y_pred_valid))

    # Release the fold's data before the next split is materialized.
    del X_train, X_valid, y_train, y_valid
    gc.collect()
这只是一个简单实现:类型检查、报错提示等都没有写(太麻烦了,反正是自己用)。叶节点的转化用词袋模型来做很方便,暂时没有做性能优化。感兴趣的朋友可以修改后再分享出来。