GBDT+LR 的 sklearn API 简单实现

  • 2020 年 4 月 25 日
  • AI
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator,ClassifierMixin


class gbdt_lr(BaseEstimator, ClassifierMixin):
    """GBDT + LR stacked classifier with a scikit-learn style interface.

    A tree-boosting model is fitted first. Every sample is then described by
    the leaf it reaches in each tree, those (tree, leaf) pairs are turned
    into a sparse bag-of-words matrix by a ``CountVectorizer``, and a second
    classifier is fitted on that matrix.

    Parameters
    ----------
    gbdt : callable or estimator instance
        Tree model whose ``predict(X, pred_leaf=True)`` returns an array of
        leaf indices, one column per tree. A callable (normally the
        estimator class) is called with ``gbdt_params``; anything else is
        treated as an already-built estimator and used as given.
    lr : callable or estimator instance
        Classifier fitted on the leaf features; must provide ``fit``,
        ``predict`` and ``predict_proba``. Built the same way as ``gbdt``,
        from ``lr_params``.
    gbdt_params, lr_params : dict or None
        Keyword arguments for the two constructors; ``None`` means none.
    cv : CountVectorizer or None
        Vectorizer applied to the leaf tokens. ``None`` creates a fresh one
        for this instance.

    Attributes
    ----------
    result : sparse matrix
        Leaf-feature matrix of the training data, stored by ``fit``.

    Raises
    ------
    TypeError
        If ``gbdt`` or ``lr`` is not supplied.
    """

    def __init__(self, gbdt=None, lr=None, gbdt_params=None, lr_params=None, cv=None):
        # BaseEstimator.get_params() (and therefore repr()) reads one
        # attribute per constructor argument, so the dicts are kept too.
        self.gbdt_params = gbdt_params
        self.lr_params = lr_params
        self.gbdt = self._build(gbdt, gbdt_params, "gbdt")
        self.lr = self._build(lr, lr_params, "lr")
        if cv is None:
            # Built per instance: a vectorizer created in the signature would
            # be shared, and refitted, by every gbdt_lr object.
            # token_pattern keeps every whitespace-separated token; the
            # library default only keeps tokens of two or more characters.
            # min_df stays at its default of 1, which keeps every token seen
            # during fit.
            cv = CountVectorizer(
                analyzer="word",
                ngram_range=(1, 1),
                token_pattern=r"\S+",
                lowercase=False,
            )
        self.cv = cv

    @staticmethod
    def _build(estimator, params, role):
        """Return a ready-to-fit estimator for the constructor argument ``role``."""
        if estimator is None:
            raise TypeError(
                f"gbdt_lr needs the {role!r} argument: an estimator class or instance"
            )
        if callable(estimator):
            return estimator(**(params or {}))
        return estimator

    def _leaf_documents(self, X):
        """Return one space-separated string of ``t<tree>_<leaf>`` tokens per sample."""
        leaf_idx = self.gbdt.predict(X, pred_leaf=True)
        # Force (n_samples, n_trees); NOTE(review): a single-tree model may
        # hand back a 1-D array - confirm against the GBDT library in use.
        leaf_idx = leaf_idx.reshape(leaf_idx.shape[0], -1)
        # The tree index is part of the token: leaf ids restart at 0 in every
        # tree, so bare ids from different trees would share one feature.
        return [
            " ".join(f"t{tree}_{leaf}" for tree, leaf in enumerate(row))
            for row in leaf_idx.tolist()
        ]

    def fit(self, X, y):
        """Fit the tree model, the leaf vectorizer and the final classifier.

        Returns ``self``.
        """
        self.gbdt.fit(X, y)
        self.result = self.cv.fit_transform(self._leaf_documents(X))
        self.lr.fit(self.result, y)
        return self

    def predict_proba(self, X):
        """Return the final classifier's class probabilities for ``X``."""
        return self.lr.predict_proba(self.cv.transform(self._leaf_documents(X)))

    def predict(self, X):
        """Return the final classifier's predicted labels for ``X``."""
        return self.lr.predict(self.cv.transform(self._leaf_documents(X)))
     

from sklearn.metrics import roc_auc_score
import gc
import warnings
warnings.filterwarnings("ignore")

# 5-fold stratified cross-validation of gbdt_lr, scored with ROC AUC.
# NOTE(review): X, y, pd, lgb, StratifiedKFold and LogisticRegression are not
# defined in this snippet; they must already be in scope.
skt = StratifiedKFold(5, shuffle=True)

splits = skt.split(X, y)
score_train = []
score_valid = []
clfs = []
X = pd.DataFrame(X)
y = pd.DataFrame(y)
for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X.iloc[train_index].values, X.iloc[valid_index].values
    # Flatten the label column to 1-D; estimators and metrics expect a
    # vector rather than an (n, 1) frame.
    y_train = y.iloc[train_index].values.ravel()
    y_valid = y.iloc[valid_index].values.ravel()

    clf = gbdt_lr(
        gbdt=lgb.LGBMClassifier,
        gbdt_params={'n_estimators': 500},
        lr=LogisticRegression,
        lr_params={'C': 1.0},
    )
    clf.fit(X_train, y_train)
    # Keep the fitted model of every fold.
    clfs.append(clf)

    y_pred_train = clf.predict_proba(X_train)[:, 1]
    y_pred_valid = clf.predict_proba(X_valid)[:, 1]

    score_train.append(roc_auc_score(y_train, y_pred_train))
    score_valid.append(roc_auc_score(y_valid, y_pred_valid))

    # Drop the fold's arrays before the next split is materialised.
    del X_train, X_valid, y_train, y_valid

    gc.collect()

这只是一个简单实现:类型检查、报错提示等都没有写(比较麻烦,而且只是自己用)。叶节点的转换用词袋模型很方便,暂时没有做性能优化。感兴趣的读者可以修改后分享出来。