Comparing lr, lgb, lr+lgb, and lr+lgb + raw features

  • April 25, 2020
  • AI
import os
import random
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
import catboost as cab
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(123123123)
import category_encoders as ce

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (Input, Dense, Embedding, Reshape, Flatten,
                                     concatenate, Lambda, Dropout, BatchNormalization)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2, l1_l2


def maybe_download(train_data, test_data):
    """If the Adult data files "train.csv" and "test.csv" are not in the
    working directory, download them from the UCI mirror.
    """

    COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num",
               "marital_status", "occupation", "relationship", "race", "gender",
               "capital_gain", "capital_loss", "hours_per_week", "native_country",
               "income_bracket"]

    if not os.path.exists(train_data):
        print("downloading training data...")
        df_train = pd.read_csv("https://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data",
            names=COLUMNS, skipinitialspace=True)
    else:
        df_train = pd.read_csv("train.csv")

    if not os.path.exists(test_data):
        print("downloading testing data...")
        # skiprows=1 skips the "|1x3 Cross validator" comment line at the top of adult.test
        df_test = pd.read_csv("https://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.test",
            names=COLUMNS, skipinitialspace=True, skiprows=1)
    else:
        df_test = pd.read_csv("test.csv")

    return df_train, df_test

train, test = maybe_download('train.csv', 'test.csv')
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

Meaning of the data fields:

Data source: the UCI Adult (Census Income) dataset.

age: continuous.

workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.

fnlwgt: continuous.

education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.

education_num: continuous.

marital_status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.

occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.

relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.

race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.

gender: Female, Male.

capital_gain: continuous.

capital_loss: continuous.

hours_per_week: continuous.

native_country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
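
A quick way to sanity-check that the downloaded columns match this description, reusing the train frame from maybe_download above:

print(train.dtypes)
print(train['workclass'].unique())   # note: missing values appear as '?'
print(train['income_bracket'].value_counts())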

y_train = train.pop('income_bracket')
y_test = test.pop('income_bracket')
# Labels in adult.test carry a trailing period ('>50K.'), so strip it
# before mapping; otherwise y_test becomes all NaN.
target = {'<=50K': 0, '>50K': 1}
y_train = y_train.str.rstrip('.').map(target)
y_test = y_test.str.rstrip('.').map(target)
cat = list(train.select_dtypes(include=['object']).columns)
train[cat] = train[cat].astype('category')
test[cat] = test[cat].astype('category')

lr_train = pd.get_dummies(train)
sd = StandardScaler()
lr_train = sd.fit_transform(lr_train)
clf = LogisticRegression(max_iter=100000)

scores = cross_val_score(clf, lr_train, y_train, cv=StratifiedKFold(5, shuffle=True), scoring='roc_auc')

print(scores)

Logistic regression performance:

clf = lgb.LGBMClassifier(n_estimators=250)

scores = cross_val_score(clf, train, y_train, cv=StratifiedKFold(5, shuffle=True), scoring='roc_auc')

print(scores)

lgb performance:

lgb+lr performance (raw features removed)

Following the GBDT + LR scheme, the index of the leaf each tree routes a sample to is treated as a categorical feature: the per-tree leaf indices are joined into a token string, one-hot encoded with CountVectorizer, and a logistic regression is trained on that encoding, as the sketch and class below show.
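
To make the encoding concrete, here is a tiny sketch with made-up leaf assignments (the values are hypothetical, not taken from the model below):

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Two samples, three trees; each entry is the leaf the sample lands in.
leaf = np.array([[4, 0, 12],
                 [4, 7, 12]])
# Prefix with the tree index so equal leaf indices from different trees
# stay distinct tokens.
docs = [' '.join(f'{tree}_{idx}' for tree, idx in enumerate(row)) for row in leaf]
print(docs)            # ['0_4 1_0 2_12', '0_4 1_7 2_12']
cv = CountVectorizer()
onehot = cv.fit_transform(docs)
print(cv.vocabulary_)  # one column per (tree, leaf) pair
print(onehot.toarray())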

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, ClassifierMixin


class gbdt_lr(BaseEstimator, ClassifierMixin):

    def __init__(self, gbdt=None, lr=None, gbdt_params=None, lr_params=None, cv=None):
        self.gbdt = gbdt(**gbdt_params)
        self.lr = lr(**lr_params)
        self.cv = cv if cv is not None else CountVectorizer(analyzer='word', ngram_range=(1, 1))

    def _leaf_docs(self, X):
        # pred_leaf=True returns, for every sample, the index of the leaf it
        # lands in for each tree; prefix each index with its tree number so
        # leaf 5 of tree 0 and leaf 5 of tree 1 become distinct tokens.
        leaf = self.gbdt.predict(X, pred_leaf=True)
        return [' '.join(f'{tree}_{idx}' for tree, idx in enumerate(row)) for row in leaf]

    def fit(self, X, y):
        self.gbdt.fit(X, y)
        result = self.cv.fit_transform(self._leaf_docs(X))
        self.lr.fit(result, y)
        return self

    def predict_proba(self, X):
        result = self.cv.transform(self._leaf_docs(X))
        return self.lr.predict_proba(result)

from sklearn.metrics import roc_auc_score
import gc
import warnings
warnings.filterwarnings("ignore")

# The original notebook defined X and y earlier; here they are assumed to be
# the one-hot-encoded training frame and the mapped labels from above.
X = pd.get_dummies(train)
y = y_train.copy()  # keep a copy: y_train is shadowed as a fold variable below

skt = StratifiedKFold(5, shuffle=True)
splits = skt.split(X, y)
score_train = []
score_valid = []
for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X.iloc[train_index].values, X.iloc[valid_index].values
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    clf = gbdt_lr(gbdt=lgb.LGBMClassifier, gbdt_params={'n_estimators': 250},
                  lr=LogisticRegression, lr_params={'C': 1.0})

    clf.fit(X_train, y_train)

    y_pred_train = clf.predict_proba(X_train)[:, 1]
    y_pred_valid = clf.predict_proba(X_valid)[:, 1]

    score_train.append(roc_auc_score(y_train, y_pred_train))
    score_valid.append(roc_auc_score(y_valid, y_pred_valid))

    del X_train, X_valid, y_train, y_valid
    gc.collect()

print('mean train AUC:', np.mean(score_train))
print('mean valid AUC:', np.mean(score_valid))

lgb+lr performance (raw features added)

Same scheme as above, except the raw feature matrix is stacked next to the leaf one-hot encoding (kept sparse with scipy.sparse) before the logistic regression is fit.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, ClassifierMixin
from scipy import sparse as sp


class gbdt_lr(BaseEstimator, ClassifierMixin):

    def __init__(self, gbdt=None, lr=None, gbdt_params=None, lr_params=None, cv=None):
        self.gbdt = gbdt(**gbdt_params)
        self.lr = lr(**lr_params)
        self.cv = cv if cv is not None else CountVectorizer(analyzer='word', ngram_range=(1, 1))

    def _leaf_docs(self, X):
        # Same tree-prefixed leaf tokens as in the previous version.
        leaf = self.gbdt.predict(X, pred_leaf=True)
        return [' '.join(f'{tree}_{idx}' for tree, idx in enumerate(row)) for row in leaf]

    def fit(self, X, y):
        self.gbdt.fit(X, y)
        result = self.cv.fit_transform(self._leaf_docs(X))
        # Stack the raw features next to the leaf encoding, keeping it all sparse.
        result = sp.hstack([result, sp.csr_matrix(X)])
        self.lr.fit(result, y)
        return self

    def predict_proba(self, X):
        result = self.cv.transform(self._leaf_docs(X))
        result = sp.hstack([result, sp.csr_matrix(X)])
        return self.lr.predict_proba(result)


from sklearn.metrics import roc_auc_score
import gc
import warnings
warnings.filterwarnings("ignore")

skt = StratifiedKFold(5, shuffle=True)
splits = skt.split(X, y)  # X, y: the same design matrix and labels as above
score_train = []
score_valid = []
for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X.iloc[train_index].values, X.iloc[valid_index].values
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    clf = gbdt_lr(gbdt=lgb.LGBMClassifier, gbdt_params={'n_estimators': 250},
                  lr=LogisticRegression, lr_params={'C': 1.0})

    clf.fit(X_train, y_train)

    y_pred_train = clf.predict_proba(X_train)[:, 1]
    y_pred_valid = clf.predict_proba(X_valid)[:, 1]

    score_train.append(roc_auc_score(y_train, y_pred_train))
    score_valid.append(roc_auc_score(y_valid, y_pred_valid))

    del X_train, X_valid, y_train, y_valid
    gc.collect()

print('mean train AUC:', np.mean(score_train))
print('mean valid AUC:', np.mean(score_valid))

The gain here is substantial, though of course that holds only for this dataset; on other datasets the effect may well be modest.

Storing the matrices in sparse format and doing the matrix operations at the sparse level with scipy.sparse greatly reduces the memory footprint. Of course, this only pays off if the data is sufficiently sparse, as the sketch below illustrates.
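
A minimal sketch of the difference, using a random matrix with roughly 1% nonzero entries (the sizes printed are approximate):

import numpy as np
from scipy import sparse as sp

rng = np.random.default_rng(0)
# 10000 x 2000 matrix, ~1% nonzero, similar in spirit to the leaf one-hot encoding.
dense = (rng.random((10000, 2000)) < 0.01).astype(np.float64)
csr = sp.csr_matrix(dense)
print(dense.nbytes)  # 160,000,000 bytes for the dense array
print(csr.data.nbytes + csr.indices.nbytes + csr.indptr.nbytes)  # roughly 2.4 MB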