Comparing LR, LGB, LGB+LR, and LGB+LR with original features

  • April 25, 2020
  • AI
import os
import random

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold


def seed_everything(seed=42):
    """Fix the random seeds for reproducibility."""
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(123123123)


def maybe_download(train_data, test_data):
    """If the Adult data files "train.csv" and "test.csv" are not in the
    working directory, download them from the UCI mirror.
    """

    COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num",
               "marital_status", "occupation", "relationship", "race", "gender",
               "capital_gain", "capital_loss", "hours_per_week", "native_country",
               "income_bracket"]

    if not os.path.exists(train_data):
        print("downloading training data...")
        df_train = pd.read_csv("https://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data",
            names=COLUMNS, skipinitialspace=True)
    else:
        df_train = pd.read_csv(train_data)

    if not os.path.exists(test_data):
        print("downloading testing data...")
        # adult.test has a junk first line, hence skiprows=1
        df_test = pd.read_csv("https://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.test",
            names=COLUMNS, skipinitialspace=True, skiprows=1)
    else:
        df_test = pd.read_csv(test_data)

    return df_train, df_test

# Download once, then cache locally so later runs read from disk.
train, test = maybe_download('train.csv', 'test.csv')
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

Field descriptions:

Data source: the UCI Adult (census income) dataset.

age: continuous.

workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.

fnlwgt: continuous.

education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.

education_num: continuous.

marital_status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.

occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.

relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.

race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.

gender: Female, Male.

capital_gain: continuous.

capital_loss: continuous.

hours_per_week: continuous.

native_country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.

y_train = train.pop('income_bracket')
y_test = test.pop('income_bracket')
# Labels in adult.test carry a trailing period ('<=50K.'), so strip it first.
y_test = y_test.str.rstrip('.')
target = {'<=50K': 0, '>50K': 1}
y_train = y_train.map(target)
y_test = y_test.map(target)
# Cast the string columns to pandas 'category' so LightGBM handles them natively.
cat = list(train.select_dtypes(include=['object']).columns)
train[cat] = train[cat].astype('category')
test[cat] = test[cat].astype('category')

# Baseline 1: logistic regression on one-hot encoded, standardized features.
lr_train = pd.get_dummies(train)
sd = StandardScaler()
lr_train = sd.fit_transform(lr_train)
clf = LogisticRegression(max_iter=100000)

scores = cross_val_score(clf, lr_train, y_train, cv=StratifiedKFold(5, shuffle=True), scoring='roc_auc')

print(scores)

Logistic regression performance:

# Baseline 2: LightGBM on the raw features (categoricals handled natively).
clf = lgb.LGBMClassifier(n_estimators=250)

scores = cross_val_score(clf, train, y_train, cv=StratifiedKFold(5, shuffle=True), scoring='roc_auc')

print(scores)

LightGBM performance:

LGB+LR performance (leaf features only, original features dropped)
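For reference, here is what the leaf encoding used below looks like. With pred_leaf=True, LightGBM returns the index of the leaf each sample reaches in every tree, and those indices are the "words" that get one-hot encoded. A minimal, self-contained sketch on synthetic data (shapes and values are illustrative, not from the Adult experiment):

import numpy as np
import lightgbm as lgb

X_demo = np.random.rand(200, 4)
y_demo = (X_demo[:, 0] > 0.5).astype(int)
m = lgb.LGBMClassifier(n_estimators=3).fit(X_demo, y_demo)
# shape (2, 3): one leaf index per sample per tree
print(m.predict(X_demo[:2], pred_leaf=True))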

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, ClassifierMixin


class gbdt_lr(BaseEstimator, ClassifierMixin):
    """GBDT feature transform + LR: encode each sample by the leaf it falls
    into in every tree, then train LR on the one-hot leaf indicators."""

    def __init__(self, gbdt=None, lr=None, gbdt_params=None, lr_params=None, cv=None):
        self.gbdt = gbdt(**gbdt_params)
        self.lr = lr(**lr_params)
        # CountVectorizer's default token_pattern drops single-character tokens,
        # which would silently discard one-digit leaf indices, so tokenize on
        # whitespace instead.
        self.cv = cv if cv is not None else CountVectorizer(token_pattern=r'(?u)\S+')

    def _leaf_documents(self, X):
        # (n_samples, n_trees) leaf indices; prefix each with its tree index so
        # that leaf 5 of tree 0 and leaf 5 of tree 1 map to distinct features.
        leaf = self.gbdt.predict(X, pred_leaf=True)
        return [' '.join(f'{i}_{v}' for i, v in enumerate(row)) for row in leaf]

    def fit(self, X, y):
        self.gbdt.fit(X, y)
        result = self.cv.fit_transform(self._leaf_documents(X))
        self.lr.fit(result, y)
        return self

    def predict_proba(self, X):
        result = self.cv.transform(self._leaf_documents(X))
        return self.lr.predict_proba(result)
from sklearn.metrics import roc_auc_score
import gc
import warnings
warnings.filterwarnings("ignore")

# X and y for the folds: integer-code the categorical columns so each fold can
# be passed around as a plain numeric array (and, later, stacked with the
# sparse leaf features).
X = train.copy()
for c in cat:
    X[c] = X[c].cat.codes
y = y_train.copy()

skt = StratifiedKFold(5, shuffle=True)
splits = skt.split(X, y)
score_train = []
score_valid = []

for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X.iloc[train_index].values, X.iloc[valid_index].values
    y_train_f, y_valid = y.iloc[train_index], y.iloc[valid_index]

    clf = gbdt_lr(gbdt=lgb.LGBMClassifier, gbdt_params={'n_estimators': 250},
                  lr=LogisticRegression, lr_params={'C': 1.0})

    clf.fit(X_train, y_train_f)

    y_pred_train = clf.predict_proba(X_train)[:, 1]
    y_pred_valid = clf.predict_proba(X_valid)[:, 1]

    score_train.append(roc_auc_score(y_train_f, y_pred_train))
    score_valid.append(roc_auc_score(y_valid, y_pred_valid))

    del X_train, X_valid, y_train_f, y_valid
    gc.collect()

print(score_valid)

LGB+LR performance (leaf features plus original features)
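The only change from the previous version is that the LR input now carries the original columns alongside the leaf one-hots; scipy.sparse.hstack concatenates the two blocks column-wise without ever densifying them. A tiny illustration (toy matrices, purely for shape intuition):

import numpy as np
from scipy import sparse as sp

leaf_onehot = sp.csr_matrix(np.eye(3, dtype=np.float32))            # pretend leaf features
raw = sp.csr_matrix(np.arange(6, dtype=np.float32).reshape(3, 2))   # pretend original features
print(sp.hstack([leaf_onehot, raw]).shape)  # (3, 5): leaf block + raw block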

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, ClassifierMixin
from scipy import sparse as sp


class gbdt_lr(BaseEstimator, ClassifierMixin):
    """Same as above, except the LR input is the sparse leaf one-hots
    horizontally stacked with the original features."""

    def __init__(self, gbdt=None, lr=None, gbdt_params=None, lr_params=None, cv=None):
        self.gbdt = gbdt(**gbdt_params)
        self.lr = lr(**lr_params)
        self.cv = cv if cv is not None else CountVectorizer(token_pattern=r'(?u)\S+')

    def _leaf_documents(self, X):
        leaf = self.gbdt.predict(X, pred_leaf=True)
        return [' '.join(f'{i}_{v}' for i, v in enumerate(row)) for row in leaf]

    def fit(self, X, y):
        self.gbdt.fit(X, y)
        result = self.cv.fit_transform(self._leaf_documents(X))
        # Stack the (sparsified) original features next to the leaf indicators.
        result = sp.hstack([result, sp.csr_matrix(X)])
        self.lr.fit(result, y)
        return self

    def predict_proba(self, X):
        result = self.cv.transform(self._leaf_documents(X))
        return self.lr.predict_proba(sp.hstack([result, sp.csr_matrix(X)]))

from sklearn.metrics import roc_auc_score
import gc
import warnings
warnings.filterwarnings("ignore")

# Reuse the integer-coded X and y built for the previous experiment.
skt = StratifiedKFold(5, shuffle=True)
splits = skt.split(X, y)
score_train = []
score_valid = []

for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X.iloc[train_index].values, X.iloc[valid_index].values
    y_train_f, y_valid = y.iloc[train_index], y.iloc[valid_index]

    clf = gbdt_lr(gbdt=lgb.LGBMClassifier, gbdt_params={'n_estimators': 250},
                  lr=LogisticRegression, lr_params={'C': 1.0})

    clf.fit(X_train, y_train_f)

    y_pred_train = clf.predict_proba(X_train)[:, 1]
    y_pred_valid = clf.predict_proba(X_valid)[:, 1]

    score_train.append(roc_auc_score(y_train_f, y_pred_train))
    score_valid.append(roc_auc_score(y_valid, y_pred_valid))

    del X_train, X_valid, y_train_f, y_valid
    gc.collect()

print(score_valid)

The improvement is clear, though of course that holds only for this dataset; on other datasets the gain may well be modest.

Storing the matrices in sparse format and doing the matrix operations at the sparse level (via scipy.sparse) greatly reduces memory usage. Naturally, this only pays off if your data is sparse enough to begin with.
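To make the memory point concrete, here is a rough back-of-the-envelope sketch. The sizes are illustrative assumptions (the Adult training set size with 250 trees of LightGBM's default 31 leaves, and randomly generated leaf assignments), not measurements from the runs above:

import numpy as np
from scipy import sparse as sp

n_samples, n_trees, n_leaves = 32561, 250, 31
# one active leaf per (sample, tree) pair
leaf_idx = np.random.randint(0, n_leaves, size=(n_samples, n_trees))
cols = (np.arange(n_trees) * n_leaves + leaf_idx).ravel()
rows = np.repeat(np.arange(n_samples), n_trees)
data = np.ones(rows.size, dtype=np.float32)
onehot = sp.csr_matrix((data, (rows, cols)), shape=(n_samples, n_trees * n_leaves))

dense_mb = n_samples * n_trees * n_leaves * 4 / 1e6   # float32, fully dense
sparse_mb = (onehot.data.nbytes + onehot.indices.nbytes + onehot.indptr.nbytes) / 1e6
print(dense_mb, sparse_mb)   # roughly 1009 MB dense vs ~65 MB sparse

Only one entry per (sample, tree) pair is nonzero, so the CSR representation stores ~8.1M values instead of ~252M, which is why the leaf one-hots plus original features remain cheap to hold and to feed into LogisticRegression.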