kaggle案例重复:科比的投篮选择之二
- 2020 年 3 月 3 日
- 筆記
今天继续重复kaggle案例:科比的投篮选择。原文地址https://www.kaggle.com/xvivancos/kobe-bryant-shot-selection/report
读入数据、加载需要用到的包
setwd("../Desktop/Data_analysis_practice/Kaggle/Kobe_shot_selection/") shots<-read.csv("data.csv") dim(shots) shots<-na.omit(shots) dim(shots) library(ggplot2) library(tidyverse) library(gridExtra)
不同进攻方式的投篮命中率
这里用到 group_by()
和 summarise()
函数。一个简单的小例子理解这两个函数的用法
df<-data.frame(First=c("A","A","A","B","B","B"), Second=c(1,2,1,4,5,6)) df%>% group_by(First)%>% summarise(Accuracy=mean(Second), counts=n()) # A tibble: 2 x 3 First Accuracy counts <fct> <dbl> <int> 1 A 1.33 3 2 B 5.00 3
shots%>% group_by(action_type)%>% summarise(Accuracy=mean(shot_made_flag),counts=n())%>% filter(counts>20)%>% ggplot(aes(x=reorder(action_type,Accuracy),y=Accuracy))+ geom_point(aes(colour=Accuracy),size=3)+ scale_colour_gradient(low="orangered",high="chartreuse3")+ labs(title="Accurancy by shot type")+theme_bw()+ theme(axis.title.y=element_blank(), legend.position="none", plot.title=element_text(hjust=0.5))+ coord_flip()

这里又涉及一个小知识点:从小到大排序使用 reorder()
函数。小例子:
df<-data.frame(First=LETTERS[1:5], Second=c(1,4,5,3,2)) p1<-ggplot(df,aes(x=First,y=Second))+ geom_bar(stat="identity",fill="darkgreen") p2<-ggplot(df,aes(x=reorder(First,Second),y=Second))+ geom_bar(stat="identity",fill="orange") ggpubr::ggarrange(p1,p2,ncol=1,nrow=2,labels=c("p1","p2"))

那么从大到小排序呢?暂时想到一种解决办法:
df1<-df[order(df$Second,decreasing=T),] df1$First<-fct_inorder(df1$First) ggplot(df1,aes(x=First,y=Second))+ geom_bar(stat="identity",fill="orangered")

每个赛季的命中率
shots%>% group_by(season)%>% summarise(Accuracy=mean(shot_made_flag))%>% ggplot(aes(x=season,y=Accuracy,group=1))+ geom_line(aes(colour=Accuracy))+ geom_point(aes(colour=Accuracy),size=3)+ scale_colour_gradient(low="orangered",high="chartreuse3")+ labs(title="Accuracy by season",x="Season")+theme_bw()+ theme(legend.position="none", plot.title=element_text(hjust=0.5), axis.text.x=element_text(angle=45,hjust=1))

由上图可以看出最后三个赛季科比的命中率断崖式下跌。原文作者的话:As we see, the accuracy begins to decrease badly from the 2013-14 season. Why didn't you retire before, Kobe?
常规赛季后赛命中率对比
shots%>% group_by(season)%>% summarise(Playoff=mean(shot_made_flag[playoffs==1]), RegularSeason=mean(shot_made_flag[playoffs==0]))%>% ggplot(aes(x=season,group=1))+ geom_line(aes(y=Playoff,color="Playoff"))+ geom_line(aes(y=RegularSeason,colour="RegularSeason"))+ geom_point(aes(y=Playoff,color="Playoff"),size=3)+ geom_point(aes(y=RegularSeason,color="RegularSeason"))+ labs(title="Accuracy by season", subtitle="Playoff and Regular Season", x="Season",y="Accuracy")+theme_bw()+ theme(legend.title=element_blank(), plot.title=element_text(hjust=0.5), plot.subtitle=element_text(hjust=0.5), axis.text.x=element_text(angle=45,hjust=1))

两分球和三分球命中率
shots %>% group_by(season) %>% summarise(TwoPoint=mean(shot_made_flag[shot_type=="2PT Field Goal"]), ThreePoint=mean(shot_made_flag[shot_type=="3PT Field Goal"])) %>% ggplot(aes(x=season, group=1)) + geom_line(aes(y=TwoPoint, colour="TwoPoint")) + geom_line(aes(y=ThreePoint, colour="ThreePoint")) + geom_point(aes(y=TwoPoint, colour="TwoPoint"), size=3) + geom_point(aes(y=ThreePoint, colour="ThreePoint"), size=3) + labs(title="Accuracy by season", subtitle="2PT Field Goal and 3PT Field Goal", x="Season", y="Accuracy") + theme_bw() + theme(legend.title=element_blank(), plot.title=element_text(hjust=0.5), plot.subtitle=element_text(hjust=0.5), axis.text.x=element_text(angle=45, hjust=1))

从上图看到2013-2014赛季科比的3分命中率极低。哪位忠实的球迷还能想起来2013-2014赛季的科比是什么情况吗?
不同的对手两分球三分球命中率
shots %>% group_by(opponent) %>% summarise(TwoPoint=mean(shot_made_flag[shot_type=="2PT Field Goal"]), ThreePoint=mean(shot_made_flag[shot_type=="3PT Field Goal"])) %>% ggplot(aes(x=opponent, group=1)) + geom_line(aes(y=TwoPoint, colour="TwoPoint")) + geom_line(aes(y=ThreePoint, colour="ThreePoint")) + geom_point(aes(y=TwoPoint, colour="TwoPoint"), size=3) + geom_point(aes(y=ThreePoint, colour="ThreePoint"), size=3) + labs(title="Accuracy by opponent", subtitle="2PT Field Goal and 3PT Field Goal", x="Opponent", y="Accuracy") + theme_bw() + theme(legend.title=element_blank(), plot.title=element_text(hjust=0.5), plot.subtitle=element_text(hjust=0.5), axis.text.x=element_text(angle=45, hjust=1))

不同出手距离投篮命中率
shots %>% group_by(shot_distance) %>% summarise(Accuracy=mean(shot_made_flag)) %>% ggplot(aes(x=shot_distance, y=Accuracy)) + geom_line(aes(colour=Accuracy)) + geom_point(aes(colour=Accuracy), size=2) + scale_colour_gradient(low="orangered", high="chartreuse3") + labs(title="Accuracy by shot distance", x="Shot distance (ft.)") + xlim(c(0,45)) + theme_bw() + theme(legend.position="none", plot.title=element_text(hjust=0.5))

不同区域的投篮命中率
p7 <- shots %>% select(lat, lon, shot_zone_range, shot_made_flag) %>% group_by(shot_zone_range) %>% mutate(Accuracy=mean(shot_made_flag)) %>% ggplot(aes(x=lon, y=lat)) + geom_point(aes(colour=Accuracy)) + scale_colour_gradient(low="red", high="lightgreen") + labs(title="Accuracy by shot zone range") + ylim(c(33.7, 34.0883)) + theme_void() + theme(plot.title=element_text(hjust=0.5) p8 <- shots %>% select(lat, lon, shot_zone_area, shot_made_flag) %>% group_by(shot_zone_area) %>% mutate(Accuracy=mean(shot_made_flag)) %>% ggplot(aes(x=lon, y=lat)) + geom_point(aes(colour=Accuracy)) + scale_colour_gradient(low="red", high="lightgreen") + labs(title="Accuracy by shot zone area") + ylim(c(33.7, 34.0883)) + theme_void() + theme(legend.position="none", plot.title=element_text(hjust=0.5)) p9 <- shots %>% select(lat, lon, shot_zone_basic, shot_made_flag) %>% group_by(shot_zone_basic) %>% mutate(Accuracy=mean(shot_made_flag)) %>% ggplot(aes(x=lon, y=lat)) + geom_point(aes(colour=Accuracy)) + scale_colour_gradient(low="red", high="lightgreen") + labs(title="Accuracy by shot zone basic") + ylim(c(33.7, 34.0883)) + theme_void() + theme(legend.position="none", plot.title=element_text(hjust=0.5)) grid.arrange(p7, p8, p9, layout_matrix=cbind(c(1,2), c(1,3)))
